# Linear Regression in Go

This is a juptyer notebook implementation of an example found in _Machine Learning With Go_ by Daniel Whitenack. The examples can be found [here](https://github.com/PacktPublishing/Machine-Learning-With-Go/tree/master/Chapter04/linear_regression)

## Import Data
 
Using dataframes we import and view the data stored in a `.csv` file.

In [1]:
import (
    "os"
    "log"
    "fmt"
    
    "github.com/kniren/gota/dataframe"
    )

advertFile, err := os.Open("data/Advertising.csv")
if err != nil {
    log.Fatal(err)
}


advertDF := dataframe.ReadCSV(advertFile)
advertSummary := advertDF.Describe()

fmt.Println(advertSummary)

advertFile.Close()

[7x5] DataFrame

    column   TV         Radio     Newspaper  Sales    
 0: mean     147.042500 23.264000 30.554000  14.022500
 1: stddev   85.854236  14.846809 21.778621  5.217457 
 2: min      0.700000   0.000000  0.300000   1.600000 
 3: 25%      73.400000  9.900000  12.600000  10.300000
 4: 50%      149.700000 22.500000 25.600000  12.900000
 5: 75%      218.500000 36.500000 45.100000  17.400000
 6: max      296.400000 49.600000 114.000000 27.000000
    <string> <float>    <float>   <float>    <float>  



## Visualize Data

using gonum/plot we are able to visualize the data and understand it.

In [2]:
import (
    "gonum.org/v1/plot"
    "gonum.org/v1/plot/plotter"
    "gonum.org/v1/plot/vg"
)

for _, colName := range advertDF.Names() {
    plotVals := make(plotter.Values, advertDF.Nrow())
    for i, floatVal := range advertDF.Col(colName).Float() {
        plotVals[i] = floatVal
    }
    
    p, err := plot.New()
    if err != nil {
        log.Fatal(err)
    }
    p.Title.Text = fmt.Sprintf("Histogram of a %s", colName)
    
    h, err := plotter.NewHist(plotVals, 16)
    if err != nil {
        log.Fatal(err)
    }
    
    h.Normalize(1)
    
    p.Add(h)
    
    if err := p.Save(4*vg.Inch, 4*vg.Inch, "graphs/"+colName+"_hist.png"); err !=nil {
        log.Fatal(err)
    }
}



## Independent variable

We are looking at how all of the advertising mediums affect sales

In [3]:

// we are comparing all of the other columns to the Sales column making Sales the dependent Variable
ySales := advertDF.Col("Sales").Float()

for _, colName := range advertDF.Names() {
    pts := make(plotter.XYs, advertDF.Nrow())
    
    for i, floatVal := range advertDF.Col(colName).Float(){
        pts[i].X = floatVal
        pts[i].Y = ySales[i]
    }
    
    p, err := plot.New()
    if err != nil {
        log.Fatal(err)
    }
    p.X.Label.Text = colName
    p.Y.Label.Text = "y"
    p.Add(plotter.NewGrid())
    
    s, err := plotter.NewScatter(pts)
    if err != nil {
        log.Fatal(err)
    }
    s.GlyphStyle.Radius = vg.Points(3)
    
    p.Add(s)
    
    if err := p.Save(4*vg.Inch, 4*vg.Inch, "graphs/"+colName+"_scatter.png"); err !=nil {
        log.Fatal(err)
    }
}

## Determine Train Data

We are using 80% of the data to train and 20% if the data to test.

In [4]:
import "bufio"

trainingNum := (4 * advertDF.Nrow()) / 5
testNum := advertDF.Nrow()/5
// adjust for odd amount
if trainingNum+testNum < advertDF.Nrow(){
    trainingNum++
}

trainingIndex := make([]int, trainingNum)
testIndex := make([]int, testNum)

for i := 0; i < trainingNum; i ++ {
    trainingIndex[i] = i
}

for i := 0; i < testNum; i++ {
    testIndex[i] = trainingNum + i
}

trainingDF := advertDF.Subset(trainingIndex)
testDF := advertDF.Subset(testIndex)

setMap := map[int]dataframe.DataFrame{
    0: trainingDF,
    1: testDF,
}

for i, setName := range []string{"data/training.csv", "data/test.csv"} {
    f, err := os.Create(setName)
    if err != nil {
        log.Fatal(err)
    }
    
    w := bufio.NewWriter(f)
    
    if err := setMap[i].WriteCSV(w); err != nil {
        log.Fatal(err)
    } 
}


## Train The Model

In [5]:
import (
    "encoding/csv"
    "strconv"
    
    "github.com/sajari/regression"
)

trainCSV, err := os.Open("data/training.csv")
if err != nil {
    log.Fatal(err)
}


readerTrain := csv.NewReader(trainCSV)

readerTrain.FieldsPerRecord = 4
trainingData, err := readerTrain.ReadAll()
if err != nil {
    log.Fatal(err)
}

var r regression.Regression
r.SetObserved("Sales")
r.SetVar(0, "TV")

for i, record := range trainingData {
    
    // skip header
    if i == 0{
        continue
    }
    
    // our sales dependent variable
    yVal, err := strconv.ParseFloat(record[3], 64)
    if err != nil {
        log.Fatal(err)
    }
    
    tvVal, err := strconv.ParseFloat(record[0], 64)
    if err != nil {
        log.Fatal(err)
    }
    
    r.Train(regression.DataPoint(yVal, []float64{tvVal}))
    
    r.Run()
}

fmt.Printf("\nRegression Formula:\n%v\n\n",r.Formula)




Regression Formula:
Predicted = 7.98 + TV*0.06



49 <nil>

## Test The Model

In [6]:
import "math"

testCSV, err := os.Open("data/test.csv")
if err != nil {
    log.Fatal(err)
}


readerTest := csv.NewReader(testCSV)

readerTest.FieldsPerRecord = 4

testData, err := readerTest.ReadAll()
if err != nil {
    log.Fatal(err)
}

var mAE float64
for i, record := range testData {
    
    // skip header
    if i == 0 {
        continue
    }
    
    yObserved, err :=strconv.ParseFloat(record[3], 64)
    if err != nil {
        log.Fatal(err)
    }
    
    tvVal, err := strconv.ParseFloat(record[0], 64)
    if err != nil {
        log.Fatal(err)
    }
    
    yPredicted, err := r.Predict([]float64{tvVal})
    
    mAE += math.Abs(yObserved-yPredicted) / float64(len(testData))
    
    fmt.Printf("MAE = %0.2F\n\n", mAE)
}



MAE = 0.10

MAE = 0.10

MAE = 0.22

MAE = 0.22

MAE = 0.29

MAE = 0.55

MAE = 0.58

MAE = 0.78

MAE = 0.88

MAE = 1.13

MAE = 1.20

MAE = 1.28

MAE = 1.32

MAE = 1.48

MAE = 1.73

MAE = 1.78

MAE = 1.85

MAE = 2.02

MAE = 2.34

MAE = 2.47

MAE = 2.64

MAE = 2.87

MAE = 2.93

MAE = 2.95

MAE = 3.09

MAE = 3.14

MAE = 3.29

MAE = 3.35

MAE = 3.59

MAE = 3.64

MAE = 3.65

MAE = 3.72

MAE = 3.80

MAE = 3.83

MAE = 3.84

MAE = 3.90

MAE = 4.00

MAE = 4.15

MAE = 4.15

MAE = 4.37



## Add Prediction to Scatter

In [1]:
func predict(val float64) float64{
    return 7.98 + val*0.06
}


pts := make(plotter.XYs, advertDF.Nrow())
ptsPred := make(plotter.XYs, advertDF.Nrow())

l, err := plotter.NewLine(ptsPred)
if err != nil {
    log.Fatal(err)
}

for i, floatVal := range advertDF.Col("TV").Float(){
    pts[i].X = floatVal
    pts[i].Y = ySales[i]
    ptsPred[i].X = floatVal
    ptsPred[i].Y = predict(floatVal)
    
    p, err := plot.New()
    if err != nil {
        log.Fatal(err)
    }
    
    s, err := plotter.NewScatter(pts)
    if err != nil {
        log.Fatal(err)
    }
    
    s.GlyphStyle.Radius = vg.Points(3)
    
    l, err := plotter.NewLine(ptsPred)
    if err != nil {
        log.Fatal(err)
    }
    l.LineStyle.Width = vg.Points(1)
    l.LineStyle.Dashes = []vg.Length{vg.Points(5), vg.Points(5)}
    
    p.Add(s, l)
    
    if err := p.Save(4*vg.Inch, 4*vg.Inch, "graphs/tv_sale_regression_line.png"); err !=nil {
        log.Fatal(err)
    }
}



ERROR: repl.go:6:13: undefined "plotter" in plotter.XYs <*ast.SelectorExpr>

# Multiple Linear Regression
This is an expansion of the above example, but instead of just TV as the independent variable we are using TV and Radio 

In [None]:
var multiRegress regression.Regression

multiRegress.SetObserved("Sales")
multiRegress.SetVar(0,"TV")
multiRegress.SetVar(1,"Radio")

for i, record := range trainingData {
    
    // skip header
    if i == 0{
        continue
    }
    
    // sales
    yVal, err := strconv.ParseFloat(record[3], 64)
    if err != nil {
        log.Fatal(err)
    }
    
    tvVal, err := strconv.ParseFloat(record[0], 64)
    if err != nil {
        log.Fatal(err)
    }
    
    radioVale, err := strconv.ParseFloat(record[1], 64)
    if err != nil {
        log.Fatal(err)
    }
}

In [None]:
trainCSV.Close()
testCSV.Close()