In [2]:
#r "nuget: Microsoft.ML"
#load "./Modules/MLWrapper.fs"
open Microsoft.ML
open Microsoft.ML.Data
open System.IO
open System.Net
open FunctionalMl

In [3]:
// Fetch the UCI abalone data set once; later runs reuse the local copy.
// HttpClient is used because WebClient is obsolete on current .NET, and the
// URL uses HTTPS so the training data is not pulled over an unencrypted link.
if not (File.Exists("abalone.data")) then
    let url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
    use client = new System.Net.Http.HttpClient()
    // Block until the whole payload is in memory.
    let bytes = client.GetByteArrayAsync(url).GetAwaiter().GetResult()
    // The file is written only after the complete download succeeded.
    File.WriteAllBytes("abalone.data", bytes)

// Preview the first five raw lines of the file.
File.ReadLines("abalone.data")
|> Seq.take 5

index,value
0,"M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15"
1,"M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7"
2,"F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9"
3,"M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10"
4,"I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7"


In [4]:
/// One record of the raw abalone file. Each field is bound to its zero-based
/// column position through `LoadColumn`; the final column, `Rings`, is also
/// published under the column name "Label".
[<CLIMutable>]
type AbaloneData =
    { [<LoadColumn(0)>] Sex : string
      [<LoadColumn(1)>] Length : float32
      [<LoadColumn(2)>] Diameter : float32
      [<LoadColumn(3)>] Height : float32
      [<LoadColumn(4)>] WholeWeight : float32
      [<LoadColumn(5)>] ShuckedWeight : float32
      [<LoadColumn(6)>] VisceraWeight : float32
      [<LoadColumn(7)>] ShellWeight : float32
      // `single` and `float32` name the same type; spelled like the other fields.
      [<LoadColumn(8); ColumnName("Label")>] Rings : float32 }

// Read the comma-separated file (it has no header row) using the AbaloneData column layout.
let allData =
    ML.context.Data.LoadFromTextFile<AbaloneData>(
        "abalone.data",
        separatorChar = ',',
        hasHeader = false)

In [5]:
// Shuffle every row, then split into a training part and a test part.
// NOTE(review): 0.2 is presumably the fraction held out for testing - confirm in MLWrapper.fs.
let trainData, testData =
    allData
    |> ML.shuffle
    |> ML.split 0.2

In [6]:
// Names of the input columns that get concatenated into the feature vector,
// in the order they will appear in it.
let featureColumns =
    [|
        "Sex"
        "Length"
        "Diameter"
        "Height"
        "WholeWeight"
        "ShuckedWeight"
        "VisceraWeight"
        "ShellWeight"
    |]

In [7]:
// Preprocessing pipeline, built one stage at a time on top of an empty chain.
let pipeline =
    // Stage 1: one-hot encode the Sex feature.
    let withEncodedSex = ML.append (EstimatorChain()) (ML.onehot "Sex")
    // Stage 2: concatenate the feature columns into a single new column, Features.
    let withFeatures = ML.append withEncodedSex (ML.concatenate "Features" featureColumns)
    // Stage 3: normalize Features into a new column, FeaturesNorm.
    ML.append withFeatures (ML.normalizeLp "Features" "FeaturesNorm")

In [8]:
// Fit the preprocessing pipeline on the training data.
let transformer = ML.fit trainData pipeline

In [10]:
// Apply the fitted preprocessing steps to the training data.
let transformedData = ML.transform transformer trainData

Print the data as it was loaded from the file:

In [11]:
// Read the training rows back as AbaloneData records and show the first three.
Seq.take 3 (ML.context.Data.CreateEnumerable<AbaloneData>(trainData, reuseRowObject = false))

index,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,F,0.49,0.38,0.125,0.549,0.245,0.1075,0.174,10
1,M,0.685,0.52,0.15,1.343,0.4635,0.292,0.4,13
2,M,0.645,0.485,0.215,1.514,0.546,0.2615,0.635,16


Compare that to the data as transformed by the pipeline. First, we one-hot encoded the `Sex` column. Then we concatenated all of the feature columns into a single new vector column, `Features`. Lastly, we normalized the values and put them into a new vector column, `FeaturesNorm`. Notice that the first three values of `Features` are the one-hot encoded values of `Sex`.

In [12]:
/// Row layout used to read data back after the pipeline has run: the label
/// plus the concatenated feature vector and its normalized copy. Both vectors
/// are declared with 10 slots (three one-hot values for `Sex` followed by the
/// seven numeric measurements).
[<CLIMutable>]
type AbaloneDataTransformed =
    { [<ColumnName("Label")>] Rings : float32
      [<VectorType(10)>] Features : float32[]
      [<VectorType(10)>] FeaturesNorm : float32[] }

// Read the transformed rows back as AbaloneDataTransformed records and show the first three.
Seq.take 3 (ML.context.Data.CreateEnumerable<AbaloneDataTransformed>(transformedData, reuseRowObject = false))

index,Rings,Features,FeaturesNorm
0,10,"[ 1, 0, 0, 0.49, 0.38, 0.125, 0.549, 0.245, 0.1075, 0.174 ]","[ 0.74465656, 0, 0, 0.36488172, 0.2829695, 0.09308207, 0.40881646, 0.18244086, 0.08005058, 0.12957023 ]"
1,13,"[ 0, 1, 0, 0.685, 0.52, 0.15, 1.343, 0.4635, 0.292, 0.4 ]","[ 0, 0.4983909, 0, 0.34139776, 0.25916326, 0.07475864, 0.669339, 0.23100418, 0.14553015, 0.19935636 ]"
2,16,"[ 0, 1, 0, 0.645, 0.485, 0.215, 1.514, 0.546, 0.2615, 0.635 ]","[ 0, 0.4583784, 0, 0.29565406, 0.22231354, 0.098551355, 0.6939849, 0.2502746, 0.119865954, 0.29107028 ]"


Create an estimator:

In [14]:
// L-BFGS Poisson regression trainer that reads its inputs from the normalized
// feature column, passed through the wrapper's downcast helper.
let estimator =
    let poissonTrainer =
        ML.context.Regression.Trainers.LbfgsPoissonRegression(featureColumnName = "FeaturesNorm")
    ML.downcastEstimator poissonTrainer

Now, we use cross-validation to select the best performing model.

In [15]:
// Cross-validate on the transformed training data and keep the model of the
// fold with the highest R-squared.
let model =
    let foldResults =
        ML.transform transformer trainData        // Transform using the transformer built above
        |> ML.crossValidateRegression estimator 3 // 3-fold cross-validation
        |> ML.printRegressionCvMetrics            // Print cross-fold metrics
    let bestFold = foldResults |> Seq.maxBy (fun fold -> fold.Metrics.RSquared)
    bestFold.Model

------------------
Cross Validation Metrics
------------------
Mean Absolute Error: 1.550251
Mean Squared Error: 4.711480
Root Mean Squared Error: 2.170481
R-squared: 0.560272


Now we can evaluate our model against the test data.

In [16]:
// Evaluate the selected model on the held-out test data.
testData
|> ML.transform transformer          // Apply the preprocessing steps to the test data
|> ML.transform model                // Score the transformed rows with the model
|> ML.context.Regression.Evaluate    // Get test metrics
|> ML.printRegressionMetrics

------------------
Test Metrics
------------------
Mean Absolute Error: 1.477898
Mean Squared Error: 4.036597
Root Mean Squared Error: 2.009128
R-squared: 0.552453


Now let's pretend we have new data (for convenience we are just randomly shuffling the test data and taking the first few rows) to see what predictions our model makes. You will see two values:
- `Label`: the actual number of rings for the example being predicted. Our model never sees this value, but we show it below so that you can see how close the predicted number of rings is to the actual value.
- `Score`: the number of rings predicted by the model. The closer this is to the `Label`, the more accurate the prediction.

You can run this cell multiple times to get new random samples and their predictions!

In [18]:
/// Result row of a single prediction: the actual value (`Label`) next to the
/// value produced by the model (`Score`).
[<CLIMutable>]
type RegressionPrediction =
    {
        Label : float32
        Score : float32
    }

// Sample predictions: shuffle the test data, then run it through the
// preprocessing transformer so it has the columns the model expects.
let sampleData = ML.transform transformer (ML.shuffle testData)

// Single-row scorer: takes an already-transformed row and returns its Label and Score.
let predictionEngine =
    ML.context.Model.CreatePredictionEngine<AbaloneDataTransformed, RegressionPrediction>(model)

// Score the first five shuffled rows one at a time.
sampleData
|> (fun view -> ML.context.Data.CreateEnumerable<AbaloneDataTransformed>(view, reuseRowObject = false))
|> Seq.take 5
|> Seq.map (fun row -> predictionEngine.Predict(row))

index,Label,Score
0,11,8.895894
1,8,10.011565
2,7,6.0247774
3,9,7.9776998
4,11,10.351429
