In [1]:
#r "nuget: Microsoft.ML"
#load "./Modules/MLWrapper.fs"
open Microsoft.ML
open Microsoft.ML.Data
open System.IO
open System.Net
open FunctionalMl

Download data from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/annealing):

In [2]:
// Fetch the training split once; later runs reuse the cached local copy.
if not (File.Exists("anneal.data")) then
    // `use` disposes the WebClient as soon as the download branch finishes.
    use client = new WebClient()
    client.DownloadFile("https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.data", "anneal.data")

// Seq.length replaces the LINQ `.Count()` extension, so this cell no longer
// depends on System.Linq being opened implicitly by the notebook kernel.
printfn "Train data file has %d lines" (Seq.length (File.ReadLines("anneal.data")))
// Last expression of the cell: preview the first five raw lines.
File.ReadLines("anneal.data")
|> Seq.take 5

Train data file has 798 lines


index,value
0,"?,C,A,08,00,?,S,?,000,?,?,G,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,0.700,0610.0,0000,?,0000,?,3"
1,"?,C,R,00,00,?,S,2,000,?,?,E,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,Y,?,?,?,COIL,3.200,0610.0,0000,?,0000,?,3"
2,"?,C,R,00,00,?,S,2,000,?,?,E,?,?,Y,?,B,?,?,?,?,?,?,?,?,?,?,?,?,?,?,SHEET,0.700,1300.0,0762,?,0000,?,3"
3,"?,C,A,00,60,T,?,?,000,?,?,G,?,?,?,?,M,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,2.801,0385.1,0000,?,0000,?,3"
4,"?,C,A,00,60,T,?,?,000,?,?,G,?,?,?,?,B,Y,?,?,?,Y,?,?,?,?,?,?,?,?,?,SHEET,0.801,0255.0,0269,?,0000,?,3"


Some datasets from the UCI Machine Learning Repository have two files, one for training data and one for test data. Notice how the .test dataset has a line of descriptive text at the start of the file. We don't want to load this line--you will see how to deal with that below.

In [3]:
// Fetch the test split once; later runs reuse the cached local copy.
if not (File.Exists("anneal.test")) then
    // `use` disposes the WebClient as soon as the download branch finishes.
    use client = new WebClient()
    client.DownloadFile("https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.test", "anneal.test")

// This cell counts anneal.test, so the message says "Test" (it previously
// said "Train", copied from the cell that counts anneal.data).
// Seq.length replaces the LINQ `.Count()` extension, so this cell no longer
// depends on System.Linq being opened implicitly by the notebook kernel.
printfn "Test data file has %d lines" (Seq.length (File.ReadLines("anneal.test")))
// Last expression of the cell: preview the first five raw lines.
File.ReadLines("anneal.test")
|> Seq.take 5

Train data file has 100 lines


index,value
0,"?,C,A,00,45,?,S,?,000,?,?,D,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,1.600,0610.0,0000,?,0000,?,3"
1,"?,C,A,00,00,?,S,3,000,N,?,E,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,Y,?,?,?,COIL,0.699,0609.9,0000,?,0000,?,3"
2,"ZS,C,A,00,85,T,?,?,000,?,?,E,?,?,?,Y,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,SHEET,0.400,0610.0,0762,?,0000,?,U"
3,"ZS,C,A,00,50,T,?,?,000,?,?,E,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,0.400,0610.0,0000,?,0000,?,3"
4,"?,C,A,00,00,?,S,2,000,?,?,E,?,?,Y,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,0.699,1320.0,0000,?,0000,?,3"


Notice all of the `?` values--these stand for "missing" data and we will take care of them in our pipeline below. Next, we need to declare a type that defines the shape of our data. It's rather long so be patient:

In [4]:
/// One row of the comma-separated annealing files.
/// Each field is bound to its zero-based column position with `LoadColumn`;
/// the last column (index 38) is additionally exposed under the column name "Label".
/// Six measurements are loaded as `float32`; every other column is kept as a string.
[<CLIMutable>]
type AnnealingData =
    {
        [<LoadColumn(0)>] Family : string
        [<LoadColumn(1)>] ProductType : string
        [<LoadColumn(2)>] Steel : string
        [<LoadColumn(3)>] Carbon : float32
        [<LoadColumn(4)>] Hardness : float32
        [<LoadColumn(5)>] TemperRolling : string
        [<LoadColumn(6)>] Condition : string
        [<LoadColumn(7)>] Formability : string
        [<LoadColumn(8)>] Strength : float32
        [<LoadColumn(9)>] NonAgeing : string
        [<LoadColumn(10)>] SurfaceFinish : string
        [<LoadColumn(11)>] SurfaceQuality : string
        [<LoadColumn(12)>] Enamelability : string
        [<LoadColumn(13)>] Bc : string
        [<LoadColumn(14)>] Bf : string
        [<LoadColumn(15)>] Bt : string
        [<LoadColumn(16)>] BwMe : string
        [<LoadColumn(17)>] Bl : string
        [<LoadColumn(18)>] M : string
        [<LoadColumn(19)>] Chrom : string
        [<LoadColumn(20)>] Phos : string
        [<LoadColumn(21)>] Cbond : string
        [<LoadColumn(22)>] Marvi : string
        [<LoadColumn(23)>] Exptl : string
        [<LoadColumn(24)>] Ferro : string
        [<LoadColumn(25)>] Corr : string
        [<LoadColumn(26)>] BlueBrightVarnClean : string
        [<LoadColumn(27)>] Lustre : string
        [<LoadColumn(28)>] Jurofm : string
        [<LoadColumn(29)>] S : string
        [<LoadColumn(30)>] P : string
        [<LoadColumn(31)>] Shape : string
        [<LoadColumn(32)>] Thick : float32
        [<LoadColumn(33)>] Width : float32
        [<LoadColumn(34)>] Len : float32
        [<LoadColumn(35)>] Oil : string
        [<LoadColumn(36)>] Bore : string
        [<LoadColumn(37)>] Packing : string

        // The target column: loaded from position 38 and surfaced as "Label".
        [<LoadColumn(38); ColumnName("Label")>] Classes : string
    }

Now that we have an `MLContext` (created by the `FunctionalMl` module we loaded above) and a class to represent our data, we can load the file into a `DataView`. It is good practice to shuffle the data after loading: many datasets come ordered by some column of values or, even worse, by the label. For training a model we want our data to be in random order:

In [5]:
// Training rows: read the header-less, comma-separated file into a data view
// shaped like AnnealingData, then hand it to ML.shuffle.
let trainData =
    let rawTrainRows =
        ML.context.Data.LoadFromTextFile<AnnealingData>("anneal.data", hasHeader = false, separatorChar = ',')
    ML.shuffle rawTrainRows

Next we will load the test data.

In [6]:
// Test rows: loaded exactly like the training rows, but from anneal.test.
let testData =
    let rawTestRows =
        ML.context.Data.LoadFromTextFile<AnnealingData>("anneal.test", hasHeader = false, separatorChar = ',')
    ML.shuffle rawTestRows

Now we declare the features of the dataset that we will train our model on:

In [7]:
// Every AnnealingData field except the label (Classes), in declaration order.
let featureColumns =
    [|
        "Family"; "ProductType"; "Steel"; "Carbon"; "Hardness"; "TemperRolling"
        "Condition"; "Formability"; "Strength"; "NonAgeing"; "SurfaceFinish"; "SurfaceQuality"
        "Enamelability"; "Bc"; "Bf"; "Bt"; "BwMe"; "Bl"
        "M"; "Chrom"; "Phos"; "Cbond"; "Marvi"; "Exptl"
        "Ferro"; "Corr"; "BlueBrightVarnClean"; "Lustre"; "Jurofm"; "S"
        "P"; "Shape"; "Thick"; "Width"; "Len"; "Oil"
        "Bore"; "Packing"
    |]

There are a number of categorical columns (string values that represent discrete values) in the data. We will need to encode those columns, so we declare which columns are categorical here:

In [8]:
// The string-typed feature columns, i.e. all features except the six
// float32 fields (Carbon, Hardness, Strength, Thick, Width, Len).
let categoricalColumns =
    [|
        "Family"; "ProductType"; "Steel"; "TemperRolling"
        "Condition"; "Formability"; "NonAgeing"; "SurfaceFinish"
        "SurfaceQuality"; "Enamelability"; "Bc"; "Bf"
        "Bt"; "BwMe"; "Bl"; "M"
        "Chrom"; "Phos"; "Cbond"; "Marvi"
        "Exptl"; "Ferro"; "Corr"; "BlueBrightVarnClean"
        "Lustre"; "Jurofm"; "S"; "P"
        "Shape"; "Oil"; "Bore"; "Packing"
    |]

What are we trying to predict with this data? The column called `Classes` which we have identified as the `Label` in our type is the value that we will try to predict. In order to determine whether this is a regression or a classification problem, we need to look at the values that Classes can take on--are they continuous or are they categorical?

In [9]:
// Distinct values of the "Label" column in the training data.
Seq.distinct (trainData.GetColumn<string>("Label"))

index,value
0,3
1,U
2,2
3,1
4,5


So `Classes` takes only a handful of discrete values, and all but one of them are numeric. Because the values are discrete rather than continuous, we will treat this as a classification problem, which means that we need to treat `Classes` as a categorical variable. In our pipeline, we will map the value of our `Label` (i.e., `Classes`) column to a Key (see below). The first step in the pipeline will one-hot encode all of the categorical columns. We will also concatenate all of the feature columns into a single new column, `Features`. Finally, we map the original `Label` column to a new column, `LabelValue`, just for purposes of displaying later on.

In [10]:
// Preprocessing pipeline: encode categoricals, key the label, build Features,
// and keep a readable copy of the label.
let pipeline =
    // One one-hot encoder per categorical column, folded into a new EstimatorChain.
    let encoderChain =
        categoricalColumns
        |> Seq.map ML.onehot
        |> Seq.fold ML.append (EstimatorChain())
    // Map label values to keys (the column is overwritten in place).
    let withLabelKey = ML.append encoderChain (ML.mapValueToKey "Label" "Label")
    // Concatenate the feature columns into a single new column, "Features".
    let withFeatures = ML.append withLabelKey (ML.concatenate "Features" featureColumns)
    // Map the keyed label back to its value in a separate "LabelValue" column.
    ML.append withFeatures (ML.mapKeyToValue "Label" "LabelValue")

Fit the pipeline to our training data:

In [11]:
// The preprocessing pipeline fitted on the training data.
let transformer = ML.fit trainData pipeline

Let's first view the data as it was loaded from the downloaded file:

In [12]:
// First five training rows, materialised as AnnealingData records
// (a fresh record per row, since reuseRowObject is false).
Seq.take 5 (ML.context.Data.CreateEnumerable<AnnealingData>(trainData, reuseRowObject = false))

index,Family,ProductType,Steel,Carbon,Hardness,TemperRolling,Condition,Formability,Strength,NonAgeing,SurfaceFinish,SurfaceQuality,Enamelability,Bc,Bf,Bt,BwMe,Bl,M,Chrom,Phos,Cbond,Marvi,Exptl,Ferro,Corr,BlueBrightVarnClean,Lustre,Jurofm,S,P,Shape,Thick,Width,Len,Oil,Bore,Packing,Classes
0,?,C,W,0,0,?,?,?,310,?,?,G,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,SHEET,1.599,1275,762,?,0,?,3
1,?,C,R,0,0,?,S,2,0,?,?,E,?,?,Y,?,?,Y,?,?,?,?,?,?,?,?,?,?,?,?,?,SHEET,0.699,1320,4880,Y,0,?,3
2,?,C,R,0,0,?,S,3,0,?,?,E,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,SHEET,0.6,610,4880,?,0,?,3
3,?,C,K,65,0,?,?,?,0,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,1.09,610,0,?,0,?,3
4,?,C,A,0,85,T,?,?,0,?,?,G,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,COIL,3.2,610,0,Y,0,?,U


Now let's see what the data looks like after it has been transformed by our pipeline:

In [13]:
/// Row shape used to read the pipeline output back: the "LabelValue" column
/// (surfaced as Classes) and the "Features" vector, declared with 84 slots.
[<CLIMutable>]
type AnnealingDataTransformed =
    {
        [<ColumnName("LabelValue")>] Classes : string
        [<VectorType(84)>] Features : float32[]
    }

// Training data after it has passed through the fitted preprocessing pipeline.
let transformedData = ML.transform transformer trainData

// First three transformed rows, read back as AnnealingDataTransformed records.
Seq.take 3 (ML.context.Data.CreateEnumerable<AnnealingDataTransformed>(transformedData, reuseRowObject = false))

index,Classes,Features
0,3,"[ 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 ... (74 more) ]"
1,3,"[ 1, 0, 0, 1, 0, 1, 0, 0, 0, 0 ... (74 more) ]"
2,3,"[ 1, 0, 0, 1, 0, 1, 0, 0, 0, 0 ... (74 more) ]"


Whoa! Notice how there are 80+ columns after running the data through the pipeline! This is due to one-hot encoding, which creates a new column for each discrete value in our categorical columns. Don't worry though, this is no problem for ML.NET, which can deal with hundreds, even thousands, of features in a dataset.

Now we will create a multiclass classification estimator. You can try different estimators to see how their accuracy differs.

In [14]:
// Multiclass trainer reading the "Features" column; only the feature column
// name is passed explicitly. The result goes through ML.downcastEstimator.
let estimator =
    let trainer =
        ML.context.MulticlassClassification.Trainers.LbfgsMaximumEntropy(featureColumnName = "Features")
    ML.downcastEstimator trainer

Use cross-validation to select the best performing model. Along the way we will print the metrics for our model.

In [15]:
// Pick the model from the cross-validation fold with the highest macro accuracy.
let model =
    let foldResults =
        ML.transform transformer trainData                      // preprocess the training data
        |> ML.crossValidateMulticlassClassification estimator 3 // 3-fold cross-validation
        |> ML.printMulticlassClassificationCvMetrics            // print cross-fold metrics
    let bestFold =
        foldResults
        |> Seq.maxBy (fun fold -> fold.Metrics.MacroAccuracy)
    bestFold.Model

------------------
Cross Validation Metrics
------------------
Accuracy: 0.885410
Log Loss: 0.175310


OK, now we can use our best model on the test data.

In [16]:
// Score the test set and print its metrics.
testData
|> ML.transform transformer                     // same preprocessing as the training data
|> ML.transform model                           // add the model's predictions
|> ML.context.MulticlassClassification.Evaluate // compute test metrics
|> ML.printMulticlassClassificationMetrics

------------------
Test Metrics
------------------
Accuracy: 0.951754
Log Loss: 0.160146
Confusion Matrix:

Confusion table
PREDICTED ||     3 |     U |     2 |     1 |     5 | Recall
        3 ||    74 |     0 |     2 |     0 |     0 | 0.9737
        U ||     1 |     5 |     0 |     0 |     0 | 0.8333
        2 ||     0 |     0 |    11 |     0 |     0 | 1.0000
        1 ||     0 |     0 |     0 |     0 |     0 | 0.0000
        5 ||     0 |     0 |     0 |     0 |     7 | 1.0000
Precision ||0.9867 |1.0000 |0.8462 |0.0000 |1.0000 |



Now let's pretend we have new data (for convenience we are just randomly re-sampling the test data) to see what predictions our model makes. You will see three properties to each prediction:
- `LabelValue`: this is the actual `Label` value we are trying to predict. Our model doesn't know what the actual value is--it is shown here for comparison. Since our `Label` column is mapped to a key value, we need to map it to another column in order to actually see the human-readable key.
- `Score`: an array of probabilities, one per class.
- `PredictedLabelValue`: this is the actual prediction made by the model. Again, since our `PredictedLabel` is a Key value, we need to map it to a new, human-readable column in order to view it.

You can run this cell multiple times to get new random samples and their predictions!

In [17]:
/// Row shape used to read predictions back: the readable true label,
/// the score array, and the readable predicted label.
[<CLIMutable>]
type MulticlassClassificationPrediction =
    {
        LabelValue : string
        Score : single[]
        PredictedLabelValue : string
    }

// Stand-in for "new" data: re-shuffle the test set, preprocess it, then score it.
let sampleData =
    let reshuffled = ML.shuffle testData
    let preprocessed = ML.transform transformer reshuffled
    ML.transform model preprocessed

// Fitted chain that turns the key-typed "PredictedLabel" and "Label" columns
// into readable "PredictedLabelValue" and "LabelValue" columns.
let mapValues =
    let predictedToValue =
        ML.append (EstimatorChain()) (ML.mapKeyToValue "PredictedLabel" "PredictedLabelValue")
    let keysToValues =
        ML.append predictedToValue (ML.mapKeyToValue "Label" "LabelValue")
    ML.fit sampleData keysToValues

// Apply the key-to-value mappings to the scored sample.
let samplePredictions = mapValues.Transform(sampleData)
// Last expression of the cell: the first five predictions as records.
Seq.take 5 (ML.context.Data.CreateEnumerable<MulticlassClassificationPrediction>(samplePredictions, reuseRowObject = false))

index,LabelValue,Score,PredictedLabelValue
0,3,"[ 0.9847105, 0.015290131, 3.50356E-24, 5.8728496E-15, 3.3196028E-21 ]",3
1,3,"[ 0.99201024, 0.00021785192, 0.0059491727, 0.0008131322, 0.0010096285 ]",3
2,3,"[ 0.90346605, 0.0006509572, 0.08337129, 0, 0.012506964 ]",3
3,2,"[ 0.37503505, 0, 0.62496334, 0, 0 ]",2
4,3,"[ 0.7016915, 0, 0.04904273, 0.24926533, 0 ]",3
