In [1]:
#r "nuget: Microsoft.ML"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;

In [2]:
// Fetch the Abalone dataset once; later runs reuse the local copy.
// NOTE(review): WebClient is marked obsolete in .NET 6+; HttpClient is the documented replacement.
if (!File.Exists("abalone.data"))
{
    using (var client = new WebClient())
    {
        client.DownloadFile(
            address: "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",
            fileName: "abalone.data");
    }
}

// Preview the first five raw lines of the file (the cell's displayed value).
File.ReadLines("abalone.data").Take(5)

index,value
0,"M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15"
1,"M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7"
2,"F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9"
3,"M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10"
4,"I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7"


In [3]:
// Shared ML.NET entry point used by every later cell.
// NOTE(review): no seed is passed; per the MLContext documentation, shuffles and splits may then differ between runs.
MLContext context = new MLContext();

In [4]:
/// <summary>
/// One row of the comma-separated <c>abalone.data</c> file, mapped by column position.
/// </summary>
class AbaloneData
{
    /// <summary>Column 0: sex category as text (the sample rows contain "M", "F" and "I").</summary>
    [LoadColumn(0)]
    public string Sex { get; set; }

    /// <summary>Column 1: length measurement.</summary>
    [LoadColumn(1)]
    public float Length { get; set; }

    /// <summary>Column 2: diameter measurement.</summary>
    [LoadColumn(2)]
    public float Diameter { get; set; }

    /// <summary>Column 3: height measurement.</summary>
    [LoadColumn(3)]
    public float Height { get; set; }

    /// <summary>Column 4: whole weight.</summary>
    [LoadColumn(4)]
    public float WholeWeight { get; set; }

    /// <summary>Column 5: shucked weight.</summary>
    [LoadColumn(5)]
    public float ShuckedWeight { get; set; }

    /// <summary>Column 6: viscera weight.</summary>
    [LoadColumn(6)]
    public float VisceraWeight { get; set; }

    /// <summary>Column 7: shell weight.</summary>
    [LoadColumn(7)]
    public float ShellWeight { get; set; }

    /// <summary>
    /// Column 8: ring count. Exposed to ML.NET under the column name "Label",
    /// which makes it the value the regression model is trained to predict.
    /// </summary>
    [LoadColumn(8)]
    [ColumnName("Label")]
    public float Rings { get; set; }

    /// <summary>
    /// Returns a multi-line, human-readable dump of every field, labelled with the property names.
    /// </summary>
    public override string ToString() =>
        $"{{ Sex: {Sex}\n  Length: {Length}\n  Diameter: {Diameter}\n  Height: {Height}\n  WholeWeight: {WholeWeight}\n  ShuckedWeight: {ShuckedWeight}\n  VisceraWeight: {VisceraWeight}\n  ShellWeight: {ShellWeight}\n  Rings: {Rings} }}";
}


In [5]:
// Read the header-less CSV into an IDataView and shuffle the row order in one step.
IDataView allData = context.Data.ShuffleRows(
    context.Data.LoadFromTextFile<AbaloneData>(
        "abalone.data",
        separatorChar: ',',
        hasHeader: false));

In [6]:
// Hold out 20% of the rows for testing; the remaining rows are used for training.
var splitData = context.Data.TrainTestSplit(allData, testFraction: 0.2);
IDataView trainData = splitData.TrainSet;
IDataView testData = splitData.TestSet;

In [7]:
// Input columns, listed in the order they are concatenated into the "Features" vector.
// Sex comes first, so its one-hot slots lead the vector.
var featureColumns = new string[]
{
    nameof(AbaloneData.Sex),
    nameof(AbaloneData.Length),
    nameof(AbaloneData.Diameter),
    nameof(AbaloneData.Height),
    nameof(AbaloneData.WholeWeight),
    nameof(AbaloneData.ShuckedWeight),
    nameof(AbaloneData.VisceraWeight),
    nameof(AbaloneData.ShellWeight),
};

In [8]:
// Feature pipeline:
//   1. one-hot encode Sex in place,
//   2. concatenate every feature column into the "Features" vector,
//   3. write an Lp-normalised copy of that vector to "FeaturesNorm".
var pipeline = context.Transforms.Categorical
    .OneHotEncoding(outputColumnName: nameof(AbaloneData.Sex))
    .Append(context.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: featureColumns))
    .Append(context.Transforms.NormalizeLpNorm(outputColumnName: "FeaturesNorm", inputColumnName: "Features"));

In [9]:
// Fit the feature pipeline on the training split only; the resulting transformer is
// reused below for the training, test and sample data.
var transformer = pipeline.Fit(trainData);

Print the data as it was loaded from the file:

In [10]:
// Read the first three training rows back as AbaloneData objects
// (reuseRowObject: false yields a separate object for each row).
var sourceItems = context.Data.CreateEnumerable<AbaloneData>(data: trainData, reuseRowObject: false).Take(3);
sourceItems

index,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.29,0.23,0.075,0.1165,0.043,0.0255,0.04,7
1,M,0.58,0.45,0.175,1.068,0.425,0.203,0.32,13
2,I,0.605,0.47,0.14,0.939,0.3385,0.201,0.32,13


Compare that to the data as transformed by the pipeline. First, we one-hot encoded the `Sex` column. Then we concatenated all of the feature columns into a single new vector column, `Features`. Lastly, we scaled each row's `Features` vector to unit length (L2 norm) and put the result into a new vector column, `FeaturesNorm`. Notice that the first three values of `Features` are the one-hot encoded values of `Sex`.

In [11]:
/// <summary>
/// Row shape used to read the pipeline output back: the label plus the raw and
/// normalised feature vectors (10 slots each).
/// </summary>
class AbaloneDataTransformed
{
    /// <summary>Ring count, bound to the "Label" column.</summary>
    [ColumnName("Label")]
    public float Rings { get; set; }

    /// <summary>Concatenated feature vector produced by the pipeline.</summary>
    [VectorType(10)]
    public float[] Features { get; set; }

    /// <summary>Normalised copy of <see cref="Features"/>.</summary>
    [VectorType(10)]
    public float[] FeaturesNorm { get; set; }

    /// <summary>Formats the label and both vectors, with vector values comma-separated.</summary>
    public override string ToString()
    {
        var features = string.Join(",", Features);
        var featuresNorm = string.Join(",", FeaturesNorm);
        return $"{{ Rings: {Rings}\n  Features: {features}\n  FeaturesNorm: {featuresNorm} }}";
    }
}

// Banner printed above the table of transformed rows.
Console.WriteLine("------------------\nTransformed Data\n------------------");

// Apply the fitted pipeline to the training split and read the first three rows back.
var transformedData = transformer.Transform(trainData);
var transformedItems = context.Data.CreateEnumerable<AbaloneDataTransformed>(data: transformedData, reuseRowObject: false).Take(3);
transformedItems

------------------
Transformed Data
------------------


index,Rings,Features,FeaturesNorm
0,7,"[ 1, 0, 0, 0.29, 0.23, 0.075, 0.1165, 0.043, 0.0255, 0.04 ]","[ 0.928358, 0, 0, 0.2692238, 0.21352234, 0.06962685, 0.10815371, 0.039919395, 0.023673128, 0.03713432 ]"
1,13,"[ 1, 0, 0, 0.58, 0.45, 0.175, 1.068, 0.425, 0.203, 0.32 ]","[ 0.5740699, 0, 0, 0.33296055, 0.25833145, 0.100462236, 0.61310667, 0.24397972, 0.11653619, 0.18370236 ]"
2,13,"[ 0, 1, 0, 0.605, 0.47, 0.14, 0.939, 0.3385, 0.201, 0.32 ]","[ 0, 0.60350245, 0, 0.36511898, 0.28364617, 0.084490344, 0.56668884, 0.20428558, 0.121304, 0.19312078 ]"


Create an estimator:

In [12]:
// Poisson regression trainer that reads the normalised feature vector and the default "Label" column.
var estimator = context.Regression.Trainers.LbfgsPoissonRegression(
    labelColumnName: "Label",
    featureColumnName: "FeaturesNorm");

Now we run 3-fold cross-validation and keep the model from the fold with the highest R-squared.

In [13]:
// Apply the fitted feature pipeline to the training split.
var transformedTrainData = transformer.Transform(trainData);

// Train and evaluate one model per fold.
var cvResults = context.Regression.CrossValidate(
    data: transformedTrainData,
    estimator: estimator,
    numberOfFolds: 3);

// Keep the fold whose model achieved the highest R-squared.
var cvResult = (from fold in cvResults
                orderby fold.Metrics.RSquared descending
                select fold).First();

Here are the metrics averaged over all of the cross-validation folds:

In [14]:
// Each metric is averaged over all cross-validation folds.
new Dictionary<string, double>
{
    { "Mean Absolute Error", cvResults.Average(fold => fold.Metrics.MeanAbsoluteError) },
    { "Mean Squared Error", cvResults.Average(fold => fold.Metrics.MeanSquaredError) },
    { "Root Mean Squared Error", cvResults.Average(fold => fold.Metrics.RootMeanSquaredError) },
    { "R-squared", cvResults.Average(fold => fold.Metrics.RSquared) },
}

key,value
Mean Absolute Error,1.529807120300549
Mean Squared Error,4.576383122660482
Root Mean Squared Error,2.1389691768988564
R-squared,0.548731899412541


Now we can evaluate our model against the test data.

In [15]:
// Featurise the held-out rows with the pipeline fitted on the training split.
var transformedTestData = transformer.Transform(testData);

// Score them with the selected cross-validation model.
var predictions = cvResult.Model.Transform(transformedTestData);

// Compare the "Score" column against the "Label" column (both are the defaults).
var metrics = context.Regression.Evaluate(
    data: predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score");

Here are the metrics for our test data:

In [16]:
// Regression metrics for the held-out test split.
new Dictionary<string, double>
{
    { "Mean Absolute Error", metrics.MeanAbsoluteError },
    { "Mean Squared Error", metrics.MeanSquaredError },
    { "Root Mean Squared Error", metrics.RootMeanSquaredError },
    { "R-squared", metrics.RSquared },
}

key,value
Mean Absolute Error,1.5834955890743718
Mean Squared Error,4.7985268699886605
Root Mean Squared Error,2.190554009831453
R-squared,0.5734000540741993


Now let's pretend we have new data (for convenience we are just randomly re-sampling the test data) to see what predictions our model makes. You will see two values:
- `Label`: the actual number of rings from the example being predicted. Our model never sees this value, but we show it below so that you can see how close the predicted number of rings is to the actual value.
- `Score`: the number of rings predicted by the model. The closer this is to the `Label`, the more accurate the prediction.

You can run this cell multiple times to get new random samples and their predictions!

In [17]:
/// <summary>
/// Output row of the prediction engine: the actual value next to the model's prediction.
/// </summary>
class RegressionPrediction
{
    /// <summary>Actual ring count carried through from the input row.</summary>
    public float Label { get; set; }

    /// <summary>Ring count predicted by the model.</summary>
    public float Score { get; set; }

    /// <summary>Formats the pair as a single line.</summary>
    public override string ToString()
    {
        return $"Label: {Label}, Score: {Score}";
    }
}

// Draw a random ordering of the test rows and featurise them with the fitted pipeline.
var sampleData = context.Data.ShuffleRows(testData);
var transformedSampleData = transformer.Transform(sampleData);

// Single-row predictor built from the selected cross-validation model.
var predictionEngine = context.Model.CreatePredictionEngine<AbaloneDataTransformed, RegressionPrediction>(cvResult.Model);

// Predict the first five shuffled rows; evaluation is lazy and happens when the cell result is displayed.
context.Data
    .CreateEnumerable<AbaloneDataTransformed>(data: transformedSampleData, reuseRowObject: false)
    .Take(5)
    .Select(row => predictionEngine.Predict(row))

index,Label,Score
0,7,7.6620455
1,14,11.842239
2,8,9.775085
3,11,11.689067
4,6,6.720557
