In [16]:
#r "nuget: Microsoft.ML"
using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
// Fetch the raw data file once; later runs reuse the local copy.
if (!File.Exists("imports-85.data"))
{
    // HttpClient replaces WebClient, which is marked obsolete (SYSLIB0014) on .NET 6+.
    using var client = new HttpClient();

    // The payload is buffered in memory and written only after the request has
    // completed successfully (GetByteArrayAsync throws on a failed request), so
    // a failed download cannot leave behind a file that the File.Exists check
    // above would accept on the next run.
    var payload = await client.GetByteArrayAsync("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data");
    File.WriteAllBytes("imports-85.data", payload);
}

// Preview the first five raw lines; the trailing expression is the cell's displayed value.
File.ReadLines("imports-85.data").Take(5)

index,value
0,"3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,168.80,64.10,48.80,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495"
1,"3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,168.80,64.10,48.80,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,16500"
2,"1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.50,171.20,65.50,52.40,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500"
3,"2,164,audi,gas,std,four,sedan,fwd,front,99.80,176.60,66.20,54.30,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950"
4,"2,164,audi,gas,std,four,sedan,4wd,front,99.40,176.60,66.40,54.30,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450"


In [3]:
// Shared ML.NET entry point used by all later cells.
// NOTE(review): no seed is passed, so the shuffle/split/cross-validation below
// are presumably not reproducible between runs - confirm against the MLContext docs.
MLContext context = new MLContext();

In [9]:
/// <summary>
/// One row of the comma-separated <c>imports-85.data</c> file. Every property is
/// bound to a zero-based column index through <c>LoadColumn</c>; text columns are
/// declared as <c>string</c>, numeric columns as <c>float</c>.
/// </summary>
class AutosData
{
    // Declared as text although the raw values look numeric (e.g. 3, 1, 2);
    // the notebook lists this column among its categorical columns.
    [LoadColumn(0)] public string Symboling { get; set; }

    // NOTE(review): the raw file contains "?" in this column for some rows;
    // presumably those load as missing values (NaN) - confirm loader behavior.
    [LoadColumn(1)] public float NormLosses { get; set; }

    // Text descriptors, columns 2-8.
    [LoadColumn(2)] public string Make { get; set; }
    [LoadColumn(3)] public string FuelType { get; set; }
    [LoadColumn(4)] public string Aspiration { get; set; }
    [LoadColumn(5)] public string NumDoors { get; set; }
    [LoadColumn(6)] public string BodyStyle { get; set; }
    [LoadColumn(7)] public string DriveWheels { get; set; }
    [LoadColumn(8)] public string EngineLoc { get; set; }

    // Numeric measurements, columns 9-13.
    [LoadColumn(9)] public float WheelBase { get; set; }
    [LoadColumn(10)] public float Length { get; set; }
    [LoadColumn(11)] public float Width { get; set; }
    [LoadColumn(12)] public float Height { get; set; }
    [LoadColumn(13)] public float CurbWeight { get; set; }

    // Engine description, columns 14-22 (a mix of text and numeric columns).
    [LoadColumn(14)] public string EngineType { get; set; }
    [LoadColumn(15)] public string NumCylinders { get; set; }
    [LoadColumn(16)] public float EngineSize { get; set; }
    [LoadColumn(17)] public string FuelSystem { get; set; }
    [LoadColumn(18)] public float Bore { get; set; }
    [LoadColumn(19)] public float Stroke { get; set; }
    [LoadColumn(20)] public float CompressionRatio { get; set; }
    [LoadColumn(21)] public float Horsepower { get; set; }
    [LoadColumn(22)] public float PeakRpm { get; set; }

    // Fuel economy, columns 23-24.
    [LoadColumn(23)] public float CityMpg { get; set; }
    [LoadColumn(24)] public float HighwayMpg { get; set; }

    // Last column of the file; exposed to ML.NET under the column name "Label".
    [LoadColumn(25), ColumnName("Label")]
    public float Price { get; set; }
}

In [10]:
// Parse the comma-separated file (it has no header row) into rows shaped like
// AutosData, then pass the result through ShuffleRows before it is split.
IDataView allData = context.Data.ShuffleRows(
    context.Data.LoadFromTextFile<AutosData>("imports-85.data", separatorChar: ',', hasHeader: false));

In [11]:
// Hold out a 0.2 fraction of the shuffled rows as the test set; the remainder
// becomes the training set.
var splitData = context.Data.TrainTestSplit(allData, testFraction: 0.2);
IDataView trainData = splitData.TrainSet;
IDataView testData = splitData.TestSet;

In [13]:
// Names of the input columns used as model features, in file-column order.
// Every AutosData property is listed except Price (the label).
string[] featureColumns =
{
    nameof(AutosData.Symboling),
    nameof(AutosData.NormLosses),
    nameof(AutosData.Make),
    nameof(AutosData.FuelType),
    nameof(AutosData.Aspiration),
    nameof(AutosData.NumDoors),
    nameof(AutosData.BodyStyle),
    nameof(AutosData.DriveWheels),
    nameof(AutosData.EngineLoc),
    nameof(AutosData.WheelBase),
    nameof(AutosData.Length),
    nameof(AutosData.Width),
    nameof(AutosData.Height),
    nameof(AutosData.CurbWeight),
    nameof(AutosData.EngineType),
    nameof(AutosData.NumCylinders),
    nameof(AutosData.EngineSize),
    nameof(AutosData.FuelSystem),
    nameof(AutosData.Bore),
    nameof(AutosData.Stroke),
    nameof(AutosData.CompressionRatio),
    nameof(AutosData.Horsepower),
    nameof(AutosData.PeakRpm),
    nameof(AutosData.CityMpg),
    nameof(AutosData.HighwayMpg),
};

In [14]:
// The string-typed AutosData columns; each of these is one-hot encoded by the
// pipeline before the features are concatenated.
string[] categoricalColumns =
{
    nameof(AutosData.Symboling),
    nameof(AutosData.Make),
    nameof(AutosData.FuelType),
    nameof(AutosData.Aspiration),
    nameof(AutosData.NumDoors),
    nameof(AutosData.BodyStyle),
    nameof(AutosData.DriveWheels),
    nameof(AutosData.EngineLoc),
    nameof(AutosData.EngineType),
    nameof(AutosData.NumCylinders),
    nameof(AutosData.FuelSystem),
};

In [17]:
// Start from an empty estimator chain and append one one-hot encoder per
// categorical column. Append returns a new chain each time, so `chain` itself
// is left as the empty starting point.
var chain = new EstimatorChain<OneHotEncodingTransformer>();
var encoded = chain;
foreach (var columnName in categoricalColumns)
{
    encoded = encoded.Append(context.Transforms.Categorical.OneHotEncoding(columnName));
}

// Remaining stages, in order:
//   1. concatenate all feature columns into a single "Features" column,
//   2. replace missing values in "Features" using the DefaultValue mode,
//   3. write a min-max normalized copy of "Features" to "FeaturesNorm".
var pipeline = encoded
    .Append(context.Transforms.Concatenate("Features", featureColumns))
    .Append(context.Transforms.ReplaceMissingValues(
        "Features",
        replacementMode: MissingValueReplacingEstimator.ReplacementMode.DefaultValue))
    .Append(context.Transforms.NormalizeMinMax("FeaturesNorm", "Features"));

In [18]:
// Fit the preprocessing pipeline on the training split only; the fitted chain
// is reused later to transform both the training and the test rows.
TransformerChain<NormalizingTransformer> transformer = pipeline.Fit(trainData);

In [19]:
// Expose the training rows as AutosData objects (a fresh object per row) and
// keep the first five for display.
var trainRows = context.Data.CreateEnumerable<AutosData>(trainData, reuseRowObject: false);
var sourceItems = trainRows.Take(5);
sourceItems

index,Symboling,NormLosses,Make,FuelType,Aspiration,NumDoors,BodyStyle,DriveWheels,EngineLoc,WheelBase,Length,Width,Height,CurbWeight,EngineType,NumCylinders,EngineSize,FuelSystem,Bore,Stroke,CompressionRatio,Horsepower,PeakRpm,CityMpg,HighwayMpg,Price
0,-2,103,volvo,gas,turbo,four,sedan,rwd,front,104.3,188.8,67.2,56.2,3045,ohc,four,130,mpfi,3.62,3.15,7.5,162,5100,17,22,18420
1,1,113,mazda,gas,std,four,sedan,fwd,front,93.1,166.8,64.2,54.1,1945,ohc,four,91,2bbl,3.03,3.15,9.0,68,5000,31,38,6695
2,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,3062,ohc,four,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625
3,0,91,toyota,gas,std,four,hatchback,fwd,front,95.7,166.3,64.4,52.8,2109,ohc,four,98,2bbl,3.19,3.03,9.0,70,4800,30,37,7198
4,1,161,mitsubishi,gas,turbo,two,hatchback,fwd,front,93.0,157.3,63.8,50.8,2145,ohc,four,98,spdi,3.03,3.39,7.6,102,5500,24,30,7689


In [21]:
/// <summary>
/// Row shape used to read the pipeline output back into .NET objects: the
/// "Label" column plus the raw and normalized feature vectors.
/// </summary>
class AutosDataTransformed
{
    // Bound to the "Label" column (the price).
    [ColumnName("Label")] public float Price { get; set; }

    // Both vectors are declared with a fixed length of 81, which matches the
    // 81-element vectors shown in this notebook's recorded output.
    // NOTE(review): that length presumably depends on how many distinct
    // category values the fitted one-hot encoders saw - confirm when re-running.
    [VectorType(81)] public float[] Features { get; set; }
    [VectorType(81)] public float[] FeaturesNorm { get; set; }
}

// Run the fitted pipeline over the training rows and show the first three
// transformed rows (label, raw features, normalized features).
IDataView transformedData = transformer.Transform(trainData);
context.Data.CreateEnumerable<AutosDataTransformed>(transformedData, reuseRowObject: false).Take(3)

index,Price,Features,FeaturesNorm
0,18420,"[ 1, 0, 0, 0, 0, 0, 103, 1, 0, 0 ... (71 more) ]","[ 1, 0, 0, 0, 0, 0, 0.40234375, 1, 0, 0 ... (71 more) ]"
1,6695,"[ 0, 1, 0, 0, 0, 0, 113, 0, 1, 0 ... (71 more) ]","[ 0, 1, 0, 0, 0, 0, 0.44140625, 0, 1, 0 ... (71 more) ]"
2,22625,"[ 0, 0, 1, 0, 0, 0, 95, 1, 0, 0 ... (71 more) ]","[ 0, 0, 1, 0, 0, 0, 0.37109375, 1, 0, 0 ... (71 more) ]"


In [22]:
// SDCA regression trainer reading the normalized feature vector; the label
// column is spelled out with its default name.
var estimator = context.Regression.Trainers.Sdca(
    labelColumnName: "Label",
    featureColumnName: "FeaturesNorm");

In [23]:
// Cross-validate the trainer on the transformed training rows using 3 folds.
IDataView transformedTrainData = transformer.Transform(trainData);
var cvResults = context.Regression.CrossValidate(transformedTrainData, estimator, numberOfFolds: 3);

// Keep the fold whose metrics report the highest R-squared.
var cvResult =
    (from fold in cvResults
     orderby fold.Metrics.RSquared descending
     select fold).First();

In [24]:
// Average each regression metric across all cross-validation folds; the
// dictionary is the cell's displayed value.
new Dictionary<string, double>
{
    { "Mean Absolute Error", cvResults.Select(fold => fold.Metrics.MeanAbsoluteError).Average() },
    { "Mean Squared Error", cvResults.Select(fold => fold.Metrics.MeanSquaredError).Average() },
    { "Root Mean Squared Error", cvResults.Select(fold => fold.Metrics.RootMeanSquaredError).Average() },
    { "R-squared", cvResults.Select(fold => fold.Metrics.RSquared).Average() },
}

key,value
Mean Absolute Error,1899.6594520291696
Mean Squared Error,7400438.041185128
Root Mean Squared Error,2719.9523289832127
R-squared,0.8791974315030332


In [25]:
// Score the held-out test rows with the selected fold's model and compute
// regression metrics; the label/score column names are spelled out with their
// default values.
IDataView transformedTestData = transformer.Transform(testData);
IDataView predictions = cvResult.Model.Transform(transformedTestData);
RegressionMetrics metrics = context.Regression.Evaluate(
    predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score");

In [26]:
// Display the test-set regression metrics computed in the previous cell.
metrics

MeanAbsoluteError,MeanSquaredError,RootMeanSquaredError,LossFunction,RSquared
2067.1882161458334,10703392.731119333,3271.603999740698,10703392.534375,0.8171304713913203


In [27]:
/// <summary>
/// Output row of the prediction engine: the "Label" value carried over from
/// the input next to the model's "Score".
/// </summary>
class RegressionPrediction
{
    public float Label { get; set; }

    public float Score { get; set; }
}

// Show some sample predictions
var sampleData = context.Data.ShuffleRows(testData);
var transformedSampleData = transformer.Transform(sampleData);

// By default the prediction engine derives its input schema from the attributes
// on AutosDataTransformed, i.e. from the hard-coded [VectorType(81)]. The real
// width of "Features"/"FeaturesNorm" is decided when the pipeline is fitted
// (the one-hot encoders only know the category values present in the training
// split), so a different random split can yield a different width and a schema
// mismatch. Taking the vector column types from the transformed data keeps the
// engine's input schema in step with what the pipeline actually produced.
var inputSchema = SchemaDefinition.Create(typeof(AutosDataTransformed));
inputSchema["Features"].ColumnType = transformedSampleData.Schema["Features"].Type;
inputSchema["FeaturesNorm"].ColumnType = transformedSampleData.Schema["FeaturesNorm"].Type;

var predictionEngine = context.Model.CreatePredictionEngine<AutosDataTransformed, RegressionPrediction>(
    cvResult.Model,
    inputSchemaDefinition: inputSchema);

// Lazily predict the first five shuffled test rows; the query is the cell's
// displayed value, so the engine must stay alive until it has been enumerated.
context.Data.CreateEnumerable<AutosDataTransformed>(transformedSampleData, reuseRowObject: false)
    .Take(5)
    .Select(predictionEngine.Predict)

index,Label,Score
0,9988,10572.376
1,10698,11575.641
2,19699,20869.21
3,6529,7426.263
4,17075,16525.73
