In [1]:
#r "nuget: Microsoft.ML"
#load "./Modules/MLWrapper.fs"
open Microsoft.ML
open Microsoft.ML.Data
open System.IO
open System.Net
open FunctionalMl

In [2]:
// Remote source of the data set and the local file name this notebook reads it from.
let dataUrl = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
let dataFile = "imports-85.data"

// Download the data file only when there is no local copy yet.
if not (File.Exists(dataFile)) then
    use client = new WebClient()
    client.DownloadFile(dataUrl, dataFile)

// Seq.length is used instead of the LINQ Count() extension method so this cell
// does not depend on System.Linq being opened by the host.
printfn "Data file has %d lines" (File.ReadLines(dataFile) |> Seq.length)

// Last expression of the cell: the first five raw lines of the file.
File.ReadLines(dataFile)
|> Seq.take 5

Data file has 205 lines


index,value
0,"3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,168.80,64.10,48.80,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495"
1,"3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,168.80,64.10,48.80,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,16500"
2,"1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.50,171.20,65.50,52.40,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500"
3,"2,164,audi,gas,std,four,sedan,fwd,front,99.80,176.60,66.20,54.30,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950"
4,"2,164,audi,gas,std,four,sedan,4wd,front,99.40,176.60,66.40,54.30,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450"


In [3]:
/// Input schema for one row of "imports-85.data".
/// LoadColumn gives the zero-based position of each field in the comma-separated file.
[<CLIMutable>]
type AutosData =
    {
        // Kept as text even though the sample rows show numbers here;
        // it is listed in categoricalColumns.
        [<LoadColumn(0)>] Symboling : string

        // The sample rows of the raw file show "?" in this column for some cars.
        // NOTE(review): presumably loaded as a missing value - confirm against the text loader.
        [<LoadColumn(1)>] NormLosses : single

        [<LoadColumn(2)>] Make : string
        [<LoadColumn(3)>] FuelType : string
        [<LoadColumn(4)>] Aspiration : string
        [<LoadColumn(5)>] NumDoors : string
        [<LoadColumn(6)>] BodyStyle : string
        [<LoadColumn(7)>] DriveWheels : string
        [<LoadColumn(8)>] EngineLoc : string

        [<LoadColumn(9)>] WheelBase : single
        [<LoadColumn(10)>] Length : single
        [<LoadColumn(11)>] Width : single
        [<LoadColumn(12)>] Height : single
        [<LoadColumn(13)>] CurbWeight : single

        [<LoadColumn(14)>] EngineType : string
        [<LoadColumn(15)>] NumCylinders : string
        [<LoadColumn(16)>] EngineSize : single
        [<LoadColumn(17)>] FuelSystem : string

        [<LoadColumn(18)>] Bore : single
        [<LoadColumn(19)>] Stroke : single
        [<LoadColumn(20)>] CompressionRatio : single
        [<LoadColumn(21)>] Horsepower : single
        [<LoadColumn(22)>] PeakRpm : single
        [<LoadColumn(23)>] CityMpg : single
        [<LoadColumn(24)>] HighwayMpg : single

        // Last column of the file; exposed to ML.NET under the column name "Label".
        [<LoadColumn(25); ColumnName("Label")>] Price : single
    }

// Load the local data file as AutosData rows: comma-separated, no header row.
let allData =
    ML.context.Data.LoadFromTextFile<AutosData>(
        "imports-85.data",
        separatorChar = ',',
        hasHeader = false)

In [4]:
// Shuffle the full data set, then split it into the (train, test) pair.
// NOTE(review): 0.2 is presumably the test fraction - confirm in ML.split.
let trainData, testData =
    allData
    |> ML.shuffle
    |> ML.split 0.2

In [5]:
// Every input column except the label, in file order; these are concatenated
// into the "Features" vector by the pipeline.
let featureColumns =
    [| "Symboling"; "NormLosses"; "Make"; "FuelType"; "Aspiration";
       "NumDoors"; "BodyStyle"; "DriveWheels"; "EngineLoc"; "WheelBase";
       "Length"; "Width"; "Height"; "CurbWeight"; "EngineType";
       "NumCylinders"; "EngineSize"; "FuelSystem"; "Bore"; "Stroke";
       "CompressionRatio"; "Horsepower"; "PeakRpm"; "CityMpg"; "HighwayMpg" |]

// The string-typed columns of AutosData; each one is passed to ML.onehot by the pipeline.
let categoricalColumns =
    [| "Symboling"
       "Make"
       "FuelType"
       "Aspiration"
       "NumDoors"
       "BodyStyle"
       "DriveWheels"
       "EngineLoc"
       "EngineType"
       "NumCylinders"
       "FuelSystem" |]

In [61]:
// Featurisation pipeline, built stage by stage with ML.append:
//   1. ML.onehot for every categorical column,
//   2. concatenate all feature columns into "Features",
//   3. replace missing values in "Features" with the default value,
//   4. min-max normalise "Features" into "FeaturesNorm".
let pipeline =
    let encodedStages =
        categoricalColumns
        |> Seq.map ML.onehot
        |> Seq.fold ML.append (EstimatorChain())
    let withFeatures =
        ML.append encodedStages (ML.concatenate "Features" featureColumns)
    let withoutMissing =
        ML.append withFeatures (ML.replaceMissingWithDefault "Features" "Features")
    ML.append withoutMissing (ML.normalizeMinMax "Features" "FeaturesNorm")

In [62]:
// Fit the featurisation pipeline on the training split; the fitted transformer
// is reused for the training, test and sample data below.
let transformer = ML.fit trainData pipeline

In [63]:
// Training split after it has passed through the fitted transformer.
let transformedData = ML.transform transformer trainData

In [64]:
// Cell output: the first five rows of the training split, read back as AutosData records.
Seq.take 5 (ML.context.Data.CreateEnumerable<AutosData>(trainData, reuseRowObject = false))

index,Symboling,NormLosses,Make,FuelType,Aspiration,NumDoors,BodyStyle,DriveWheels,EngineLoc,WheelBase,Length,Width,Height,CurbWeight,EngineType,NumCylinders,EngineSize,FuelSystem,Bore,Stroke,CompressionRatio,Horsepower,PeakRpm,CityMpg,HighwayMpg,Price
0,1,107,honda,gas,std,two,sedan,fwd,front,96.5,169.1,66.0,51.0,2293,ohc,four,110,2bbl,3.15,3.58,9.1,100,5500,25,31,10345
1,3,194,nissan,gas,turbo,two,hatchback,rwd,front,91.3,170.7,67.9,49.7,3139,ohcv,six,181,mpfi,3.43,3.27,7.8,200,5200,17,23,19699
2,2,161,mitsubishi,gas,std,two,hatchback,fwd,front,93.7,157.3,64.4,50.8,1918,ohc,four,92,2bbl,2.97,3.23,9.4,68,5500,37,41,5389
3,1,74,toyota,gas,std,four,hatchback,fwd,front,95.7,158.7,63.6,54.5,2015,ohc,four,92,2bbl,3.05,3.03,9.0,62,4800,31,38,6488
4,-2,103,volvo,gas,turbo,four,sedan,rwd,front,104.3,188.8,67.2,56.2,3045,ohc,four,130,mpfi,3.62,3.15,7.5,162,5100,17,22,18420


In [65]:
/// Row shape used to read the pipeline output back: the label plus the
/// "Features" and "FeaturesNorm" vectors produced by the pipeline.
// NOTE(review): the vector width 81 is hard-coded; presumably it has to match the
// width produced by the fitted pipeline - confirm if the training split can change.
[<CLIMutable>]
type AutosDataTransformed =
    {
        [<ColumnName("Label")>] Price : float32
        [<VectorType(81)>] Features : float32[]
        [<VectorType(81)>] FeaturesNorm : float32[]
    }

// Cell output: the first five transformed training rows (label, raw and normalised features).
Seq.take 5 (ML.context.Data.CreateEnumerable<AutosDataTransformed>(transformedData, reuseRowObject = false))

index,Price,Features,FeaturesNorm
0,10345,"[ 1, 0, 0, 0, 0, 0, 107, 1, 0, 0 ... (71 more) ]","[ 1, 0, 0, 0, 0, 0, 0.41796875, 1, 0, 0 ... (71 more) ]"
1,19699,"[ 0, 1, 0, 0, 0, 0, 194, 0, 1, 0 ... (71 more) ]","[ 0, 1, 0, 0, 0, 0, 0.7578125, 0, 1, 0 ... (71 more) ]"
2,5389,"[ 0, 0, 1, 0, 0, 0, 161, 0, 0, 1 ... (71 more) ]","[ 0, 0, 1, 0, 0, 0, 0.62890625, 0, 0, 1 ... (71 more) ]"
3,6488,"[ 1, 0, 0, 0, 0, 0, 74, 0, 0, 0 ... (71 more) ]","[ 1, 0, 0, 0, 0, 0, 0.2890625, 0, 0, 0 ... (71 more) ]"
4,18420,"[ 0, 0, 0, 1, 0, 0, 103, 0, 0, 0 ... (71 more) ]","[ 0, 0, 0, 1, 0, 0, 0.40234375, 0, 0, 0 ... (71 more) ]"


In [66]:
// SDCA regression trainer reading its features from "FeaturesNorm", passed through
// ML.downcastEstimator so it can be handed to the wrapper functions below.
let estimator =
    let sdcaTrainer = ML.context.Regression.Trainers.Sdca(featureColumnName = "FeaturesNorm")
    ML.downcastEstimator sdcaTrainer

In [67]:
// Train with 3-fold cross-validation on the transformed training data, print the
// cross-validation metrics, and keep the model of the fold with the highest R-squared.
let model =
    let foldResults =
        ML.transform transformer trainData
        |> ML.crossValidateRegression estimator 3
        |> ML.printRegressionCvMetrics
    let bestFold =
        foldResults
        |> Seq.maxBy (fun fold -> fold.Metrics.RSquared)
    bestFold.Model

------------------
Cross Validation Metrics
------------------
Mean Absolute Error: 1848.617544
Mean Squared Error: 7583718.191189
Root Mean Squared Error: 2715.370039
R-squared: 0.876045


In [68]:
// Evaluate on the held-out test split: featurise it with the fitted transformer,
// apply the selected model, then compute and print the regression metrics.
testData
|> ML.transform transformer
|> ML.transform model
|> ML.context.Regression.Evaluate
|> ML.printRegressionMetrics

------------------
Test Metrics
------------------
Mean Absolute Error: 2315.846842
Mean Squared Error: 9834880.631863
Root Mean Squared Error: 3136.061325
R-squared: 0.876683


In [69]:
/// Output row of the prediction engine: the Label carried over from the input
/// and the Score produced by the model.
[<CLIMutable>]
type RegressionPrediction =
    {
        Label : float32
        Score : float32
    }

// Sample predictions: reshuffle the test split and featurise it with the fitted transformer.
let sampleData =
    ML.transform transformer (ML.shuffle testData)

// Single-row predictor over the selected model: takes an AutosDataTransformed row
// and yields a RegressionPrediction.
let predictionEngine =
    ML.context.Model.CreatePredictionEngine<AutosDataTransformed, RegressionPrediction>(model)

// Cell output: label and predicted score for the first five sample rows.
ML.context.Data.CreateEnumerable<AutosDataTransformed>(sampleData, reuseRowObject = false)
|> Seq.take 5
|> Seq.map (fun row -> predictionEngine.Predict(row))

index,Label,Score
0,6529,7454.089
1,22625,18196.076
2,5195,7084.148
3,16900,16245.451
4,16845,15856.457
