In [1]:
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.LightGbm"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
// Fetch the UCI Balance Scale data set once and cache it next to the notebook;
// later runs reuse the local copy.
if (!File.Exists("balance-scale.data"))
{
    // HttpClient is used instead of WebClient, which is marked obsolete (SYSLIB0014) on current .NET.
    // The type is fully qualified so the cell does not depend on an extra using directive.
    using var http = new System.Net.Http.HttpClient();

    // GetByteArrayAsync throws on a non-success status code, so the file is only
    // written after the complete response body has been received.
    var payload = await http.GetByteArrayAsync("https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data");
    File.WriteAllBytes("balance-scale.data", payload);
}

// Report the size of the file and preview its first five raw lines.
Console.WriteLine($"Data file has {File.ReadLines("balance-scale.data").Count():n0} lines");
File.ReadLines("balance-scale.data").Take(5)

Data file has 625 lines


index,value
0,"B,1,1,1,1"
1,"R,1,1,1,2"
2,"R,1,1,1,3"
3,"R,1,1,1,4"
4,"R,1,1,1,5"


> # Data Set Information:

> This data set was generated to model psychological experimental results. Each example is classified as having the balance scale tip to the right, tip to the left, or stay balanced. The attributes are the left weight, the left distance, the right weight, and the right distance. To find the class, compare (left-distance * left-weight) with (right-distance * right-weight): the scale tips toward the side with the greater product, and if the two products are equal, it is balanced.

In [3]:
// Shared ML.NET entry point for every later cell.
// A fixed seed makes the context's random number generator deterministic, so the
// shuffle, train/test split, cross-validation folds and training are repeatable between runs.
var context = new MLContext(seed: 0);

In [4]:
/// <summary>
/// One raw record of <c>balance-scale.data</c>. Each property is bound to a
/// comma-separated column of the file by its zero-based position.
/// </summary>
class BalanceScaleRow
{
    /// <summary>Class value read from column 0; surfaced to ML.NET under the column name "Label".</summary>
    [LoadColumn(0), ColumnName("Label")]
    public string ClassName { get; set; }

    /// <summary>Left weight, read from column 1.</summary>
    [LoadColumn(1)]
    public float LeftWeight { get; set; }

    /// <summary>Left distance, read from column 2.</summary>
    [LoadColumn(2)]
    public float LeftDistance { get; set; }

    /// <summary>Right weight, read from column 3.</summary>
    [LoadColumn(3)]
    public float RightWeight { get; set; }

    /// <summary>Right distance, read from column 4.</summary>
    [LoadColumn(4)]
    public float RightDistance { get; set; }
}

/// <summary>
/// A <see cref="BalanceScaleRow"/> extended with two engineered features.
/// The notebook's loading cell fills them with the weight-times-distance product of each side.
/// </summary>
class BalanceScaleData : BalanceScaleRow
{
    /// <summary>Product of the left weight and the left distance.</summary>
    public float LeftFactor { get; set; }
    /// <summary>Product of the right weight and the right distance.</summary>
    public float RightFactor { get; set; }
}

In [5]:
// Parse the comma-separated file (no header line) into a data view of raw rows.
var allRows = context.Data.LoadFromTextFile<BalanceScaleRow>("balance-scale.data", hasHeader: false, separatorChar: ',');

// Pull the rows back out as objects so the engineered features can be computed with plain LINQ.
var rowsEnum = context.Data.CreateEnumerable<BalanceScaleRow>(allRows, reuseRowObject: false);

// ToList() materializes the projection exactly once. Without it the query is deferred,
// so every later pass over the data (preview, shuffle, split, cross-validation, training)
// would re-read the file and rebuild every object.
var allData = rowsEnum
    .Select(d => new BalanceScaleData
    {
        ClassName = d.ClassName,
        LeftWeight = d.LeftWeight,
        LeftDistance = d.LeftDistance,
        RightWeight = d.RightWeight,
        RightDistance = d.RightDistance,
        // Engineered features: the weight-times-distance product of each side.
        LeftFactor = d.LeftWeight * d.LeftDistance,
        RightFactor = d.RightWeight * d.RightDistance,
    })
    .ToList();

// Preview the first five enriched rows.
allData.Take(5)

index,LeftFactor,RightFactor,ClassName,LeftWeight,LeftDistance,RightWeight,RightDistance
0,1,1,B,1,1,1,1
1,1,2,R,1,1,1,2
2,1,3,R,1,1,1,3
3,1,4,R,1,1,1,4
4,1,5,R,1,1,1,5


In [6]:
// Wrap the in-memory rows in a data view and shuffle their order in a single expression.
var allDataView = context.Data.ShuffleRows(
    context.Data.LoadFromEnumerable(allData));

In [7]:
// Hold back 20% of the rows as a test set; the remainder becomes the training set.
var splitData = context.Data.TrainTestSplit(allDataView, testFraction: 0.2);
var trainData = splitData.TrainSet;
var testData = splitData.TestSet;

In [8]:
// Names of the input columns that are concatenated into the feature vector:
// the four raw measurements followed by the two engineered products.
var featureColumns = new[]
{
    nameof(BalanceScaleData.LeftWeight),
    nameof(BalanceScaleData.LeftDistance),
    nameof(BalanceScaleData.RightWeight),
    nameof(BalanceScaleData.RightDistance),
    nameof(BalanceScaleData.LeftFactor),
    nameof(BalanceScaleData.RightFactor),
};

In [9]:
// Show which distinct class labels occur in the training split.
trainData
    .GetColumn<string>("Label")
    .Distinct()

index,value
0,R
1,B
2,L


In [10]:
// Preprocessing pipeline:
//   1. encode the string "Label" column as a key, in place;
//   2. keep a readable copy of the label in "LabelValue" by mapping the key back to its value;
//   3. pack the selected input columns into a single "Features" vector.
var pipeline = context.Transforms.Conversion
    .MapValueToKey(outputColumnName: "Label")
    .Append(context.Transforms.Conversion.MapKeyToValue(
        outputColumnName: "LabelValue",
        inputColumnName: "Label"))
    .Append(context.Transforms.Concatenate(
        outputColumnName: "Features",
        inputColumnNames: featureColumns));

In [11]:
// Fit the preprocessing pipeline on the training split only; the resulting
// transformer is reused unchanged for the training, test and sample data below.
var transformer = pipeline.Fit(trainData);

In [12]:
/// <summary>
/// Projection of a preprocessed row, used only to preview the pipeline output:
/// the readable label plus the concatenated feature vector.
/// </summary>
class BalanceScaleDataTransformed
{
    /// <summary>Readable class label, read from the "LabelValue" column.</summary>
    [ColumnName("LabelValue")]
    public string Class { get; set; }

    /// <summary>
    /// Concatenated feature vector. The declared size is 6 because the pipeline
    /// concatenates six scalar columns (four raw measurements and two products).
    /// </summary>
    [VectorType(6)]
    public float[] Features { get; set; }
}

// Run the fitted preprocessing over the training split and preview three resulting rows.
var transformedData = transformer.Transform(trainData);
context.Data.CreateEnumerable<BalanceScaleDataTransformed>(transformedData, reuseRowObject: false).Take(3)

index,Class,Features
0,R,"[ 2, 3, 5, 4, 6, 20 ]"
1,B,"[ 3, 2, 2, 3, 6, 6 ]"
2,R,"[ 3, 4, 3, 5, 12, 15 ]"


In [13]:
// LightGBM multiclass trainer reading the "Features" vector; the label column is
// spelled out explicitly (it is the trainer's default name).
var estimator = context.MulticlassClassification.Trainers.LightGbm(
    labelColumnName: "Label",
    featureColumnName: "Features");

In [14]:
// Preprocess the training split, then train and score the estimator over three folds.
var transformedTrainData = transformer.Transform(trainData);
var cvResults = context.MulticlassClassification.CrossValidate(
    data: transformedTrainData,
    estimator: estimator,
    numberOfFolds: 3);

// Keep the fold whose model achieved the highest micro-accuracy.
var cvResult = cvResults.OrderByDescending(fold => fold.Metrics.MicroAccuracy).First();

In [15]:
// Display the confusion table of the selected cross-validation fold.
cvResult.Metrics.ConfusionMatrix.GetFormattedConfusionTable()


Confusion table
PREDICTED ||     R |     B |     L | Recall
        R ||    68 |     2 |     0 | 0.9714
        B ||     4 |     2 |     5 | 0.1818
        L ||     0 |     1 |    90 | 0.9890
Precision ||0.9444 |0.4000 |0.9474 |


In [16]:
// Display the full metrics object of the selected cross-validation fold.
cvResult.Metrics

LogLoss,LogLossReduction,MacroAccuracy,MicroAccuracy,TopKAccuracy,TopKPredictionCount,PerClassLogLoss,ConfusionMatrix
0.2299354308223574,0.7382762750300119,0.714085914085914,0.9302325581395348,0,0,"[ 0.05347525221009736, 2.7459424058073902, 0.06154131849293823 ]","{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.9444444444444444, 0.4, 0.9473684210526315 ], PerClassRecall: [ 0.9714285714285714, 0.18181818181818182, 0.989010989010989 ], Counts: [ [ 68, 2, 0 ], [ 4, 2, 5 ], [ 0, 1, 90 ] ], NumberOfClasses: 3 }"


In [17]:
// Apply the same fitted preprocessing to the held-out test split.
var transformedTestData = transformer.Transform(testData);

// Score the test rows with the model from the selected fold, then compute multiclass metrics.
var predictions = cvResult.Model.Transform(transformedTestData);
var metrics = context.MulticlassClassification.Evaluate(data: predictions);

// Display the confusion table for the test split.
metrics.ConfusionMatrix.GetFormattedConfusionTable()


Confusion table
PREDICTED ||     R |     B |     L | Recall
        R ||    54 |     0 |     0 | 1.0000
        B ||     4 |     1 |     3 | 0.1250
        L ||     0 |     1 |    56 | 0.9825
Precision ||0.9310 |0.5000 |0.9492 |


In [18]:
// Display the full metrics object computed on the test split.
metrics

LogLoss,LogLossReduction,MacroAccuracy,MicroAccuracy,TopKAccuracy,TopKPredictionCount,PerClassLogLoss,ConfusionMatrix
0.2429515112392795,0.7278202811012369,0.702485380116959,0.9327731092436976,0,0,"[ 0.030658050283661992, 2.8834298966337197, 0.0734781745453816 ]","{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.9310344827586207, 0.5, 0.9491525423728814 ], PerClassRecall: [ 1, 0.125, 0.9824561403508771 ], Counts: [ [ 54, 0, 0 ], [ 4, 1, 3 ], [ 0, 1, 56 ] ], NumberOfClasses: 3 }"


In [33]:
/// <summary>
/// Projection of a scored row, used to preview predictions next to the true label.
/// Property names match the data view columns they are read from.
/// </summary>
class MulticlassClassificationPrediction
{
    /// <summary>True class label in readable form, read from the "LabelValue" column.</summary>
    public string LabelValue { get; set; }

    /// <summary>Scores from the model's "Score" column (presumably one entry per class; confirm against the trainer).</summary>
    public float[] Score { get; set; }

    /// <summary>Predicted class in readable form, read from the "PredictedLabelValue" column.</summary>
    public string PredictedLabelValue { get; set; }
}

// Shuffle the test split and run it through the fitted preprocessing.
var sampleData = context.Data.ShuffleRows(testData);
var transformedSampleData = transformer.Transform(sampleData);

// Score the rows with the model from the selected cross-validation fold.
var samplePredictions = cvResult.Model.Transform(transformedSampleData);

// Map the key-typed predicted and true labels back to their readable values.
var mapValues = context.Transforms.Conversion
    .MapKeyToValue(
        outputColumnName: "PredictedLabelValue",
        inputColumnName: "PredictedLabel")
    .Append(context.Transforms.Conversion.MapKeyToValue(
        outputColumnName: "LabelValue",
        inputColumnName: "Label"))
    .Fit(samplePredictions);
samplePredictions = mapValues.Transform(samplePredictions);

// Expose the scored rows as objects and preview the first five.
var samplePredictionItems = context.Data.CreateEnumerable<MulticlassClassificationPrediction>(
    samplePredictions,
    reuseRowObject: false);

samplePredictionItems.Take(5)

index,LabelValue,Score,PredictedLabelValue
0,L,"[ 4.5974608E-05, 0.0025561184, 0.9973979 ]",L
1,L,"[ 4.3653403E-05, 2.3581317E-05, 0.99993277 ]",L
2,R,"[ 0.9983034, 0.0016201615, 7.644612E-05 ]",R
3,L,"[ 5.472069E-05, 0.004246467, 0.9956988 ]",L
4,B,"[ 0.28320578, 0.5134264, 0.20336781 ]",B
