In [75]:
#r "nuget: Microsoft.Data.Analysis"
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.LightGbm"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [76]:
if (!File.Exists("balance-scale.data"))
{
    using var client = new WebClient();
    client.DownloadFile("https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data", "balance-scale.data");
}

Console.WriteLine($"Data file has {File.ReadLines("balance-scale.data").Count():n0} lines");
File.ReadLines("balance-scale.data").Take(5)

Data file has 625 lines


index,value
0,"B,1,1,1,1"
1,"R,1,1,1,2"
2,"R,1,1,1,3"
3,"R,1,1,1,4"
4,"R,1,1,1,5"


> # Data Set Information:

> This data set was generated to model psychological experimental results. Each example is classified as having the balance scale tip to the right, tip to the left, or be balanced. The attributes are the left weight, the left distance, the right weight, and the right distance. The correct way to find the class is the greater of (left-distance * left-weight) and (right-distance * right-weight). If they are equal, it is balanced.

In [81]:
var columns = new[] { "ClassName", "LeftWeight", "LeftDistance", "RightWeight","RightDistance" };

var df = DataFrame.LoadCsv("balance-scale.data", header: false, guessRows: 100, columnNames: columns);

In [82]:
df.Columns
    .Select(column => (name: column.Name, type: column.DataType))
    .ToList()
    .ForEach(ctype => Console.WriteLine($"Column: {ctype.name}, Type: {ctype.type}"));

Column: ClassName, Type: System.String
Column: LeftWeight, Type: System.Single
Column: LeftDistance, Type: System.Single
Column: RightWeight, Type: System.Single
Column: RightDistance, Type: System.Single


In [84]:
df["ClassName"]
    .GroupColumnValues<string>()
    .Select((kv, _) => $"{kv.Key,6} n={kv.Value.Count}")
    .ToList()
    .ForEach(s => Console.WriteLine(s));

     B n=49
     R n=288
     L n=288


In [85]:
class BalanceScaleData
{
    public float LeftWeight { get; set; }
 
    public float LeftDistance { get; set; }
    
    public float RightWeight { get; set; }
    
    public float RightDistance { get; set; }
    
    public string ClassName { get; set; }
    
    public float LeftFactor { get; set; }
    
    public float RightFactor { get; set; }
}

In [88]:
df.Rows.Select(row =>
   new BalanceScaleData
   {
       LeftWeight = row[1],
       LeftDistance = row[2],
       RightWeight = row[3],
       RightDistance = row[4],
       LeftFactor = row[1] * row[2],
       ClassName = row[0]
   }
)

Unhandled exception: (4,21): error CS0266: Cannot implicitly convert type 'object' to 'float'. An explicit conversion exists (are you missing a cast?)
(5,23): error CS0266: Cannot implicitly convert type 'object' to 'float'. An explicit conversion exists (are you missing a cast?)
(6,22): error CS0266: Cannot implicitly convert type 'object' to 'float'. An explicit conversion exists (are you missing a cast?)
(7,24): error CS0266: Cannot implicitly convert type 'object' to 'float'. An explicit conversion exists (are you missing a cast?)
(8,21): error CS0019: Operator '*' cannot be applied to operands of type 'object' and 'object'
(9,20): error CS0266: Cannot implicitly convert type 'object' to 'string'. An explicit conversion exists (are you missing a cast?)

In [7]:
var context = new MLContext();

In [57]:
class BalanceScaleData
{
    [ColumnName("Label")]
    [LoadColumn(0)]
    public string ClassName { get; set; }
    
    [LoadColumn(1)]
    public float LeftWeight { get; set; }
    
    [LoadColumn(2)]
    public float LeftDistance { get; set; }
    
    [LoadColumn(3)]
    public float RightWeight { get; set; }
    
    [LoadColumn(4)]
    public float RightDistance { get; set; }
}

In [59]:
var allData = context.Data.LoadFromTextFile<BalanceScaleData>("balance-scale.data", hasHeader: false, separatorChar: ',');
allData = context.Data.ShuffleRows(allData);





// var allData = context.Data.CreateEnumerable<BalanceScaleData>(allRows, reuseRowObject: false);
// var allDataView = context.Data.LoadFromEnumerable(allData);


In [60]:
var splitData = context.Data.TrainTestSplit(allDataView, testFraction: 0.2);
var (trainData, testData) = (splitData.TrainSet, splitData.TestSet);

In [61]:
var featureColumns = new[]
{
    nameof(BalanceScaleData.LeftWeight), nameof(BalanceScaleData.LeftDistance), nameof(BalanceScaleData.RightWeight), nameof(BalanceScaleData.RightDistance)
};

In [62]:
trainData.GetColumn<string>("Label").Distinct()

index,value
0,R
1,L
2,B


In [67]:
var pipeline = 
    context.Transforms.Conversion.MapValueToKey("Label")
//     .Append(context.Transforms.Conversion.MapValueToKey("Label"))
    .Append(context.Transforms.Conversion.MapKeyToValue("LabelValue", "Label"))
//     .Append(mapping)
//     mapping
    .Append(context.Transforms.Concatenate("Features", featureColumns));

In [68]:
var transformer = pipeline.Fit(trainData);

In [69]:
context.Data.CreateEnumerable<BalanceScaleData>(trainData, reuseRowObject: false).Take(5)

index,ClassName,LeftWeight,LeftDistance,RightWeight,RightDistance
0,R,3,3,5,2
1,R,1,2,2,3
2,L,3,2,2,1
3,R,3,1,3,4
4,R,4,4,4,5


In [70]:
class BalanceScaleDataTransformed
{
    [ColumnName("LabelValue")]
    public string Class { get; set; }

    [VectorType(84)]
    public float[] Features { get; set; }
}

var transformedData = transformer.Transform(trainData);
context.Data
    .CreateEnumerable<BalanceScaleDataTransformed>(transformedData, reuseRowObject: false)
    .Take(3)

index,Class,Features
0,R,"[ 3, 3, 5, 2 ]"
1,R,"[ 1, 2, 2, 3 ]"
2,L,"[ 3, 2, 2, 1 ]"


In [72]:
var estimator = context.MulticlassClassification.Trainers.LightGbm(featureColumnName: "Features");

In [73]:
var transformedTrainData = transformer.Transform(trainData);
var cvResults = context.MulticlassClassification.CrossValidate(transformedTrainData, estimator, numberOfFolds: 3);
var cvResult = cvResults
    .OrderByDescending(x => x.Metrics.MacroAccuracy)
    .First();

In [74]:
cvResult.Metrics.ConfusionMatrix.GetFormattedConfusionTable()


Confusion table
PREDICTED ||     R |     L |     B | Recall
        R ||    76 |     2 |     1 | 0.9620
        L ||     3 |    69 |     4 | 0.9079
        B ||     6 |     9 |     0 | 0.0000
Precision ||0.8941 |0.8625 |0.0000 |


In [50]:
cvResult.Metrics

LogLoss,LogLossReduction,MacroAccuracy,MicroAccuracy,TopKAccuracy,TopKPredictionCount,PerClassLogLoss,ConfusionMatrix
0.4222825276548844,0.5272953082704148,0.712987012987013,0.8941176470588236,0,0,"[ 0.16864074439802684, 0.14071722378574295, 3.924786544108495 ]","{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.9111111111111111, 0.9436619718309859, 0.3333333333333333 ], PerClassRecall: [ 0.9318181818181818, 0.9571428571428572, 0.25 ], Counts: [ [ 82, 2, 4 ], [ 1, 67, 2 ], [ 7, 2, 3 ] ], NumberOfClasses: 3 }"


In [51]:
var transformedTestData = transformer.Transform(testData);
var predictions = cvResult.Model.Transform(transformedTestData);
var metrics = context.MulticlassClassification.Evaluate(predictions);
metrics.ConfusionMatrix.GetFormattedConfusionTable()


Confusion table
PREDICTED ||     L |     R |     B | Recall
        L ||    46 |     1 |     3 | 0.9200
        R ||     2 |    51 |     7 | 0.8500
        B ||     3 |     5 |     1 | 0.1111
Precision ||0.9020 |0.8947 |0.0909 |


In [52]:
metrics

LogLoss,LogLossReduction,MacroAccuracy,MicroAccuracy,TopKAccuracy,TopKPredictionCount,PerClassLogLoss,ConfusionMatrix
0.6037802377794739,0.332739456487532,0.6270370370370371,0.8235294117647058,0,0,"[ 0.1537755646537661, 0.30461909427366946, 5.098213822960988 ]","{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.9019607843137255, 0.8947368421052632, 0.09090909090909091 ], PerClassRecall: [ 0.92, 0.85, 0.1111111111111111 ], Counts: [ [ 46, 1, 3 ], [ 2, 51, 7 ], [ 3, 5, 1 ] ], NumberOfClasses: 3 }"


In [57]:
class MulticlassClassificationPrediction
{
    public string LabelValue { get; set; }

    public float[] Score { get; set; }

    public string PredictedLabelValue { get; set; }
}

var sampleData = context.Data.ShuffleRows(testData);
var transformedSampleData = transformer.Transform(sampleData);

var samplePredictions = cvResult.Model.Transform(transformedSampleData);
var mapValues = context.Transforms.Conversion
    .MapKeyToValue("PredictedLabelValue", "PredictedLabel")
    .Append(context.Transforms.Conversion.MapKeyToValue("LabelValue", "Label"))
    .Fit(samplePredictions);
samplePredictions = mapValues.Transform(samplePredictions);
var samplePredictionItems = context.Data.CreateEnumerable<MulticlassClassificationPrediction>(samplePredictions, reuseRowObject: false);

samplePredictionItems.Take(5)

index,LabelValue,Score,PredictedLabelValue
0,R,"[ 7.887615E-05, 0.9788222, 0.021098927 ]",R
1,L,"[ 0.14832187, 0.83522844, 0.016449716 ]",R
2,B,"[ 0.17685778, 0.8230193, 0.00012290898 ]",R
3,R,"[ 0.00012063386, 0.99503136, 0.0048480364 ]",R
4,L,"[ 0.99997574, 1.8110366E-05, 6.158091E-06 ]",L
