In [1]:
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.LightGbm"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
// Names of the four balloon data files fetched from the UCI repository.
var files = new[]
{
    "adult+stretch.data",
    "adult-stretch.data",
    "yellow-small+adult-stretch.data",
    "yellow-small.data",
};

// Download each file once into the local "balloons" folder and report its size in lines.
Directory.CreateDirectory("balloons");
foreach (var name in files)
{
    var localPath = $"balloons/{name}";
    var alreadyDownloaded = File.Exists(localPath);
    if (!alreadyDownloaded)
    {
        using (var webClient = new WebClient())
        {
            webClient.DownloadFile($"https://archive.ics.uci.edu/ml/machine-learning-databases/balloons/{name}", localPath);
        }
    }

    var lineCount = File.ReadLines(localPath).Count();
    Console.WriteLine($"{name} has {lineCount:n0} lines");
}
// Cell result: the first five raw lines of the first file.
File.ReadLines($"balloons/{files[0]}").Take(5)

adult+stretch.data has 20 lines
adult-stretch.data has 20 lines
yellow-small+adult-stretch.data has 16 lines
yellow-small.data has 20 lines


index,value
0,"YELLOW,SMALL,STRETCH,ADULT,T"
1,"YELLOW,SMALL,STRETCH,ADULT,T"
2,"YELLOW,SMALL,STRETCH,CHILD,F"
3,"YELLOW,SMALL,DIP,ADULT,F"
4,"YELLOW,SMALL,DIP,CHILD,F"


># Data Set Information:
>
>There are four data sets representing different conditions of an experiment. All have the same attributes.
>
> a. adult-stretch.data Inflated is true if age=adult or act=stretch
>
> b. adult+stretch.data Inflated is true if age=adult and act=stretch
>
> c. yellow-small.data Inflated is true if (color=yellow and size = small)
>
> d. yellow-small+adult-stretch.data Inflated is true if (color=yellow and size = small) or (age=adult and act=stretch)

In [3]:
// Shared ML.NET context used by every later cell. A fixed seed is supplied so that
// the row shuffle, train/test split and cross-validation partitioning are repeatable
// between runs instead of changing each time the notebook is executed.
var context = new MLContext(seed: 0);

In [4]:
/// <summary>
/// One row of a balloons data file: four text attributes followed by the
/// text label, loaded by column position.
/// </summary>
class BalloonsData
{
    /// <summary>Balloon color, read from column 0.</summary>
    [LoadColumn(0)] public string Color { get; set; }

    /// <summary>Balloon size, read from column 1.</summary>
    [LoadColumn(1)] public string Size { get; set; }

    /// <summary>Action performed on the balloon, read from column 2.</summary>
    [LoadColumn(2)] public string Act { get; set; }

    /// <summary>Age group, read from column 3.</summary>
    [LoadColumn(3)] public string Age { get; set; }

    /// <summary>Raw inflated flag, read from column 4 and exposed as the "Label" column.</summary>
    [LoadColumn(4), ColumnName("Label")]
    public string Inflated { get; set; }
}

In [5]:
// Load every file matching balloons/*.data as header-less, comma-separated rows,
// then randomize the row order before splitting.
var loadedRows = context.Data.LoadFromTextFile<BalloonsData>(
    path: "balloons/*.data",
    separatorChar: ',',
    hasHeader: false);
var allData = context.Data.ShuffleRows(loadedRows);

In [6]:
// Hold out 20% of the rows for testing; the remainder is the training set.
var splitData = context.Data.TrainTestSplit(allData, testFraction: 0.2);
var trainData = splitData.TrainSet;
var testData = splitData.TestSet;

In [7]:
// Input columns used as model features; the label column (Inflated) is not included.
string[] featureColumns =
{
    nameof(BalloonsData.Color),
    nameof(BalloonsData.Size),
    nameof(BalloonsData.Act),
    nameof(BalloonsData.Age),
};

In [8]:
/// <summary>Projection that reads only the raw text "Label" column.</summary>
class BalloonsLabel
{
    public string Label { get; set; }
}

// Count how many rows of the full data set carry each raw label value, ordered by label.
var rawLabelRows = context.Data.CreateEnumerable<BalloonsLabel>(allData, reuseRowObject: false);
rawLabelRows
    .GroupBy(row => row.Label, (label, rows) => new { Key = label, Count = rows.Count() })
    .OrderBy(entry => entry.Key)

index,Key,Count
0,F,41
1,T,35


In [10]:
// Maps the raw label text to a boolean: "F" -> false, "T" -> true.
var labelLookup = new Dictionary<string, bool>
{
    { "F", false },
    { "T", true },
};

In [11]:
// Start from an empty chain and append one one-hot encoder per feature column.
var chain = new EstimatorChain<OneHotEncodingTransformer>();
var encoderChain = chain;
foreach (var column in featureColumns)
{
    encoderChain = encoderChain.Append(context.Transforms.Categorical.OneHotEncoding(column));
}

// Then map the text label through labelLookup (in place, on "Label") and
// concatenate the encoded feature columns into a single "Features" column.
var labelMapper = context.Transforms.Conversion.MapValue("Label", labelLookup, "Label");
var featureAssembler = context.Transforms.Concatenate("Features", featureColumns);
var pipeline = encoderChain.Append(labelMapper).Append(featureAssembler);

In [12]:
// Fit the preprocessing pipeline on the training split; the resulting transformer
// is reused later for both the training and the test rows.
var transformer = pipeline.Fit(trainData);

In [13]:
// Peek at the first five training rows as strongly typed BalloonsData objects.
var typedTrainRows = context.Data.CreateEnumerable<BalloonsData>(trainData, reuseRowObject: false);
typedTrainRows.Take(5)

index,Color,Size,Act,Age,Inflated
0,YELLOW,SMALL,DIP,CHILD,F
1,PURPLE,LARGE,DIP,CHILD,F
2,YELLOW,LARGE,DIP,CHILD,F
3,PURPLE,SMALL,STRETCH,ADULT,T
4,YELLOW,SMALL,DIP,CHILD,T


In [14]:
// Show the first five previewed training rows with every column of the data view,
// including ones not declared on BalloonsData.
trainData.Preview().RowView.Take(5)

index,Values
0,"[ Color: YELLOW, Size: SMALL, Act: DIP, Age: CHILD, Label: F, SamplingKeyColumn: 0.5956414 ]"
1,"[ Color: PURPLE, Size: LARGE, Act: DIP, Age: CHILD, Label: F, SamplingKeyColumn: 0.58837676 ]"
2,"[ Color: YELLOW, Size: LARGE, Act: DIP, Age: CHILD, Label: F, SamplingKeyColumn: 0.7536782 ]"
3,"[ Color: PURPLE, Size: SMALL, Act: STRETCH, Age: ADULT, Label: T, SamplingKeyColumn: 0.96748567 ]"
4,"[ Color: YELLOW, Size: SMALL, Act: DIP, Age: CHILD, Label: T, SamplingKeyColumn: 0.9295975 ]"


In [22]:
/// <summary>
/// Row shape after preprocessing: the boolean label and the concatenated feature vector.
/// </summary>
class BalloonsDataTransformed
{
    /// <summary>Mapped label, read from the "Label" column.</summary>
    [ColumnName("Label")]
    public bool Inflated { get; set; }

    /// <summary>Concatenated one-hot features; the vector length is declared as 8.</summary>
    [VectorType(8)]
    public float[] Features { get; set; }
}

// Apply the fitted preprocessing to the training rows and show the first five results.
var transformedData = transformer.Transform(trainData);
var transformedRows = context.Data.CreateEnumerable<BalloonsDataTransformed>(transformedData, reuseRowObject: false);
transformedRows.Take(5)

index,Inflated,Features
0,False,"[ 1, 0, 1, 0, 1, 0, 1, 0 ]"
1,False,"[ 0, 1, 0, 1, 1, 0, 1, 0 ]"
2,False,"[ 1, 0, 0, 1, 1, 0, 1, 0 ]"
3,True,"[ 0, 1, 1, 0, 0, 1, 0, 1 ]"
4,True,"[ 1, 0, 1, 0, 1, 0, 1, 0 ]"


In [16]:
/// <summary>Projection that reads only the "Label" column after it has been mapped to bool.</summary>
class BalloonsLabel
{
    public bool Label { get; set; }
}

// Count the preprocessed training rows per boolean label, ordered by label.
var mappedLabelRows = context.Data.CreateEnumerable<BalloonsLabel>(transformedData, reuseRowObject: false);
mappedLabelRows
    .GroupBy(row => row.Label, (label, rows) => new { Key = label, Count = rows.Count() })
    .OrderBy(entry => entry.Key)

index,Key,Count
0,False,37
1,True,28


In [27]:
// LightGBM binary-classification trainer reading its inputs from the "Features" column;
// all other trainer options are left at their defaults.
var estimator = context.BinaryClassification.Trainers.LightGbm(featureColumnName: "Features");

In [28]:
// Three-fold cross-validation of the trainer over the preprocessed training rows.
var cvResults = context.BinaryClassification.CrossValidate(transformedData, estimator, numberOfFolds: 3);

// Keep the fold with the highest accuracy (the earliest fold wins on ties).
var foldsByAccuracy = cvResults.OrderByDescending(fold => fold.Metrics.Accuracy);
var cvResult = foldsByAccuracy.First();

In [29]:
// Average each headline metric across all cross-validation folds.
new Dictionary<string, double>
{
    { "Accuracy", cvResults.Average(fold => fold.Metrics.Accuracy) },
    { "Area Under Roc Curve", cvResults.Average(fold => fold.Metrics.AreaUnderRocCurve) },
    { "F1 Score", cvResults.Average(fold => fold.Metrics.F1Score) },
}

key,value
Accuracy,0.6970426065162907
Area Under Roc Curve,0.8005266955266955
F1 Score,0.6591036414565825


In [30]:
// Preprocess the held-out rows, score them with the selected fold's model,
// and evaluate the binary-classification metrics on the result.
var transformedTestData = transformer.Transform(testData);
var bestFoldModel = cvResult.Model;
var predictions = bestFoldModel.Transform(transformedTestData);
context.BinaryClassification.Evaluate(data: predictions)

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
0.7167580733327071,0.2420554507303765,0.9456603046006402,0.8571428571428571,0.7272727272727273,0.8333333333333334,0.7142857142857143,0.6,0.75,0.7692307692307692,0.91797052154195,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.8333333333333334, 0.6 ], PerClassRecall: [ 0.7142857142857143, 0.75 ], Counts: [ [ 5, 2 ], [ 1, 3 ] ], NumberOfClasses: 2 }"


In [35]:
/// <summary>Per-row prediction output: the true label alongside the model's verdict.</summary>
class BinaryClassificationPrediction
{
    /// <summary>Actual label of the row.</summary>
    public bool Label { get; set; }

    /// <summary>Value of the "Probability" output column.</summary>
    public float Probability { get; set; }

    /// <summary>Label predicted by the model.</summary>
    public bool PredictedLabel { get; set; }
}

// Reshuffle the test rows and run them through the fitted preprocessing.
var sampleData = context.Data.ShuffleRows(testData);
var transformedSampleData = transformer.Transform(sampleData);

// Single-row prediction engine built on the selected cross-validation model.
var predictionEngine = context.Model.CreatePredictionEngine<BalloonsDataTransformed, BinaryClassificationPrediction>(cvResult.Model);

// Predict the first five preprocessed sample rows one at a time.
var sampleRows = context.Data.CreateEnumerable<BalloonsDataTransformed>(transformedSampleData, reuseRowObject: false);
sampleRows
    .Take(5)
    .Select(row => predictionEngine.Predict(row))

index,Label,Probability,PredictedLabel
0,True,0.6067279,True
1,True,0.8487241,True
2,True,0.68853253,True
3,False,0.30738977,False
4,True,0.8487241,True
