In [1]:
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.LightGbm"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
// Download the UCI "badges" data set once and cache it as badges.data in the
// working directory; later runs reuse the cached copy.
// HttpClient is used instead of WebClient, which is marked obsolete (SYSLIB0014).
// The response body is fetched completely before the file is created, so
// badges.data is only written after the download has succeeded.
if (!File.Exists("badges.data"))
{
    using var client = new System.Net.Http.HttpClient();
    var payload = await client.GetByteArrayAsync("https://archive.ics.uci.edu/ml/machine-learning-databases/badges/badges.data");
    File.WriteAllBytes("badges.data", payload);
}

// Preview the first five raw lines of the file.
File.ReadLines("badges.data").Take(5)

index,value
0,
1,+ Naoki Abe
2,- Myriam Abramson
3,+ David W. Aha
4,+ Kamal M. Ali


> # Data Set Information:

> Part of the problem in using an automated program to discover the unknown target function is to decide how to encode names such that the program can be used. The data below are presented in the form of a +/- label followed by the person's name. It is up to the learning-system user to decide how to convert this data into something usable by the system (e.g., what attributes to use if your favorite learner requires feature-vector data).

In [3]:
// One parsed line of the badges file: the +/- label and the extracted name parts.
// Each *Chars property exposes the matching string as an array holding the numeric
// code of every character, converted to float.
class ParsedRow
{
    // True when the line carried the '+' label.
    public bool IsPlus { get; set; }

    public string FirstName { get; set; }

    // Character codes of FirstName, one float per character.
    public float[] FirstNameChars => Array.ConvertAll(FirstName.ToCharArray(), ch => (float)ch);

    public string MiddleInitial { get; set; }

    // Character codes of MiddleInitial; empty when MiddleInitial is "".
    public float[] MiddleInitialChars => Array.ConvertAll(MiddleInitial.ToCharArray(), ch => (float)ch);

    public string LastName { get; set; }

    // Character codes of LastName, one float per character.
    public float[] LastNameChars => Array.ConvertAll(LastName.ToCharArray(), ch => (float)ch);
}

In [4]:
using System.Text.RegularExpressions;

// Returns true only when str is non-null, non-empty and its first character is '+'.
// Every other input (null, "", or any other leading character) yields false.
static bool IsPlus(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return false;
    }

    return str[0] == '+';
}

// Runs pattern against str and returns the value of group 1 when the match
// exposes exactly two groups (the whole match plus one capture group).
// In every other case the result is the empty string.
static string GetMatch(string str, string pattern)
{
    var match = Regex.Match(str, pattern);

    if (match.Groups.Count == 2)
    {
        return match.Groups[1].Value;
    }

    return "";
}

// Extracts the first name: the run of ASCII letters that follows the leading
// single character and a space, and is itself followed by a space.
static string GetFirstName(string str)
{
    const string firstNamePattern = @"^. ([a-zA-Z]+) ";
    return GetMatch(str, firstNamePattern);
}
    
// Extracts a middle initial: a single ASCII letter preceded by a space and
// followed by a period and a space.
static string GetMiddle(string str)
{
    const string middleInitialPattern = @" ([a-zA-Z]{1})\. ";
    return GetMatch(str, middleInitialPattern);
}

// Extracts the last name: the run of ASCII letters that follows a space and
// extends to the end of the string.
static string GetLastName(string str)
{
    const string lastNamePattern = @" ([a-zA-Z]+)$";
    return GetMatch(str, lastNamePattern);
}

In [5]:
// Spot check: the sample line starts with '-' rather than '+'.
IsPlus("- Ray Vernagus")

In [6]:
// Spot check: first-name extraction on a sample line without a middle initial.
GetFirstName("+ Ray Vernagus")

Ray

In [7]:
// Spot check: middle-initial extraction on a sample line that contains "W. ".
GetMiddle("+ Ray W. Vernagus")

W

In [8]:
// Spot check: last-name extraction takes the letters at the end of the line.
GetLastName("+ Ray Vernagus")

Vernagus

In [9]:
// Preview: skip the first line of the file, then parse the next five lines
// into ParsedRow objects so the extracted fields can be inspected.
(from line in File.ReadLines("badges.data").Skip(1).Take(5)
 select new ParsedRow
 {
     IsPlus = IsPlus(line),
     FirstName = GetFirstName(line),
     MiddleInitial = GetMiddle(line),
     LastName = GetLastName(line)
 })

index,IsPlus,FirstName,FirstNameChars,MiddleInitial,MiddleInitialChars,LastName,LastNameChars
0,True,Naoki,"[ 78, 97, 111, 107, 105 ]",,[ ],Abe,"[ 65, 98, 101 ]"
1,False,Myriam,"[ 77, 121, 114, 105, 97, 109 ]",,[ ],Abramson,"[ 65, 98, 114, 97, 109, 115, 111, 110 ]"
2,True,David,"[ 68, 97, 118, 105, 100 ]",W,[ 87 ],Aha,"[ 65, 104, 97 ]"
3,True,Kamal,"[ 75, 97, 109, 97, 108 ]",M,[ 77 ],Ali,"[ 65, 108, 105 ]"
4,False,Eric,"[ 69, 114, 105, 99 ]",,[ ],Allender,"[ 65, 108, 108, 101, 110, 100, 101, 114 ]"


In [10]:
// Parse every line after the first into a ParsedRow and materialize the result
// as a list so later cells can enumerate it repeatedly.
var rows = (
    from line in File.ReadLines("badges.data").Skip(1)
    select new ParsedRow
    {
        IsPlus = IsPlus(line),
        FirstName = GetFirstName(line),
        MiddleInitial = GetMiddle(line),
        LastName = GetLastName(line)
    }
).ToList();

In [11]:
// Length, in characters, of the longest extracted first name.
rows.Max(r => r.FirstName.Length)

In [12]:
// Length, in characters, of the longest extracted last name.
rows.Max(r => r.LastName.Length)

In [13]:
// Character codes of the first parsed row's first name.
rows.First().FirstNameChars

index,value
0,78
1,97
2,111
3,107
4,105


In [14]:
// Row type loaded into ML.NET: each name part is a fixed-length float vector of
// character codes, plus the boolean label. The notebook resizes every array to
// the declared length before constructing instances of this class.
class BadgesData
{
    // First-name character codes, fixed length 9.
    [VectorType(9)]
    public float[] FirstName { get; set; }
    
    // Middle-initial character code, fixed length 1.
    [VectorType(1)]
    public float[] MiddleInitial { get; set; }
    
    // Last-name character codes, fixed length 16.
    [VectorType(16)]
    public float[] LastName { get; set; }
    
    // The +/- label, exposed to ML.NET under the column name "Label".
    [ColumnName("Label")]
    public bool IsPlus { get; set; }
}

In [15]:
// Shape every parsed row into the fixed-width vectors declared on BadgesData.
// Array.Resize zero-pads arrays shorter than the requested width and truncates
// longer ones.
var data = rows.Select(row =>
{
    // Returns the given character codes forced to exactly `width` entries.
    float[] FitTo(float[] codes, int width)
    {
        Array.Resize(ref codes, width);
        return codes;
    }

    return new BadgesData
    {
        FirstName = FitTo(row.FirstNameChars, 9),
        MiddleInitial = FitTo(row.MiddleInitialChars, 1),
        LastName = FitTo(row.LastNameChars, 16),
        IsPlus = row.IsPlus,
    };
});

var context = new MLContext();

// Load the shaped rows into a data view and shuffle them.
var allData = context.Data.ShuffleRows(
    context.Data.LoadFromEnumerable<BadgesData>(data));

In [16]:
// Hold out 20% of the rows as a test set; the remainder is the training set.
var splitData = context.Data.TrainTestSplit(allData, testFraction: 0.2);
var trainData = splitData.TrainSet;
var testData = splitData.TestSet;

In [17]:
// Names of the BadgesData columns that are combined into the feature vector.
string[] featureColumns =
{
    nameof(BadgesData.FirstName),
    nameof(BadgesData.MiddleInitial),
    nameof(BadgesData.LastName),
};

In [18]:
// Concatenate the columns named in featureColumns into a single "Features" column.
var pipeline = context
    .Transforms.Concatenate("Features", featureColumns);

In [19]:
// Fit the concatenation pipeline on the training split.
var transformer = pipeline.Fit(trainData);

In [22]:
// Row type for reading data after the concatenation step: one 26-element
// feature vector (9 first-name + 1 middle-initial + 16 last-name entries)
// plus the label.
class BadgesDataTransformed
{
    // Concatenated character-code features, fixed length 26.
    [VectorType(26)]
    public float[] Features { get; set; }
    
    // The +/- label, read from the column named "Label".
    [ColumnName("Label")]
    public bool IsPlus { get; set; }
}

In [23]:
// Apply the fitted transformer to the training split and preview five rows
// materialized as BadgesDataTransformed objects.
var transformedData = transformer.Transform(trainData);
context.Data
    .CreateEnumerable<BadgesDataTransformed>(transformedData, reuseRowObject: false)
    .Take(5)

index,Features,IsPlus
0,"[ 83, 104, 97, 105, 0, 0, 0, 0, 0, 0 ... (16 more) ]",False
1,"[ 77, 97, 114, 116, 105, 110, 99, 104, 0, 0 ... (16 more) ]",True
2,"[ 87, 101, 101, 0, 0, 0, 0, 0, 0, 0 ... (16 more) ]",True
3,"[ 77, 105, 99, 104, 97, 101, 108, 0, 0, 0 ... (16 more) ]",True
4,"[ 69, 114, 105, 99, 0, 0, 0, 0, 0, 0 ... (16 more) ]",False


In [24]:
// LightGBM binary-classification trainer reading the "Features" column, learning rate 0.1.
var estimator = context.BinaryClassification.Trainers.LightGbm(featureColumnName: "Features", learningRate: 0.1);

In [25]:
// Run 3-fold cross-validation of the estimator on the transformed training split.
var transformedTrainData = transformer.Transform(trainData);
var cvResults = context.BinaryClassification.CrossValidate(transformedTrainData, estimator, numberOfFolds: 3);
// Keep the fold result with the highest accuracy; its model is used for
// scoring in later cells.
var cvResult = cvResults
    .OrderByDescending(x => x.Metrics.Accuracy)
    .First();

In [26]:
// Cross-validation metrics averaged over all folds, keyed by display name.
new Dictionary<string, double>
{
    { "Accuracy", cvResults.Average(fold => fold.Metrics.Accuracy) },
    { "Area Under Roc Curve", cvResults.Average(fold => fold.Metrics.AreaUnderRocCurve) },
    { "F1 Score", cvResults.Average(fold => fold.Metrics.F1Score) },
}

key,value
Accuracy,0.8612867610157583
Area Under Roc Curve,0.9242610920646902
F1 Score,0.9014086404472624


In [27]:
// Score the held-out test split with the model from the selected fold and
// compute binary-classification metrics on the predictions.
var transformedTestData = transformer.Transform(testData);
var predictions = cvResult.Model.Transform(transformedTestData);
var metrics = context.BinaryClassification.Evaluate(predictions);
metrics

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
0.4874411375003971,0.2915590130277461,0.6880476235340796,0.9166666666666666,0.8163265306122449,0.9696969696969696,0.8,0.5,0.8888888888888888,0.8767123287671234,0.9818756998615734,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.9696969696969697, 0.5 ], PerClassRecall: [ 0.8, 0.8888888888888888 ], Counts: [ [ 32, 8 ], [ 1, 8 ] ], NumberOfClasses: 2 }"


In [28]:
// Render the test-set confusion matrix as a formatted text table.
metrics.ConfusionMatrix.GetFormattedConfusionTable()

TEST POSITIVE RATIO:	0.8163 (40.0/(40.0+9.0))
Confusion table
PREDICTED || positive | negative | Recall
 positive ||       32 |        8 | 0.8000
 negative ||        1 |        8 | 0.8889
Precision ||   0.9697 |   0.5000 |


In [41]:
// Output row type for the prediction engine: the input label together with the
// model's probability and predicted label.
class BinaryClassificationPrediction
{
    // Label carried through from the input row.
    public bool Label { get; set; }

    // Probability reported by the model for the row.
    public float Probability { get; set; }

    // Label predicted by the model for the row.
    public bool PredictedLabel { get; set; }
}

// Shuffle the test split and run it through the fitted concatenation step.
var sampleData = context.Data.ShuffleRows(testData);
var transformedSampleData = transformer.Transform(sampleData);

// Single-row prediction engine built on the model from the selected fold.
var predictionEngine = context.Model
    .CreatePredictionEngine<BadgesDataTransformed, BinaryClassificationPrediction>(cvResult.Model);

// Predict the first five shuffled rows one at a time.
context.Data
    .CreateEnumerable<BadgesDataTransformed>(transformedSampleData, reuseRowObject: false)
    .Take(5)
    .Select(example => predictionEngine.Predict(example))
index,Label,Probability,PredictedLabel
0,False,0.033191886,False
1,True,0.99590755,True
2,True,0.97195894,True
3,True,0.8905027,True
4,False,0.08796865,False
