In [1]:
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.LightGbm"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
// Download the UCI breast-cancer dataset only when no local copy exists yet.
if (File.Exists("breast-cancer.data") == false)
{
    using (var downloader = new WebClient())
    {
        downloader.DownloadFile(
            address: "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data",
            fileName: "breast-cancer.data");
    }
}

// Print the number of lines in the file, then show the first five raw lines as the cell result.
var lineCount = File.ReadLines("breast-cancer.data").Count();
Console.WriteLine($"Data file has {lineCount:n0} lines");
File.ReadLines("breast-cancer.data").Take(5)

Data file has 286 lines


index,value
0,"no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no"
1,"no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no"
2,"no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no"
3,"no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no"
4,"no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no"


In [3]:
// Shared ML.NET context used by every later cell for loading, transforming and training.
MLContext context = new MLContext();

In [36]:
/// <summary>
/// One raw row of breast-cancer.data. Each property is bound to a column of the
/// file through the zero-based index given in its LoadColumn attribute.
/// </summary>
class BreastCancerData
{
    // Column 0: outcome text ("recurrence-events" / "no-recurrence-events").
    [LoadColumn(0)] public string Class { get; set; }

    // Columns 1-5: categorical values kept as text.
    [LoadColumn(1)] public string Age { get; set; }
    [LoadColumn(2)] public string Menopause { get; set; }
    [LoadColumn(3)] public string TumorSize { get; set; }
    [LoadColumn(4)] public string InvNodes { get; set; }
    [LoadColumn(5)] public string NodeCaps { get; set; }

    // Column 6: the only column loaded as an integer.
    [LoadColumn(6)] public int DegMalig { get; set; }

    // Columns 7-9: categorical values kept as text.
    [LoadColumn(7)] public string Breast { get; set; }
    [LoadColumn(8)] public string BreastQuad { get; set; }
    [LoadColumn(9)] public string Irradiat { get; set; }
}

/// <summary>
/// Row shape used when reading transformed data back out of an IDataView:
/// the raw columns from <see cref="BreastCancerData"/> plus the boolean label,
/// the encoded counterparts of the text columns and the combined feature vector.
/// </summary>
class BreastCancerTrainData : BreastCancerData
{
    // Boolean training label.
    public bool Label { get; set; }

    // Unsigned-integer encodings, one per categorical text column.
    public uint AgeEncoded { get; set; }
    public uint MenopauseEncoded { get; set; }
    public uint TumorSizeEncoded { get; set; }
    public uint InvNodesEncoded { get; set; }
    public uint NodeCapsEncoded { get; set; }
    public uint BreastEncoded { get; set; }
    public uint BreastQuadEncoded { get; set; }
    public uint IrradiatEncoded { get; set; }

    // Encoded columns gathered into a single vector.
    public uint[] Features { get; set; }
}

In [28]:
// Read the comma-separated file (no header row) and shuffle the rows in a single expression.
var allData = context.Data.ShuffleRows(
    context.Data.LoadFromTextFile<BreastCancerData>(
        path: "breast-cancer.data",
        separatorChar: ',',
        hasHeader: false));

// Cell result: the first five rows after shuffling.
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false).Take(5)

index,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,BreastQuad,Irradiat
0,recurrence-events,30-39,premeno,0-4,0-2,no,2,right,central,no
1,no-recurrence-events,30-39,premeno,30-34,6-8,yes,2,right,right_up,no
2,no-recurrence-events,30-39,premeno,40-44,0-2,no,2,right,right_up,no
3,recurrence-events,40-49,premeno,30-34,0-2,yes,3,right,right_up,no
4,no-recurrence-events,30-39,premeno,15-19,0-2,no,1,left,left_low,no


In [29]:
// Cell result: the distinct values found in the Class column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.Class)
    .Distinct()

index,value
0,recurrence-events
1,no-recurrence-events


In [30]:
// Row mapping used by the custom transform: copies every raw column to the output
// row unchanged and derives the boolean Label from the textual Class column.
Action<BreastCancerData, BreastCancerTrainData> mapping = (input, output) =>
{
    output.Class = input.Class;
    output.Age = input.Age;
    output.Menopause = input.Menopause;
    output.TumorSize = input.TumorSize;
    output.InvNodes = input.InvNodes;
    output.NodeCaps = input.NodeCaps;
    output.DegMalig = input.DegMalig;
    output.Breast = input.Breast;
    output.BreastQuad = input.BreastQuad;
    output.Irradiat = input.Irradiat;
    output.Label =
        input.Class switch
        {
            "no-recurrence-events" => false,
            "recurrence-events" => true,
            // Without a default arm an unexpected value surfaces as a bare
            // SwitchExpressionException; throw its base type with the offending value instead.
            _ => throw new InvalidOperationException(
                $"Unexpected Class value '{input.Class}'; expected 'no-recurrence-events' or 'recurrence-events'.")
        };
};

// contractName is null, so this transform is not set up to be saved with a model.
var mapLabel = context.Transforms.CustomMapping(mapping, contractName: null);
var transformedData = mapLabel.Fit(allData).Transform(allData);

// Cell result: Class next to the derived Label for the first five rows.
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false)
    .Select(x => new { Class=x.Class, Label=x.Label })
    .Take(5)

index,Class,Label
0,recurrence-events,True
1,no-recurrence-events,False
2,no-recurrence-events,False
3,recurrence-events,True
4,no-recurrence-events,False


In [41]:
// Random number generator with the default seed.
Random rand = new Random();

In [31]:
// Cell result: the distinct values found in the Age column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.Age)
    .Distinct()

index,value
0,30-39
1,40-49
2,50-59
3,60-69
4,70-79
5,20-29


In [45]:
// Numeric codes for the age brackets, assigned in ascending bracket order.
var ageLookup = new Dictionary<string, uint>
{
    { "20-29", 1U },
    { "30-39", 2U },
    { "40-49", 3U },
    { "50-59", 4U },
    { "60-69", 5U },
    { "70-79", 6U },
};

// Estimator that writes the looked-up code for "Age" into "AgeEncoded".
var encodeAge = context.Transforms.Conversion.MapValue(
    outputColumnName: "AgeEncoded",
    keyValuePairs: ageLookup,
    inputColumnName: "Age");
var transformedData = encodeAge.Fit(allData).Transform(allData);

// Cell result: every distinct (Age, AgeEncoded) pair, as a check of the lookup.
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true)
    .Select(row => new { row.Age, row.AgeEncoded })
    .Distinct()

index,Age,AgeEncoded
0,30-39,2
1,40-49,3
2,50-59,4
3,60-69,5
4,70-79,6
5,20-29,1


In [9]:
// Cell result: the distinct values found in the Menopause column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.Menopause)
    .Distinct()

index,value
0,ge40
1,premeno
2,lt40


In [44]:
// Numeric codes for the three menopause categories.
var menopauseLookup = new Dictionary<string, uint>
{
    { "lt40", 1U },
    { "ge40", 2U },
    { "premeno", 3U },
};

// Estimator that writes the looked-up code for "Menopause" into "MenopauseEncoded".
var encodeMenopause = context.Transforms.Conversion.MapValue(
    outputColumnName: "MenopauseEncoded",
    keyValuePairs: menopauseLookup,
    inputColumnName: "Menopause");
var transformedData = encodeMenopause.Fit(allData).Transform(allData);

// Cell result: every distinct (Menopause, MenopauseEncoded) pair, as a check of the lookup.
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true)
    .Select(row => new { row.Menopause, row.MenopauseEncoded })
    .Distinct()

index,Menopause,MenopauseEncoded
0,premeno,3
1,ge40,2
2,lt40,1


In [10]:
// Cell result: the distinct values found in the TumorSize column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.TumorSize)
    .Distinct()

index,value
0,30-34
1,15-19
2,25-29
3,20-24
4,5-9
5,35-39
6,40-44
7,10-14
8,0-4
9,50-54


In [48]:
// Numeric codes for the tumor-size brackets, assigned in ascending bracket order.
var tumorSizeLookup = new Dictionary<string, uint>
{
    { "0-4", 1U },
    { "5-9", 2U },
    { "10-14", 3U },
    { "15-19", 4U },
    { "20-24", 5U },
    { "25-29", 6U },
    { "30-34", 7U },
    { "35-39", 8U },
    { "40-44", 9U },
    { "45-49", 10U },
    { "50-54", 11U },
};

// Estimator that writes the looked-up code for "TumorSize" into "TumorSizeEncoded".
var encodeTumorSize = context.Transforms.Conversion.MapValue(
    outputColumnName: "TumorSizeEncoded",
    keyValuePairs: tumorSizeLookup,
    inputColumnName: "TumorSize");
var transformedData = encodeTumorSize.Fit(allData).Transform(allData);

// Cell result: every distinct (TumorSize, TumorSizeEncoded) pair, as a check of the lookup.
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true)
    .Select(row => new { row.TumorSize, row.TumorSizeEncoded })
    .Distinct()

index,TumorSize,TumorSizeEncoded
0,0-4,1
1,30-34,7
2,40-44,9
3,15-19,4
4,10-14,3
5,20-24,5
6,25-29,6
7,35-39,8
8,50-54,11
9,45-49,10


In [11]:
// Cell result: the distinct values found in the InvNodes column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.InvNodes)
    .Distinct()

index,value
0,9-11
1,0-2
2,6-8
3,15-17
4,12-14
5,3-5
6,24-26


In [12]:
// Cell result: the distinct values found in the NodeCaps column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.NodeCaps)
    .Distinct()

index,value
0,?
1,no
2,yes


In [13]:
// Cell result: the distinct values found in the Breast column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.Breast)
    .Distinct()

index,value
0,left
1,right


In [14]:
// Cell result: the distinct values found in the BreastQuad column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.BreastQuad)
    .Distinct()

index,value
0,left_up
1,?
2,left_low
3,right_up
4,central
5,right_low


In [15]:
// Cell result: the distinct values found in the Irradiat column.
(from row in context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
 select row.Irradiat)
    .Distinct()

index,value
0,yes
1,no


In [47]:
// Map each categorical text column to a key column named "<Column>Encoded".
// Keys are ordered by value and the original text is attached as key-value annotations.
// "Age" is not included in this mapping.
var encodeCategorical = context.Transforms.Conversion.MapValueToKey(
    new[]
    {
        new InputOutputColumnPair(outputColumnName: "MenopauseEncoded", inputColumnName: "Menopause"),
        new InputOutputColumnPair(outputColumnName: "TumorSizeEncoded", inputColumnName: "TumorSize"),
        new InputOutputColumnPair(outputColumnName: "InvNodesEncoded", inputColumnName: "InvNodes"),
        new InputOutputColumnPair(outputColumnName: "NodeCapsEncoded", inputColumnName: "NodeCaps"),
        new InputOutputColumnPair(outputColumnName: "BreastEncoded", inputColumnName: "Breast"),
        new InputOutputColumnPair(outputColumnName: "BreastQuadEncoded", inputColumnName: "BreastQuad"),
        new InputOutputColumnPair(outputColumnName: "IrradiatEncoded", inputColumnName: "Irradiat"),
    },
    keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue,
    addKeyValueAnnotationsAsText: true);

var transformedData = encodeCategorical
    .Fit(allData)
    .Transform(allData);

// Cell result: the first five rows; columns this transform does not produce keep their default values.
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true).Take(5)

index,Label,AgeEncoded,MenopauseEncoded,TumorSizeEncoded,InvNodesEncoded,NodeCapsEncoded,BreastEncoded,BreastQuadEncoded,IrradiatEncoded,Features,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,BreastQuad,Irradiat
0,False,0,3,1,1,2,2,2,1,<null>,recurrence-events,30-39,premeno,0-4,0-2,no,2,right,central,no
1,False,0,3,6,6,3,2,6,1,<null>,no-recurrence-events,30-39,premeno,30-34,6-8,yes,2,right,right_up,no
2,False,0,3,8,1,2,2,6,1,<null>,no-recurrence-events,30-39,premeno,40-44,0-2,no,2,right,right_up,no
3,False,0,3,6,1,3,2,6,1,<null>,recurrence-events,40-49,premeno,30-34,0-2,yes,3,right,right_up,no
4,False,0,3,3,1,2,1,3,1,<null>,no-recurrence-events,30-39,premeno,15-19,0-2,no,1,left,left_low,no


In [50]:
// Encoded columns that are concatenated into the "Features" vector.
string[] featureColumns = { "AgeEncoded", "MenopauseEncoded", "TumorSizeEncoded" };

// Preprocessing chain: label mapping, the three lookup encodings, then feature assembly.
var pipeline = mapLabel
    .Append(encodeAge)
    .Append(encodeMenopause)
    .Append(encodeTumorSize)
    .Append(context.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: featureColumns));

// Fit and apply the chain to the full dataset to preview its output.
var transformedData = pipeline
    .Fit(allData)
    .Transform(allData);

// Cell result: up to fifteen transformed rows.
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true).Take(15)

index,Label,AgeEncoded,MenopauseEncoded,TumorSizeEncoded,InvNodesEncoded,NodeCapsEncoded,BreastEncoded,BreastQuadEncoded,IrradiatEncoded,Features,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,BreastQuad,Irradiat
0,True,2,3,1,0,0,0,0,0,"[ 2, 3, 1 ]",recurrence-events,30-39,premeno,0-4,0-2,no,2,right,central,no
1,False,2,3,7,0,0,0,0,0,"[ 2, 3, 7 ]",no-recurrence-events,30-39,premeno,30-34,6-8,yes,2,right,right_up,no
2,False,2,3,9,0,0,0,0,0,"[ 2, 3, 9 ]",no-recurrence-events,30-39,premeno,40-44,0-2,no,2,right,right_up,no
3,True,3,3,7,0,0,0,0,0,"[ 3, 3, 7 ]",recurrence-events,40-49,premeno,30-34,0-2,yes,3,right,right_up,no
4,False,2,3,4,0,0,0,0,0,"[ 2, 3, 4 ]",no-recurrence-events,30-39,premeno,15-19,0-2,no,1,left,left_low,no
5,False,4,2,1,0,0,0,0,0,"[ 4, 2, 1 ]",no-recurrence-events,50-59,ge40,0-4,0-2,no,1,right,central,no
6,False,3,3,3,0,0,0,0,0,"[ 3, 3, 3 ]",no-recurrence-events,40-49,premeno,10-14,0-2,no,1,right,right_low,no
7,False,3,3,5,0,0,0,0,0,"[ 3, 3, 5 ]",no-recurrence-events,40-49,premeno,20-24,0-2,no,3,right,left_low,yes
8,False,4,3,4,0,0,0,0,0,"[ 4, 3, 4 ]",no-recurrence-events,50-59,premeno,15-19,0-2,no,2,right,left_low,no
9,False,5,2,4,0,0,0,0,0,"[ 5, 2, 4 ]",no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_low,no


In [79]:
// Hold out 20% of the rows as the test set; the rest becomes the training set.
var splitData = context.Data.TrainTestSplit(allData, testFraction: 0.2);
var trainData = splitData.TrainSet;
var testData = splitData.TestSet;

In [80]:
// Fit the preprocessing pipeline on the training split only.
var transformer = pipeline.Fit(trainData);

In [81]:
// SDCA logistic-regression trainer that reads its inputs from the "Features" column.
// NOTE(review): "Features" is concatenated from uint-encoded columns — confirm the trainer accepts that column type.
var estimator = context.BinaryClassification.Trainers.SdcaLogisticRegression(featureColumnName: "Features");

In [None]:
// Apply the fitted preprocessing (label mapping, encodings, feature concatenation) to the training rows.
var transformedTrainData = transformer.Transform(trainData);

// "Features" is concatenated from uint-encoded columns, but the SDCA trainer validates that its
// feature column is a vector of Single and rejects anything else. Convert the vector inside the
// cross-validated chain so every fold trains and scores on float features.
var trainingChain = context.Transforms.Conversion
    .ConvertType("Features", outputKind: DataKind.Single)
    .Append(estimator);

// Three-fold cross-validation over the training split.
var cvResults = context.BinaryClassification.CrossValidate(transformedTrainData, trainingChain, numberOfFolds: 3);

// Keep the fold with the highest accuracy.
var cvResult = cvResults
    .OrderByDescending(x => x.Metrics.Accuracy)
    .First();