In [1]:
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.LightGbm"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
if (!File.Exists("breast-cancer.data"))
{
    using var client = new WebClient();
    client.DownloadFile("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data", "breast-cancer.data");
}

Console.WriteLine($"Data file has {File.ReadLines("breast-cancer.data").Count():n0} lines");
File.ReadLines("breast-cancer.data").Take(5)

Data file has 286 lines


index,value
0,"no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no"
1,"no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no"
2,"no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no"
3,"no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no"
4,"no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no"


In [3]:
var context = new MLContext();

In [69]:
class BreastCancerData
{
    [LoadColumn(0)]
    public string Class { get; set; }

    [LoadColumn(1)]
    public string Age { get; set; }

    [LoadColumn(2)]
    public string Menopause { get; set; }

    [LoadColumn(3)]
    public string TumorSize { get; set; }

    [LoadColumn(4)]
    public string InvNodes { get; set; }

    [LoadColumn(5)]
    public string NodeCaps { get; set; }

    [LoadColumn(6)]
    public int DegMalig { get; set; }

    [LoadColumn(7)]
    public string Breast { get; set; }
    
    [LoadColumn(8)]
    public string BreastQuad { get; set; }
    
    [LoadColumn(9)]
    public string Irradiat { get; set; }
}

class BreastCancerTrainData : BreastCancerData
{
    public bool Label { get; set; }
    
    public uint AgeEncoded { get; set; }
    
    public uint MenopauseEncoded { get; set; }
    
    public uint TumorSizeEncoded { get; set; }
    
    public uint InvNodesEncoded { get; set; }
    
    public uint NodeCapsEncoded { get; set; }
    
    public uint BreastEncoded { get; set; }
    
    public uint BreastQuadEncoded { get; set; }
    
    public uint IrradiatEncoded { get; set; }
}

In [5]:
var allData = context.Data.LoadFromTextFile<BreastCancerData>("breast-cancer.data", hasHeader: false, separatorChar: ',');
allData = context.Data.ShuffleRows(allData);
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false).Take(5)

index,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,BreastQuad,Irradiat
0,no-recurrence-events,40-49,premeno,35-39,0-2,no,1,left,left_low,no
1,no-recurrence-events,40-49,premeno,15-19,12-14,no,3,right,right_low,yes
2,no-recurrence-events,40-49,premeno,25-29,0-2,no,2,right,left_low,no
3,no-recurrence-events,50-59,lt40,15-19,0-2,no,2,left,left_low,no
4,no-recurrence-events,40-49,premeno,40-44,0-2,no,2,right,left_low,no


In [6]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.Class)
    .Distinct()

index,value
0,no-recurrence-events
1,recurrence-events


In [7]:
Action<BreastCancerData, BreastCancerTrainData> mapping = (input, output) =>
{
    output.Class = input.Class;
    output.Age = input.Age;
    output.Menopause = input.Menopause;
    output.TumorSize = input.TumorSize;
    output.InvNodes = input.InvNodes;
    output.NodeCaps = input.NodeCaps;
    output.DegMalig = input.DegMalig;
    output.Breast = input.Breast;
    output.BreastQuad = input.BreastQuad;
    output.Irradiat = input.Irradiat;
    output.Label = 
        input.Class switch
        {
            "no-recurrence-events" => false,
            "recurrence-events" => true
        };
};
var mapLabel = context.Transforms.CustomMapping(mapping, contractName: null);
var transformedData = mapLabel.Fit(allData).Transform(allData);
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false).Take(5)

index,Label,AgeEncoded,MenopauseEncoded,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,BreastQuad,Irradiat
0,False,<null>,<null>,no-recurrence-events,40-49,premeno,35-39,0-2,no,1,left,left_low,no
1,False,<null>,<null>,no-recurrence-events,40-49,premeno,15-19,12-14,no,3,right,right_low,yes
2,False,<null>,<null>,no-recurrence-events,40-49,premeno,25-29,0-2,no,2,right,left_low,no
3,False,<null>,<null>,no-recurrence-events,50-59,lt40,15-19,0-2,no,2,left,left_low,no
4,False,<null>,<null>,no-recurrence-events,40-49,premeno,40-44,0-2,no,2,right,left_low,no
5,False,<null>,<null>,no-recurrence-events,40-49,premeno,10-14,0-2,no,1,right,left_up,no
6,False,<null>,<null>,no-recurrence-events,50-59,ge40,10-14,0-2,no,1,left,left_low,no
7,False,<null>,<null>,no-recurrence-events,40-49,premeno,45-49,0-2,no,2,left,left_low,yes
8,False,<null>,<null>,no-recurrence-events,40-49,premeno,10-14,0-2,no,1,right,left_up,no
9,False,<null>,<null>,no-recurrence-events,60-69,ge40,25-29,0-2,no,2,left,left_low,no


In [58]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.Age)
    .Distinct()

index,value
0,40-49
1,50-59
2,60-69
3,30-39
4,70-79
5,20-29


In [10]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.Menopause)
    .Distinct()

index,value
0,premeno
1,lt40
2,ge40


In [13]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.TumorSize)
    .Distinct()

index,value
0,35-39
1,15-19
2,25-29
3,40-44
4,10-14
5,45-49
6,0-4
7,30-34
8,20-24
9,5-9


In [66]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.InvNodes)
    .Distinct()

index,value
0,0-2
1,12-14
2,3-5
3,15-17
4,9-11
5,6-8
6,24-26


In [70]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.NodeCaps)
    .Distinct()

index,value
0,no
1,yes
2,?


In [72]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.Breast)
    .Distinct()

index,value
0,left
1,right


In [74]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.BreastQuad)
    .Distinct()

index,value
0,left_low
1,right_low
2,left_up
3,central
4,right_up
5,?


In [77]:
context.Data.CreateEnumerable<BreastCancerData>(allData, reuseRowObject: false)
    .Select(x => x.Irradiat)
    .Distinct()

index,value
0,no
1,yes


In [78]:
var encodeCategorical = context.Transforms.Conversion.MapValueToKey(
    new[] {
        new  InputOutputColumnPair("AgeEncoded", "Age"),
        new  InputOutputColumnPair("MenopauseEncoded", "Menopause"),
        new  InputOutputColumnPair("TumorSizeEncoded", "TumorSize"),
        new  InputOutputColumnPair("InvNodesEncoded", "InvNodes"),
        new  InputOutputColumnPair("NodeCapsEncoded", "NodeCaps"),
        new  InputOutputColumnPair("BreastEncoded", "Breast"),
        new  InputOutputColumnPair("BreastQuadEncoded", "BreastQuad"),
        new  InputOutputColumnPair("IrradiatEncoded", "Irradiat"),
    },
    keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue,
    addKeyValueAnnotationsAsText: true
);

var transformedData = encodeCategorical.Fit(allData).Transform(allData);
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true).Take(15)

index,Label,AgeEncoded,MenopauseEncoded,TumorSizeEncoded,InvNodesEncoded,NodeCapsEncoded,BreastEncoded,BreastQuadEncoded,IrradiatEncoded,Class,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,BreastQuad,Irradiat
0,False,3,3,7,1,2,1,3,1,no-recurrence-events,40-49,premeno,35-39,0-2,no,1,left,left_low,no
1,False,3,3,3,2,2,2,5,2,no-recurrence-events,40-49,premeno,15-19,12-14,no,3,right,right_low,yes
2,False,3,3,5,1,2,2,3,1,no-recurrence-events,40-49,premeno,25-29,0-2,no,2,right,left_low,no
3,False,4,2,3,1,2,1,3,1,no-recurrence-events,50-59,lt40,15-19,0-2,no,2,left,left_low,no
4,False,3,3,8,1,2,2,3,1,no-recurrence-events,40-49,premeno,40-44,0-2,no,2,right,left_low,no
5,False,3,3,2,1,2,2,4,1,no-recurrence-events,40-49,premeno,10-14,0-2,no,1,right,left_up,no
6,False,4,1,2,1,2,1,3,1,no-recurrence-events,50-59,ge40,10-14,0-2,no,1,left,left_low,no
7,False,3,3,9,1,2,1,3,2,no-recurrence-events,40-49,premeno,45-49,0-2,no,2,left,left_low,yes
8,False,3,3,2,1,2,2,4,1,no-recurrence-events,40-49,premeno,10-14,0-2,no,1,right,left_up,no
9,False,5,1,5,1,2,1,3,1,no-recurrence-events,60-69,ge40,25-29,0-2,no,2,left,left_low,no


In [83]:
var featureColumns = new[]
{
    "AgeEncoded", "MenopauseEncoded", "TumorSizeEncoded", "InvNodesEncoded", "NodeCapsEncoded", "BreastEncoded", "BreastQuadEncoded",
    "IrradiatEncoded", "DegMalig"
};
var pipeline = mapLabel.Append(encodeCategorical).Append(context.Transforms.Concatenate("Features", featureColumns));
var transformedData = pipeline.Fit(allData).Transform(allData);
context.Data.CreateEnumerable<BreastCancerTrainData>(transformedData, reuseRowObject: false, ignoreMissingColumns: true).Take(15)

Unhandled exception: System.InvalidOperationException: Column 'AgeEncoded' is key. Concatenation of keys is unsupported.
   at Microsoft.ML.Transforms.ColumnConcatenatingEstimator.CheckInputsAndMakeColumn(SchemaShape inputSchema, String name, String[] sources)
   at Microsoft.ML.Transforms.ColumnConcatenatingEstimator.GetOutputSchema(SchemaShape inputSchema)
   at Microsoft.ML.Data.EstimatorChain`1.GetOutputSchema(SchemaShape inputSchema)
   at Microsoft.ML.Data.EstimatorChain`1.Fit(IDataView input)
   at Submission#86.<<Initialize>>d__0.MoveNext()
--- End of stack trace from previous location where exception was thrown ---
   at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)

In [79]:
var splitData = context.Data.TrainTestSplit(allData, testFraction: 0.2);
var (trainData, testData) = (splitData.TrainSet, splitData.TestSet);

In [80]:
var transformer = pipeline.Fit(trainData);

In [81]:
var estimator = context.BinaryClassification.Trainers.SdcaLogisticRegression(featureColumnName: "Features");

In [None]:
var transformedTrainData = transformer.Transform(trainData);
var cvResults = context.BinaryClassification.CrossValidate(transformedTrainData, estimator, numberOfFolds: 3);
var cvResult = cvResults
    .OrderByDescending(x => x.Metrics.Accuracy)
    .First();