In [1]:
#r "nuget:Microsoft.Data.Analysis,0.2.0"
#r "nuget:Microsoft.ML,1.4.0"
#r "nuget:Microsoft.ML.AutoML,0.16.0"

In [2]:
using Microsoft.Data.Analysis;
using XPlot.Plotly;

### Introduction to DataFrame
https://devblogs.microsoft.com/dotnet/an-introduction-to-dataframe/

In [3]:
using Microsoft.AspNetCore.Html;
Formatter<DataFrame>.Register((df, writer) =>
{
    var headers = new List<IHtmlContent>();
    headers.Add(th(i("index")));
    headers.AddRange(df.Columns.Select(c => (IHtmlContent) th(c.Name)));
    var rows = new List<List<IHtmlContent>>();
    var take = 20;
    for (var i = 0; i < Math.Min(take, df.Rows.Count); i++)
    {
        var cells = new List<IHtmlContent>();
        cells.Add(td(i));
        foreach (var obj in df.Rows[i])
        {
            cells.Add(td(obj));
        }
        rows.Add(cells);
    }
    
    var t = table(
        thead(
            headers),
        tbody(
            rows.Select(
                r => tr(r))));
    
    writer.Write(t);
}, "text/html");

In [51]:
string fileName = @"D:\softaware\samples-ml\data\tablesoccer-export.csv";

var tablesoccerData = DataFrame.LoadCsv(fileName);

In [52]:
tablesoccerData.Description()

index,Description,Hour,GoalsTeam1,GoalsTeam2,GoalDifference,Result
0,Length (excluding null values),611.0,611.0,611.0,611.0,611.0
1,Max,18.0,10.0,10.0,10.0,2.0
2,Min,1.0,0.0,0.0,-8.0,1.0
3,Mean,7.759411,3.6841245,3.5040917,0.18003273,1.4942716


In [None]:
display(tablesoccerData["GoalDifference"].Mean());
display(tablesoccerData["GoalDifference"].Median());


tablesoccerData["GoalDifference"].ValueCounts()

In [None]:
tablesoccerData

In [None]:
Chart.Plot(
    new Graph.Histogram()
    {
        x = tablesoccerData["Hour"]
    }
)

In [None]:
Chart.Plot(
    new Graph.Histogram() {
        x = tablesoccerData["GoalsTeam2"]
    }
)

In [53]:
tablesoccerData = tablesoccerData.Filter(tablesoccerData["GoalsTeam1"].ElementwiseLessThanOrEqual(5));
tablesoccerData = tablesoccerData.Filter(tablesoccerData["GoalsTeam2"].ElementwiseLessThanOrEqual(5));

display(tablesoccerData.Rows.Count);

In [None]:
tablesoccerData.Columns.Remove("GoalsTeam1");
tablesoccerData.Columns.Remove("GoalsTeam2");
tablesoccerData.Columns.Remove("Result");

In [7]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.AutoML;

In [54]:
static T[] Shuffle<T>(T[] array)
{
    Random rand = new Random();
    for (int i = 0; i < array.Length; i++)
    {
        int r = i + rand.Next(array.Length - i);
        T temp = array[r];
        array[r] = array[i];
        array[i] = temp;
    }
    return array;
}

int[] randomIndices = Shuffle(Enumerable.Range(0, (int)tablesoccerData.Rows.Count).ToArray());
int testSize = (int)(tablesoccerData.Rows.Count * .3);
int[] trainRows = randomIndices[testSize..];
int[] testRows = randomIndices[..testSize];

DataFrame tablesoccerData_train = tablesoccerData[trainRows];
DataFrame tablesoccerData_test = tablesoccerData[testRows];

display(tablesoccerData_train.Rows.Count);
display(tablesoccerData_test.Rows.Count);

In [79]:
tablesoccerData_train = tablesoccerData_train.Sort("Result")

index,Hour,Weekday,Team1GoalKeeper,Team1Striker,Team2GoalKeeper,Team2Striker,GoalsTeam1,GoalsTeam2,GoalDifference,Result
0,1,Thursday,Patrik,Simon,Markus,Felix,5,1,4,1
1,1,Thursday,Philipp,Christoph,Michael,Roman,5,4,1,1
2,1,Thursday,Felix,Patrik,Christoph,Roman,5,3,2,1
3,1,Thursday,Roman,Dominik D.,Kathi,Felix,5,3,2,1
4,15,Friday,Patrik,Daniel S.,Felix,Roman,5,2,3,1
5,15,Friday,Roman,Patrik,Felix,Dominik D.,5,0,5,1
6,15,Thursday,Daniel S.,Felix,Markus,Michael,5,3,2,1
7,1,Thursday,Felix,Christoph,Markus,Daniel S.,5,1,4,1
8,12,Friday,Simon,Roman,Patrik,Felix,5,1,4,1
9,13,Thursday,Markus,Daniel S.,Dominik L.,Dominik D.,5,2,3,1


In [80]:
%%time

var mlContext = new MLContext();

var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey("Result", "Result")
    .Append(mlContext.Transforms.Categorical.OneHotEncoding(
    new[] 
    { 
        new InputOutputColumnPair("Team1GoalKeeper", "Team1GoalKeeper"), 
        new InputOutputColumnPair("Team1Striker", "Team1Striker"), 
        new InputOutputColumnPair("Team2GoalKeeper", "Team2GoalKeeper"), 
        new InputOutputColumnPair("Team2Striker", "Team2Striker") 
    }))
  .Append(mlContext.Transforms.Concatenate(
      "Features", 
      new[] 
      { 
          "Team1GoalKeeper", "Team1Striker", "Team2GoalKeeper", "Team2Striker"          
      }))
  .Append(mlContext.Transforms.NormalizeMinMax("Features", "Features"))
  .AppendCacheCheckpoint(mlContext);

var trainer = mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(labelColumnName: "Result", featureColumnName: "Features")
    .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel", "PredictedLabel"));

var trainingPipeline = dataProcessPipeline.Append(trainer);

var model = trainingPipeline.Fit(tablesoccerData_train);

Wall time: 1052,0511ms

In [46]:
public static void PrintMulticlassClassificationFoldsAverageMetrics(IEnumerable<TrainCatalogBase.CrossValidationResult<MulticlassClassificationMetrics>> crossValResults)
{
    var metricsInMultipleFolds = crossValResults.Select(r => r.Metrics);

    var microAccuracyValues = metricsInMultipleFolds.Select(m => m.MicroAccuracy);
    var microAccuracyAverage = microAccuracyValues.Average();
    var microAccuraciesStdDeviation = CalculateStandardDeviation(microAccuracyValues);
    var microAccuraciesConfidenceInterval95 = CalculateConfidenceInterval95(microAccuracyValues);

    var macroAccuracyValues = metricsInMultipleFolds.Select(m => m.MacroAccuracy);
    var macroAccuracyAverage = macroAccuracyValues.Average();
    var macroAccuraciesStdDeviation = CalculateStandardDeviation(macroAccuracyValues);
    var macroAccuraciesConfidenceInterval95 = CalculateConfidenceInterval95(macroAccuracyValues);

    var logLossValues = metricsInMultipleFolds.Select(m => m.LogLoss);
    var logLossAverage = logLossValues.Average();
    var logLossStdDeviation = CalculateStandardDeviation(logLossValues);
    var logLossConfidenceInterval95 = CalculateConfidenceInterval95(logLossValues);

    var logLossReductionValues = metricsInMultipleFolds.Select(m => m.LogLossReduction);
    var logLossReductionAverage = logLossReductionValues.Average();
    var logLossReductionStdDeviation = CalculateStandardDeviation(logLossReductionValues);
    var logLossReductionConfidenceInterval95 = CalculateConfidenceInterval95(logLossReductionValues);

    Console.WriteLine($"*************************************************************************************************************");
    Console.WriteLine($"*       Metrics for Multi-class Classification model      ");
    Console.WriteLine($"*------------------------------------------------------------------------------------------------------------");
    Console.WriteLine($"*       Average MicroAccuracy:    {microAccuracyAverage:0.###}  - Standard deviation: ({microAccuraciesStdDeviation:#.###})  - Confidence Interval 95%: ({microAccuraciesConfidenceInterval95:#.###})");
    Console.WriteLine($"*       Average MacroAccuracy:    {macroAccuracyAverage:0.###}  - Standard deviation: ({macroAccuraciesStdDeviation:#.###})  - Confidence Interval 95%: ({macroAccuraciesConfidenceInterval95:#.###})");
    Console.WriteLine($"*       Average LogLoss:          {logLossAverage:#.###}  - Standard deviation: ({logLossStdDeviation:#.###})  - Confidence Interval 95%: ({logLossConfidenceInterval95:#.###})");
    Console.WriteLine($"*       Average LogLossReduction: {logLossReductionAverage:#.###}  - Standard deviation: ({logLossReductionStdDeviation:#.###})  - Confidence Interval 95%: ({logLossReductionConfidenceInterval95:#.###})");
    Console.WriteLine($"*************************************************************************************************************");
}


public static double CalculateStandardDeviation(IEnumerable<double> values)
{
    double average = values.Average();
    double sumOfSquaresOfDifferences = values.Select(val => (val - average) * (val - average)).Sum();
    double standardDeviation = Math.Sqrt(sumOfSquaresOfDifferences / (values.Count() - 1));
    return standardDeviation;
}

public static double CalculateConfidenceInterval95(IEnumerable<double> values)
{
    double confidenceInterval95 = 1.96 * CalculateStandardDeviation(values) / Math.Sqrt((values.Count() - 1));
    return confidenceInterval95;
}

In [48]:
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(tablesoccerData_test, trainingPipeline, numberOfFolds: 5, labelColumnName: "Result");

PrintMulticlassClassificationFoldsAverageMetrics(crossValidationResults);

*************************************************************************************************************
*       Metrics for Multi-class Classification model      
*------------------------------------------------------------------------------------------------------------
*       Average MicroAccuracy:    0,58  - Standard deviation: (,037)  - Confidence Interval 95%: (,036)
*       Average MacroAccuracy:    0,576  - Standard deviation: (,041)  - Confidence Interval 95%: (,04)
*       Average LogLoss:          3,64  - Standard deviation: (1,297)  - Confidence Interval 95%: (1,271)
*       Average LogLossReduction: -4,511  - Standard deviation: (2,037)  - Confidence Interval 95%: (1,996)
*************************************************************************************************************


In [66]:
class Game
{
    public string Team1GoalKeeper { get; set; }
    public string Team1Striker { get; set; }
    public string Team2GoalKeeper { get; set; }
    public string Team2Striker { get; set; }
    public float Result { get; set; }
}

class Result
{
    [ColumnName("PredictedLabel")]
    public System.Single Prediction { get; set; }
    
    public float[] Score { get; set; }
}

In [83]:
var predictionEngine = mlContext.Model.CreatePredictionEngine<Game, Result>(model);

In [84]:
var result = predictionEngine.Predict(new Game()
{
    Team1GoalKeeper = "Philipp",
    Team1Striker = "Roman",
    Team2GoalKeeper = "Alexander",
    Team2Striker = "Kathi"
});

// Because of this mechanism, when using AutoML for ML.NET it's 
// a good idea to rearrange the first few items in the training dataset 
// so that each class to predict appears one by one in a logical order of some kind.

result

Prediction,Score
1,"[ 0.9999819, 1.8228935E-05 ]"


In [38]:
var predictions = new List<(float, float)>();

var data = tablesoccerData;

for (int i=0; i < data.Rows.Count; i++)
{
    var game = new Game()
    {
        Team1GoalKeeper = (string)data["Team1GoalKeeper"][i],
        Team1Striker = (string)data["Team1Striker"][i],
        Team2GoalKeeper = (string)data["Team2GoalKeeper"][i],
        Team2Striker = (string)data["Team2Striker"][i],
    };
    
    var result = (float)data["GoalDifference"][i];
    var prediction = predictionEngine.Predict(game);
    
    predictions.Add((result, prediction.Score));
}

var predictionTrace = new Graph.Scatter()
                        {
                            x = predictions.Select(p => p.Item1).ToList(),
                            y = predictions.Select(p => Math.Round(p.Item2)).ToList(),
                            //z = predictions.Select((p, index) => index).ToList(),
                            mode = "markers",
                            marker = new Graph.Marker()
                            {
                                size = 7,
                                opacity = 0.2
                            }
                        };

var correctTrace = new Graph.Scatter()
    {
        x = new int[] { -5, 5 },
        y = new int[] { -5, 5 }
    };

display(Chart.Plot(new [] { predictionTrace, correctTrace }));
