# Using regression algorithms with ML.NET

### Install NuGet-Packages
_The Microsoft.ML.FastTree package must be the first package referenced in the Jupyter notebook due to a bug in dotnet test. See https://github.com/dotnet/interactive/issues/55._

In [None]:
#r "nuget:Microsoft.ML.FastTree,1.4.0"

In [None]:
#r "nuget:Microsoft.Data.Analysis,0.2.0"
#r "nuget:Microsoft.ML,1.4.0"

### Using statements and helper methods

In [None]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.Data.Analysis;
using XPlot.Plotly;

In [None]:
using Microsoft.AspNetCore.Html;
// Registers a "text/html" formatter for DataFrame: the first rows are written as an
// HTML table with a leading "index" column followed by one column per DataFrame column.
Formatter<DataFrame>.Register((df, writer) =>
{
    // Upper bound on the number of rows written to the table.
    const int maxRows = 20;

    // Header row: the "index" label first, then every column name in column order.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add(th(i("index")));
    foreach (var column in df.Columns)
    {
        headerCells.Add((IHtmlContent) th(column.Name));
    }

    // Body rows: the row number followed by each value of that row.
    var bodyRows = new List<List<IHtmlContent>>();
    var rowCount = Math.Min(maxRows, df.Rows.Count);
    for (var rowIndex = 0; rowIndex < rowCount; rowIndex++)
    {
        var rowCells = new List<IHtmlContent>();
        rowCells.Add(td(rowIndex));
        foreach (var value in df.Rows[rowIndex])
        {
            rowCells.Add(td(value));
        }
        bodyRows.Add(rowCells);
    }

    var renderedRows = bodyRows.Select(cells => tr(cells));
    var html = table(thead(headerCells), tbody(renderedRows));

    writer.Write(html);
}, "text/html");

In [None]:
/// <summary>
/// Writes the per-fold averages of the regression metrics (L1, L2, RMS, loss function,
/// R-squared) of a cross-validation run to the console.
/// </summary>
/// <param name="crossValidationResults">One result per cross-validation fold.</param>
/// <exception cref="ArgumentNullException"><paramref name="crossValidationResults"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The sequence contains no folds (thrown by <c>Average</c> after the banner lines are printed).
/// </exception>
static void PrintRegressionFoldsAverageMetrics(IEnumerable<TrainCatalogBase.CrossValidationResult<RegressionMetrics>> crossValidationResults)
{
    if (crossValidationResults == null)
    {
        throw new ArgumentNullException(nameof(crossValidationResults));
    }

    // Materialize once: the parameter is a plain IEnumerable, and reading it separately
    // for each of the five metrics would enumerate (and possibly recompute) it five times.
    var foldMetrics = crossValidationResults.Select(r => r.Metrics).ToList();

    Console.WriteLine("*************************************************************************************************************");
    Console.WriteLine("*       Metrics for Regression model      ");
    Console.WriteLine("*------------------------------------------------------------------------------------------------------------");
    Console.WriteLine($"*       Average L1 Loss:       {foldMetrics.Average(m => m.MeanAbsoluteError):0.###} ");
    Console.WriteLine($"*       Average L2 Loss:       {foldMetrics.Average(m => m.MeanSquaredError):0.###}  ");
    Console.WriteLine($"*       Average RMS:           {foldMetrics.Average(m => m.RootMeanSquaredError):0.###}  ");
    Console.WriteLine($"*       Average Loss Function: {foldMetrics.Average(m => m.LossFunction):0.###}  ");
    Console.WriteLine($"*       Average R-squared:     {foldMetrics.Average(m => m.RSquared):0.###}  ");
    Console.WriteLine("*************************************************************************************************************");
}

### Load the data

In [None]:
%%time

// Absolute path of the CSV file that is loaded below.
var fileName = @"D:\softaware\samples-ml\data\calls.csv";

// Read the CSV file into a DataFrame that the following cells work on.
DataFrame calls = DataFrame.LoadCsv(fileName);

### Analyze the data

In [None]:
// Bare expression without a semicolon: the DataFrame is the value of this cell.
// NOTE(review): presumably rendered through the "text/html" DataFrame formatter registered
// earlier in this notebook (which writes at most 20 rows) — confirm in the notebook output.
calls

In [None]:
// Mean and median of the "Calls" column.
var callsColumn = calls["Calls"];
display(callsColumn.Mean());
display(callsColumn.Median());

// Number of occurrences of each distinct value in the "Hour" column.
display(calls["Hour"].ValueCounts());

In [None]:
// Histogram over the values of the "Calls" column (cell result, hence no semicolon).
Chart.Plot(new Graph.Histogram { x = calls["Calls"] })

In [None]:
// Rows per weather condition, and summed calls per weather condition.
var weatherCount = calls["WeatherConditions"].ValueCounts();
var callsGrouped = calls.GroupBy("WeatherConditions").Sum("Calls");

display(weatherCount);
display(callsGrouped);

// Join sums and counts on the weather condition, then turn each sum into an
// average by dividing it by the number of rows with that condition.
var merged = callsGrouped.Merge<string>(weatherCount, "WeatherConditions", "Values");
merged["Calls"] = merged["Calls"] / merged["Counts"];

display(merged);

// Cell result: average calls plotted against the weather condition.
Chart.Plot(new Graph.Scatter
{
    x = merged["WeatherConditions"],
    y = merged["Calls"]
})

### Train/test split

In [None]:
// Fraction of all rows that goes into the test set (used to compute testSize below);
// the remaining rows form the training set.
double splitFactor = 0.3;

/// <summary>
/// Shuffles <paramref name="array"/> in place by swapping each position with a randomly
/// chosen position at or after it, and returns the same array instance.
/// </summary>
/// <typeparam name="T">Element type of the array.</typeparam>
/// <param name="array">Array whose elements are reordered in place.</param>
/// <param name="seed">
/// Optional seed for the random number generator. With a seed, the same input always
/// yields the same permutation, which makes a split based on it reproducible. Without
/// one, a default-seeded <see cref="Random"/> is used, as before.
/// </param>
/// <returns>The array passed in, now shuffled.</returns>
/// <exception cref="ArgumentNullException"><paramref name="array"/> is null.</exception>
static T[] Shuffle<T>(T[] array, int? seed = null)
{
    if (array == null)
    {
        throw new ArgumentNullException(nameof(array));
    }

    Random rand = seed.HasValue ? new Random(seed.Value) : new Random();

    // The last position can only be swapped with itself, so the loop stops one short.
    for (int i = 0; i < array.Length - 1; i++)
    {
        // Pick uniformly from the not-yet-fixed tail [i, Length).
        int r = i + rand.Next(array.Length - i);
        (array[i], array[r]) = (array[r], array[i]);
    }

    return array;
}

// Shuffle all row indices, then cut the shuffled list at testSize:
// the first testSize indices become the test rows, the rest the training rows.
int[] randomIndices = Shuffle(Enumerable.Range(0, (int)calls.Rows.Count).ToArray());
int testSize = (int)(calls.Rows.Count * splitFactor);
int[] testRows = randomIndices.Take(testSize).ToArray();
int[] trainRows = randomIndices.Skip(testSize).ToArray();

// Materialize the two row subsets as separate DataFrames.
DataFrame calls_train = calls[trainRows];
DataFrame calls_test = calls[testRows];

display($"{calls_train.Rows.Count} rows for training");
display($"{calls_test.Rows.Count} rows for testing");

### Input and Output classes

In [None]:
/// <summary>
/// Input for a single prediction made through the prediction engine.
/// The property names match the columns that the model pipeline in this notebook
/// one-hot encodes and concatenates into "Features".
/// </summary>
class ModelInput
{
    /// <summary>Hour value; the prediction loops in this notebook pass 0 to 23.</summary>
    public float Hour { get; set; }
    /// <summary>Month value; the whole-year cell passes <c>DateTime.Month</c>.</summary>
    public float Month { get; set; }
    /// <summary>Day of week as text; the whole-year cell passes <c>DateTime.DayOfWeek.ToString()</c>.</summary>
    public string DayOfWeek { get; set; }
    /// <summary>Weather condition as text, taken from the "WeatherConditions" column.</summary>
    public string WeatherConditions { get; set; }
}

/// <summary>
/// Result of a single prediction made through the prediction engine.
/// </summary>
class ModelOutput
{
    /// <summary>
    /// Predicted value; this notebook reads it as the predicted number of calls.
    /// NOTE(review): presumably bound by name to the model's "Score" output column — confirm.
    /// </summary>
    public float Score { get; set; }
}

### Model pipeline and training

In [None]:
%%time

var mlContext = new MLContext();

// Columns that are one-hot encoded in place (same input and output name) and then
// concatenated, in this order, into the "Features" column.
var featureColumns = new[] { "DayOfWeek", "WeatherConditions", "Hour", "Month" };

var dataProcessPipeline = mlContext.Transforms.Categorical
    .OneHotEncoding(
        featureColumns
            .Select(columnName => new InputOutputColumnPair(columnName, columnName))
            .ToArray())
    .Append(mlContext.Transforms.Concatenate("Features", featureColumns));

// Training algorithm: FastTree Tweedie regression on "Features", predicting "Calls"
var trainer = mlContext.Regression.Trainers.FastTreeTweedie(
    labelColumnName: "Calls",
    featureColumnName: "Features",
    numberOfTrees: 50);
var trainingPipeline = dataProcessPipeline.Append(trainer);

// Fit the complete pipeline on the training rows.
var model = trainingPipeline.Fit(calls_train);

// Engine for single-row predictions with the fitted model.
var predictionEngine = mlContext.Model.CreatePredictionEngine<ModelInput, ModelOutput>(model);

### Cross validation

In [None]:
// Runs 5-fold cross-validation of trainingPipeline on calls_test with "Calls" as the label
// column, then prints the metrics averaged over the folds.
// NOTE(review): per the ML.NET documentation CrossValidate fits the estimator anew for each
// fold, so these metrics presumably describe models trained on subsets of calls_test, not the
// `model` fitted on calls_train above — confirm this is the intended evaluation.
var crossValidationResults = mlContext.Regression.CrossValidate(calls_test, trainingPipeline, numberOfFolds: 5, labelColumnName: "Calls");

PrintRegressionFoldsAverageMetrics(crossValidationResults);

### Show actual and predicted values for a random sample

In [None]:
// Draw a single random row; its date, month, weekday and weather drive this cell
var randomSample = calls.Sample(1);

// All rows that share the sampled row's date provide the actual values
var actualValues = calls.Filter(calls["Date"].ElementwiseEquals(randomSample["Date"][0]));

var actual = new Graph.Scatter
{
    x = actualValues["Hour"],
    y = actualValues["Calls"],
    mode = "markers",
    name = "actual"
};

// Inputs that stay the same for every hour of the sampled day
float sampleMonth = (float)randomSample["Month"][0];
string sampleDayOfWeek = (string)randomSample["DayOfWeek"][0];
string sampleWeather = (string)randomSample["WeatherConditions"][0];

// One prediction per hour 0..23, keeping the hour alongside the predicted score
var predictedValues = Enumerable.Range(0, 24)
    .Select(hour =>
    {
        var input = new ModelInput
        {
            Hour = (float)hour,
            Month = sampleMonth,
            DayOfWeek = sampleDayOfWeek,
            WeatherConditions = sampleWeather
        };

        return (hour: hour, calls: predictionEngine.Predict(input).Score);
    })
    .ToList();

// Scatter trace for the predicted values
var predicted = new Graph.Scatter
{
    x = predictedValues.Select(point => point.hour),
    y = predictedValues.Select(point => point.calls),
    mode = "markers",
    name = "predicted"
};

// Output: date heading, weekday, weather, then both traces in one chart
display(h1(randomSample["Date"][0]));
display(randomSample["DayOfWeek"][0]);
display("Weather conditions: " + randomSample["WeatherConditions"][0]);

display(Chart.Plot(new [] { actual, predicted }));

### Analyze a whole year

In [None]:
// Compares actual and predicted calls per day for one year.
// Actual values: the "Calls" column summed per "Date".
var groupedByDate = calls.GroupBy("Date").Sum("Calls");

var actual = new Graph.Scatter() {
        x = groupedByDate["Date"],
        y = groupedByDate["Calls"],
        name = "actual"
    };

// Predict a whole year, starting with startDate (end date is exclusive)
DateTime startDate = new DateTime(2019, 1, 1);
DateTime endDate = startDate.AddYears(1);
var dateColumn = calls["Date"];
var predictedValues = new List<(string date, float calls)>();

for (DateTime date = startDate; date < endDate; date = date.AddDays(1))
{
    // Format with the invariant culture: in a custom format string ':' is the
    // culture-specific time separator and the year follows the culture's calendar,
    // so the current culture could produce text that never equals the "Date" values.
    // NOTE(review): assumes the "Date" column holds text like "01/31/2019 00:00:00" — confirm.
    var dateString = date.ToString(
        "MM'/'dd'/'yyyy 00:00:00",
        System.Globalization.CultureInfo.InvariantCulture);

    var row = calls.Filter(dateColumn.ElementwiseEquals(dateString));

    // Without any row for this date the weather conditions are unknown, so no
    // prediction can be made; skip the day instead of failing on row index 0.
    if (row.Rows.Count == 0)
    {
        continue;
    }

    string weatherConditions = (string)row["WeatherConditions"][0];

    // Sum the hourly predictions (hours 0..23) to get the predicted calls of the day.
    float callCount = 0;
    for (int hour = 0; hour < 24; hour++)
    {
        var prediction = predictionEngine.Predict(new ModelInput()
        {
            Hour = (float)hour,
            Month = (float)date.Month,
            DayOfWeek = date.DayOfWeek.ToString(),
            WeatherConditions = weatherConditions
        });

        callCount += prediction.Score;
    }

    predictedValues.Add((dateString, callCount));
}

// Create a scatter chart for the predicted values
var predicted = new Graph.Scatter {
    x = predictedValues.Select(p => p.date),
    y = predictedValues.Select(p => p.calls),
    name = "predicted"
};

display(Chart.Plot(new [] { actual, predicted }));

### Export model

In [None]:
%%time

// Write the fitted model together with the training data's input schema to "Model.zip".
var trainingSchema = ((IDataView)calls_train).Schema;
mlContext.Model.Save(model, trainingSchema, "Model.zip");