# Sentiment Analysis for Movie Reviews

## Installing packages and getting set up

In [1]:
#r "nuget:Microsoft.ML,1.5.2"
    
using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.DataOperationsCatalog;

## Declaring data classes

In [2]:
/// <summary>
/// One movie review read from the input file: the review text and its sentiment label.
/// </summary>
public class SentimentReview
{
    /// <summary>Sentiment label, read from field index 1 of each input row.</summary>
    [LoadColumn(fieldIndex: 1)]
    public string Sentiment { get; set; }

    /// <summary>Review text, read from field index 0 of each input row.</summary>
    [LoadColumn(fieldIndex: 0)]
    public string Review { get; set; }
}

In [3]:
/// <summary>
/// Output row produced when the trained model scores a single review.
/// </summary>
public class SentimentPrediction
{
    /// <summary>Boolean prediction, bound to the model's "PredictedLabel" output column.</summary>
    [ColumnName(name: "PredictedLabel")]
    public bool Prediction { get; set; }

    /// <summary>Value of the model's "Probability" output column.</summary>
    public float Probability { get; set; }

    /// <summary>Value of the model's "Score" output column.</summary>
    public float Score { get; set; }
}

In [4]:
/// <summary>
/// One entry of a lookup table that pairs a string value with the boolean
/// category it should be translated to.
/// </summary>
public class LookupMap
{
    /// <summary>The string to look up.</summary>
    public string Value { get; set; }

    /// <summary>The boolean that <see cref="Value"/> is translated to.</summary>
    public bool Category { get; set; }
}

## Building the model

In [5]:
// Shared ML.NET context for the notebook; the fixed seed keeps results deterministic.
var mlContext = new MLContext(seed: 0);

In [6]:
// Read the CSV into an IDataView (the first row is a header; fields are
// comma-separated and may be quoted), then show the resulting schema.
string dataPath = "./imdbdataset.csv";
IDataView dataView = mlContext.Data.LoadFromTextFile<SentimentReview>(
    path: dataPath,
    separatorChar: ',',
    hasHeader: true,
    allowQuoting: true);
display(dataView.Schema);

index,Name,Index,IsHidden,Type,Annotations
0,Sentiment,0,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
1,Review,1,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }


In [7]:
// Split the loaded data, requesting a test fraction of 0.2.
var trainTestSplit = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
var trainingData = trainTestSplit.TrainSet;
var testData = trainTestSplit.TestSet;

// Show the schema of each set under its own heading (training first, then test).
foreach (var (heading, split) in new[] { ("trainingData Schema", trainingData), ("testData Schema", testData) })
{
    display(h4(heading));
    display(split.Schema);
}

index,Name,Index,IsHidden,Type,Annotations
0,Sentiment,0,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
1,Review,1,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }


index,Name,Index,IsHidden,Type,Annotations
0,Sentiment,0,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
1,Review,1,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }


In [8]:
// Lookup table translating the sentiment strings found in the CSV into the
// boolean values that the model can work with.
var lookupData = new[]
{
    new LookupMap { Value = "negative", Category = false },
    new LookupMap { Value = "positive", Category = true },
};

// Wrap the table in an IDataView so ML.NET transforms can consume it.
var lookupIdvMap = mlContext.Data.LoadFromEnumerable(lookupData);

display(lookupIdvMap.Schema);

index,Name,Index,IsHidden,Type,Annotations
0,Value,0,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
1,Category,1,False,{ Microsoft.ML.Data.BooleanDataViewType: RawType: System.Boolean },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }


In [9]:
// Data-preparation steps:
//   1. map the Sentiment text to a boolean "Label" column using the lookup table
//      (its "Value" column is the key, its "Category" column is the mapped value)
//   2. featurize the Review text into the "Features" column
var dataProcessPipeline = mlContext.Transforms.Conversion
    .MapValue(
        outputColumnName: "Label",
        lookupMap: lookupIdvMap,
        keyColumn: lookupIdvMap.Schema["Value"],
        valueColumn: lookupIdvMap.Schema["Category"],
        inputColumnName: nameof(SentimentReview.Sentiment))
    .Append(mlContext.Transforms.Text.FeaturizeText(
        outputColumnName: "Features",
        inputColumnName: nameof(SentimentReview.Review)));

// set the training algorithm
var trainer = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
    labelColumnName: "Label",
    featureColumnName: "Features");

// add the training algorithm to the pipeline
var trainingPipeline = dataProcessPipeline.Append(trainer);

display(trainingPipeline);

LastEstimator
"{ Microsoft.ML.Trainers.SdcaLogisticRegressionBinaryTrainer: Info: { Microsoft.ML.TrainerInfo: NeedNormalization: True, WantCaching: True }, FeatureColumn: { Microsoft.ML.SchemaShape+Column: Name: Features, Kind: { Microsoft.ML.SchemaShape+Column+VectorKind: value__: 1 }, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Single }, IsKey: False, Annotations: [ ] }, LabelColumn: { Microsoft.ML.SchemaShape+Column: Name: Label, Kind: { Microsoft.ML.SchemaShape+Column+VectorKind: value__: 0 }, ItemType: { Microsoft.ML.Data.BooleanDataViewType: RawType: System.Boolean }, IsKey: False, Annotations: [ ] }, WeightColumn: { Microsoft.ML.SchemaShape+Column: Name: <null>, Kind: { Microsoft.ML.SchemaShape+Column+VectorKind: value__: 0 }, ItemType: <null>, IsKey: False, Annotations: <null> } }"


In [10]:
// Train the model by fitting the full pipeline to trainingData.
// NOTE(review): the progress message mentions testing, but this cell only calls Fit.
Console.WriteLine("Please wait. The model is currently being trained (and tested)...");

ITransformer trainedModel = trainingPipeline.Fit(trainingData);

Console.WriteLine("Model trained!");

Please wait. The model is currently being trained (and tested)...
Model trained!


In [11]:
// Run the trained model over the test data, then compute binary-classification
// metrics from the resulting "Label" and "Score" columns.
var predictions = trainedModel.Transform(testData);

var metrics = mlContext.BinaryClassification.Evaluate(
    data: predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score");

display(metrics);

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
0.5006565972153878,0.4992911597115844,0.9998956617722234,0.9292770964818572,0.8579664049299275,0.8553959627329193,0.8656452563347083,0.8606640863719699,0.8501006036217303,0.8604900907937127,0.9282273148610036,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.8553959627329193, 0.8606640863719699 ], PerClassRecall: [ 0.8656452563347083, 0.8501006036217303 ], Counts: [ [ 4407, 684 ], [ 745, 4225 ] ], NumberOfClasses: 2 }"


In [12]:
// Build a prediction engine from the trained model; it takes a SentimentReview
// and produces a SentimentPrediction.
var predEngine = mlContext.Model.CreatePredictionEngine<SentimentReview, SentimentPrediction>(trainedModel);

Console.WriteLine("Prediction model/engine built!");

Prediction model/engine built!


## Using the model

In [13]:
// Hand-written sample reviews for trying out the prediction engine.
// Only the Review text is set; Sentiment is left unassigned.
var badReview = new SentimentReview { Review = "I hate this movie! It is terrible!" };
var goodReview = new SentimentReview { Review = "I love this movie! It is great!" };
var neutralReview = new SentimentReview { Review = "I don't know about this movie. It is OK." };

// Show each sample under its own heading.
foreach (var (heading, review) in new[] { ("Bad Review", badReview), ("Good Review", goodReview), ("Neutral Review", neutralReview) })
{
    display(h4(heading));
    display(review);
}

Sentiment,Review
<null>,I hate this movie! It is terrible!


Sentiment,Review
<null>,I love this movie! It is great!


Sentiment,Review
<null>,I don't know about this movie. It is OK.


In [14]:
// Score every sample review first, then display the results under matching headings.
var predBadReview = predEngine.Predict(badReview);
var predGoodReview = predEngine.Predict(goodReview);
var predNeutralReview = predEngine.Predict(neutralReview);

foreach (var (heading, prediction) in new[] { ("Bad Review", predBadReview), ("Good Review", predGoodReview), ("Neutral Review", predNeutralReview) })
{
    display(h4(heading));
    display(prediction);
}

Prediction,Probability,Score
False,0.105195455,-2.1407852


Prediction,Probability,Score
True,0.9982779,6.362488


Prediction,Probability,Score
True,0.5594775,0.2390418
