__Objective__: Load the regression and classification models trained by the script `run_pipeline.py` and assess their quality.

__TODO__: add the figures and models to git by removing them from `.gitignore`.

In [1]:
%load_ext blackcellmagic

In [2]:
import os, pdb
import pandas as pd
import seaborn as sns
from train_model import train_model, compare_methods



# Parameters

In [3]:
# Rating cutoff; handed to compare_methods below as "regression_threshold".
TARGET_CUTOFF = 7.5

# Root of the project checkout. Set the IMDB_RATINGS_BASE_DIR environment
# variable to run this notebook from another machine or location; the original
# author's path remains the default so existing runs behave as before.
PROJECT_BASE_DIR = os.environ.get(
    "IMDB_RATINGS_BASE_DIR", "/home/rohail/projects/imdb_ratings/"
)

# Sub-directories, relative to PROJECT_BASE_DIR.
model_save_dir = "models/"
plot_write_dir = "reports/figures/"

# Column groups passed through to the helpers imported from train_model.
idx_columns = ["imdb_title_id", "title", "original_title"]
target_columns = ["avg_vote", "avg_vote_flag"]

# Probability cutoff handed to compare_methods as "classification_threshold";
# the value was picked by looking at the classifier's diagnostic plot.
classification_threshold = 0.8

In [4]:
# Read the three CSV files stored in the model directory, in a fixed order:
# the modelling frame first, then the y_test and x_test files.
df_model, y_test, x_test = (
    pd.read_csv(os.path.join(PROJECT_BASE_DIR, model_save_dir, csv_name))
    for csv_name in ("df_model.csv", "y_test.csv", "x_test.csv")
)

In [5]:
# Settings handed to train_model. The same dict serves both models; only
# "model_type" is switched between the two calls.
parameters = {
    "plot_write_dir": os.path.join(PROJECT_BASE_DIR, plot_write_dir),
    "model_save_dir": os.path.join(PROJECT_BASE_DIR, model_save_dir),
    "model_type": "regression",  # set to "classification" for the second call
    "idx_columns": idx_columns,
    "test_set_size": 0.1,
    "training_parameters": {
        # open question: does this differ from passing sample weights to fit?
        "class_weight": "balanced",
        "n_jobs": -1,
        "max_iter": 10000,
        "scoring": "balanced_accuracy",
    },
}

# --- regression model ---
# load_from_disk names the saved joblib file; the last two return values are
# not needed in this notebook.
reg_model, df_reg_coefs, _, _ = train_model(
    df_model, parameters=parameters, load_from_disk="regression_2020_03_04_13_56.joblib"
)

# --- classification model ---
# Flip the model type in place and repeat the call with the classifier's file.
parameters["model_type"] = "classification"
clf_model, df_clf_coefs, _, _ = train_model(
    df_model, parameters=parameters, load_from_disk="classification_2020_03_04_14_58.joblib"
)

Diagnostic plots for this model can be found in the following directory: 
/home/rohail/projects/imdb_ratings/reports/figures/
The model itself is saved in the following directory: /home/rohail/projects/imdb_ratings/models/

Loading model from /home/rohail/projects/imdb_ratings/models/regression_2020_03_04_13_56.joblib
Diagnostic plots for this model can be found in the following directory: 
/home/rohail/projects/imdb_ratings/reports/figures/
The model itself is saved in the following directory: /home/rohail/projects/imdb_ratings/models/

Loading model from /home/rohail/projects/imdb_ratings/models/classification_2020_03_04_14_58.joblib


# Validate model

In [6]:
# Settings for compare_methods. This rebinds `parameters`, replacing the
# train_model settings dict defined in the previous cell.
parameters = {
    "target_columns": target_columns,
    # cutoff chosen from looking at the diagnostic plot
    "classification_threshold": classification_threshold,
    "regression_threshold": TARGET_CUTOFF,
    "idx_columns": idx_columns,
}

# Hand-picked titles used for a sanity check on rows taken from the training data.
# NOTE(review): 29 titles are listed, but the recorded output reports 32 sample
# rows - presumably some titles match more than one film; confirm in compare_methods.
train_movies_sample = {
    "The Dark Knight", "Anchorman: The Legend of Ron Burgundy", "The Big Lebowski",
    "Batman v Superman: Dawn of Justice", "Black Panther",
    "Kabhi Khushi Kabhie Gham...", "3 Idiots", "The Intouchables", "Amélie",
    "The Matrix", "The Matrix Reloaded", "V for Vendetta", "Kill Bill: Vol. 1",
    "La vita è bella", "Die Hard", "Requiem for a Dream",
    "Terminator 3: Rise of the Machines", "The Terminator",
    "Terminator 2: Judgment Day", "Titanic", "The Departed", "Groundhog Day",
    "Love in Kilnerry", "Jinnah", "Jawani Phir Nahi Ani", "Bol",
    "Das letzte Mahl", "The Lives of Others", "Das Experiment",
}

# Run both models on the test data and on the sampled training titles;
# compare_methods returns one prediction frame for each.
df_predict_test, df_predict_train = compare_methods(
    df_model,
    reg_model,
    clf_model,
    x_test,
    y_test,
    train_movies_sample,
    parameters,
)

Making predictions on test data
       reg_rating_prediction  clf_prob_prediction
count            6651.000000          6651.000000
mean                6.624809             0.860669
std                 1.195558             0.100118
min                -5.361077             0.024658
25%                 5.855755             0.805919
50%                 6.573938             0.884124
75%                 7.336742             0.935293
max                15.439234             0.999633
Regression and classification predictions the same? False
Balanced accuracy for regression:  0.5254132927882691
Balanced accuracy for classification:  0.4765079744088544
Making predictions on sample data from train data
       reg_rating_prediction  clf_prob_prediction
count              32.000000            32.000000
mean                8.495867             0.829495
std                 2.785052             0.076770
min                 3.792930             0.715504
25%                 6.647194             0.75938