# Machine Learning Extension

We will use regression and classification models to further analyze the movie data and how different features can be used to predict the total gross revenue of a movie.

## Data Prep

In [91]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, explained_variance_score

# Suite of Machine Learning Algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# Setup to Ignore Version Errors and Deprecations
import warnings
warnings.filterwarnings("ignore")

In [92]:
# loading the data
merged_data = pd.read_csv("../data/merged_data.csv")
merged_data.head()

Unnamed: 0,movie_id,primary_title,genres,individual_genre,runtime_minutes,title,studio,domestic_gross,foreign_gross,year,averagerating,numvotes,director_id,director_name,total_gross
0,tt0315642,Wazir,"Action,Crime,Drama",Action,103.0,Wazir,Relbig.,1100000.0,0.0,2016,7.1,15378,nm2349060,Bejoy Nambiar,1100000.0
1,tt0315642,Wazir,"Action,Crime,Drama",Crime,103.0,Wazir,Relbig.,1100000.0,0.0,2016,7.1,15378,nm2349060,Bejoy Nambiar,1100000.0
2,tt0315642,Wazir,"Action,Crime,Drama",Drama,103.0,Wazir,Relbig.,1100000.0,0.0,2016,7.1,15378,nm2349060,Bejoy Nambiar,1100000.0
3,tt0337692,On the Road,"Adventure,Drama,Romance",Adventure,124.0,On the Road,IFC,744000.0,8000000.0,2012,6.1,37886,nm0758574,Walter Salles,8744000.0
4,tt0337692,On the Road,"Adventure,Drama,Romance",Drama,124.0,On the Road,IFC,744000.0,8000000.0,2012,6.1,37886,nm0758574,Walter Salles,8744000.0


### Data Cleaning & Feature Engineering

In [93]:
# get the features that we want to analyze in the models
model_data = merged_data[["individual_genre", "runtime_minutes", "studio", "year", "averagerating", "director_id", "total_gross"]]

model_data.head()

Unnamed: 0,individual_genre,runtime_minutes,studio,year,averagerating,director_id,total_gross
0,Action,103.0,Relbig.,2016,7.1,nm2349060,1100000.0
1,Crime,103.0,Relbig.,2016,7.1,nm2349060,1100000.0
2,Drama,103.0,Relbig.,2016,7.1,nm2349060,1100000.0
3,Adventure,124.0,IFC,2012,6.1,nm0758574,8744000.0
4,Drama,124.0,IFC,2012,6.1,nm0758574,8744000.0


In [94]:
# check for null values
model_data.isna().sum()

individual_genre     0
runtime_minutes     87
studio               0
year                 0
averagerating        0
director_id          0
total_gross          0
dtype: int64

In [95]:
# drop null values of runtime_minutes
model_data = model_data.dropna(subset=["runtime_minutes"])

# Machine Learning

### Helper Functions

In [96]:
# reset scores for model performance tests for different X_train variations
def reset_model_scores(models):
  for name in models:
    models[name]["All_Scores"] = list()
    models[name]["Top_Score"] = float()
    models[name]["Mean_Score"] = float()
    models[name]["Std_Score"] = float()

  return models

In [97]:
# helper function to test multiple model performances using cross_val_score
def test_models_performance(models, x_train, y_train, isRegressor, num_folds = 10):

  # reset the performance scores first using function above
  reset_model_scores(models)

  # set scoring type based on model type
  scoring = "neg_mean_squared_error" if isRegressor else "accuracy"

  # get the performance scores for each model and add them to the
  # corresponding result list
  for name in models:

    folds = KFold(n_splits=num_folds) if isRegressor else StratifiedKFold(n_splits=num_folds)

    results = cross_val_score(estimator=models[name]["Estimator"],
                              X=x_train,
                              y=y_train,
                              cv=folds,
                              scoring=scoring)
    models[name]["Top_Score"] = results.max()
    models[name]["Mean_Score"] = results.mean()
    models[name]["Std_Score"] = results.std()

    for result in results:
      models[name]["All_Scores"].append(result)

  # print the results
  for name in models:
    print("\n[MODEL TYPE: {}]\n".format(name))
    print(">>>> Top Performance: \t\t{:.4f}".format(models[name]["Top_Score"]))
    print(">>>> Average Performance: \t{:.4f}".format(models[name]["Mean_Score"]))
    print(">>>> Spread of Performance: \t{:.4f}".format(models[name]["Std_Score"]))

In [98]:
# printing accuracy scores
def print_accuracy(y_test, y_pred, isRegressor):

  if isRegressor:
    accuracy = 100 * explained_variance_score(y_test, y_pred)
  else:
    accuracy = 100 * accuracy_score(y_true=y_test,
                                y_pred=y_pred)

  print("> ACCURACY: \t{:.2f}%".format(accuracy))

In [99]:
# helper function to fit and predict a model
# prints the accuracy and returns the predicted y values
def fit_predict(model, X_train, y_train, X_test, y_test, isRegressor):
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    print_accuracy(y_test, y_pred, isRegressor=isRegressor)

    return y_pred

In [100]:
# helper function to use LabelEncoder on string objects in a dataframe
def encode_strings(df):
  # Apply LabelEncoder to each text column in the DataFrame
  for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

## "Shotgun Approach": Linear Regression

In [101]:
TARGET = ["total_gross"]

X, y = model_data.drop(columns=TARGET, axis=1), model_data[TARGET]

In [102]:
encode_strings(X)

In [103]:
# Train test split data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                        train_size=0.7,
                                        test_size=0.3,
                                        random_state=42)

In [104]:
# instantiate Linear Regression model
lin_reg_model = LinearRegression()

y_pred = fit_predict(lin_reg_model, X_train, y_train, X_test, y_test, isRegressor=True)

> ACCURACY: 	5.63%


In [105]:
model_data["total_gross"].describe()

count    7.682000e+03
mean     9.497949e+07
std      1.988016e+08
min      1.000000e+02
25%      4.264000e+05
50%      1.010000e+07
75%      8.710000e+07
max      1.405400e+09
Name: total_gross, dtype: float64

The linear regression had an accuracy of 5.63%, which is really bad. Although we can try to improve that, let's try making our target value (total_gross) a categorical value and using classification models instead to see if accuracy improves.

## "Shotgun Approach": Classification Models

### Data Preprocessing

In [106]:
# We want to use total_gross as a categorical value
# let's create buckets for different areas following percentiles

buckets = [0, 435000, 87300000, 1405400000]
labels = ["low", "medium", "high"]

model_data["gross_class"] = pd.cut(model_data["total_gross"], bins=buckets, labels=labels)
model_data.head()

Unnamed: 0,individual_genre,runtime_minutes,studio,year,averagerating,director_id,total_gross,gross_class
0,Action,103.0,Relbig.,2016,7.1,nm2349060,1100000.0,medium
1,Crime,103.0,Relbig.,2016,7.1,nm2349060,1100000.0,medium
2,Drama,103.0,Relbig.,2016,7.1,nm2349060,1100000.0,medium
3,Adventure,124.0,IFC,2012,6.1,nm0758574,8744000.0,medium
4,Drama,124.0,IFC,2012,6.1,nm0758574,8744000.0,medium


In [107]:
# drop the total_gross column and put in a new DF for categorical testing
cat_model_data = model_data.drop(columns=["total_gross"])
cat_model_data.head()

Unnamed: 0,individual_genre,runtime_minutes,studio,year,averagerating,director_id,gross_class
0,Action,103.0,Relbig.,2016,7.1,nm2349060,medium
1,Crime,103.0,Relbig.,2016,7.1,nm2349060,medium
2,Drama,103.0,Relbig.,2016,7.1,nm2349060,medium
3,Adventure,124.0,IFC,2012,6.1,nm0758574,medium
4,Drama,124.0,IFC,2012,6.1,nm0758574,medium


### Shotgun Approach using 5 different classification models

In [108]:
# new target
TARGET = ["gross_class"]
X_cat, y_cat = cat_model_data.drop(columns=TARGET), cat_model_data[TARGET]

In [109]:
# encode strings in X_cat
encode_strings(X_cat)

In [110]:
# Split X and Y into train and test
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, y_cat,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=42)

In [111]:
# creating models that we want to test to see which is most optimized
# we will use this to test different X_train variations
cat_models = {
    "KNN": {
        "Estimator": KNeighborsClassifier(),
        },
    "SVM": {
        "Estimator": SVC(),
        },
    "CART": {
        "Estimator": DecisionTreeClassifier(),
        },
    "NB": {
        "Estimator": GaussianNB(),
        },
    "LOGREG": {
        "Estimator": LogisticRegression(),
        }
}

In [112]:
# test performance of different models using X_train
test_models_performance(cat_models, X_cat_train, y_cat_train, isRegressor=False)


[MODEL TYPE: KNN]

>>>> Top Performance: 		0.6564
>>>> Average Performance: 	0.6207
>>>> Spread of Performance: 	0.0173

[MODEL TYPE: SVM]

>>>> Top Performance: 		0.4984
>>>> Average Performance: 	0.4980
>>>> Spread of Performance: 	0.0004

[MODEL TYPE: CART]

>>>> Top Performance: 		0.8780
>>>> Average Performance: 	0.8329
>>>> Spread of Performance: 	0.0210

[MODEL TYPE: NB]

>>>> Top Performance: 		0.5456
>>>> Average Performance: 	0.5141
>>>> Spread of Performance: 	0.0202

[MODEL TYPE: LOGREG]

>>>> Top Performance: 		0.5391
>>>> Average Performance: 	0.5097
>>>> Spread of Performance: 	0.0135


A decision tree classifier seems to be the best performing model with 83.12% average accuracy.

In [113]:
# let's create an instance of that and tune it to have even better accuracy
dt_model = DecisionTreeClassifier()

y_pred = fit_predict(dt_model, X_cat_train, y_cat_train, X_cat_test, y_cat_test, isRegressor=False)

> ACCURACY: 	81.85%


# Accuracy to beat: 81.78%

### Tuning: Standard Scaler

In [114]:
# use standard scaler
# check if that will gain better results
scaler = StandardScaler()
X_cat_train_scaled = scaler.fit_transform(X_cat_train)
X_cat_test_scaled = scaler.transform(X_cat_test)

In [115]:
y_pred = fit_predict(dt_model, X_cat_train_scaled, y_cat_train, X_cat_test_scaled, y_cat_test, isRegressor=False)

> ACCURACY: 	81.65%


The standard scaler didn't increase the accuracy by much.

# Accuracy to beat: 81.59%

### Tuning: MinMax Scaler

In [116]:
# Let's try using a minmax scaler
scaler = MinMaxScaler()
X_cat_train_mm_scaled = scaler.fit_transform(X_cat_train)
X_cat_test_mm_scaled = scaler.transform(X_cat_test)

In [117]:
y_pred = fit_predict(dt_model, X_cat_train_mm_scaled, y_cat_train, X_cat_test_mm_scaled, y_cat_test, isRegressor=False)

> ACCURACY: 	81.46%


The minmax scaler also didn't increase the accuracy by much.

# Accuracy to beat: 81.65%

### Tuning: GridSearchCV

In [118]:
# let's investigate the expressed signal from each of our features
importances, features = dt_model.feature_importances_, list(X_cat)

feature_importances = [(features[iteration], importances[iteration]) for iteration in range(len(features))]
feature_importances.sort(reverse=True, key=lambda X_cat: X_cat[1])

feature_importances

[('studio', 0.3232623026076705),
 ('director_id', 0.1950744242217082),
 ('runtime_minutes', 0.16867287187327468),
 ('averagerating', 0.14880657745983003),
 ('year', 0.09937146957516377),
 ('individual_genre', 0.06481235426235275)]

In [119]:
# choose hyperparamters to test in the GridSearchCV
hyperparameters = {
		 'criterion': ['gini', 'entropy'],
		 'max_depth': [10, 20, 30],
		 'max_leaf_nodes': [1000, 5000, 10000],
		 'min_samples_leaf': [20, 50, 100],
		 'min_samples_split': [10, 50, 100]
}

In [120]:
tuned_model = DecisionTreeClassifier(random_state=42)
model_tuner = GridSearchCV(tuned_model, hyperparameters, cv=10)

In [121]:
model_tuner.fit(X_cat_train, y_cat_train)

In [122]:
optimally_tuned_classifier = model_tuner.best_estimator_

optimally_tuned_classifier

From the hyperparameters fed into the GridSearchCV, we see the most optimal hyperparameters for our DecisionTreeClassifier.

In [123]:
# Get accuracy of the most optimal classifier
y_pred = optimally_tuned_classifier.predict(X_cat_test)

print_accuracy(y_cat_test, y_pred, isRegressor=False)

> ACCURACY: 	66.69%


Hmmm...our accuracy went down by 15%! Let's look at the parameters of our original DecisionTreeClassifier to see what hyperparamters were used to get the 81% accuracy.

In [124]:
# Getting parameters of our original decision tree
dt_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

Our original decision tree seems to just be a a stump or singular line, which may mean the model is overfit to the original data. This suggests that an ensemble model would do well with this data because an ensemble is just a group of stumps.

In [125]:
# creating models that we want to test to see which is most optimized
# we will use this to test different X_train variations
ensemble_models = {
    "RandomForest": {
        "Estimator": RandomForestClassifier(),
        },
    "ADABoost": {
        "Estimator": AdaBoostClassifier(),
        }
}

In [126]:
# test performance of different models using X_train
test_models_performance(ensemble_models, X_cat_train, y_cat_train, isRegressor=False)


[MODEL TYPE: RandomForest]

>>>> Top Performance: 		0.8648
>>>> Average Performance: 	0.8573
>>>> Spread of Performance: 	0.0047

[MODEL TYPE: ADABoost]

>>>> Top Performance: 		0.6450
>>>> Average Performance: 	0.6166
>>>> Spread of Performance: 	0.0212


Random forest has the best average performance, even better than our Decision Tree! Let's create an instance of the Random Forest.

In [127]:
rf_model = RandomForestClassifier()

y_pred = fit_predict(rf_model, X_cat_train, y_cat_train, X_cat_test, y_cat_test, False)

> ACCURACY: 	85.69%


In [128]:
# Check paramters of this random forest
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# Accuracy to beat: 85.30%

# Summary

The Random Forest was the model that performed the best with our movie data. We can see an 85.30% accuracy of predicting whether a movie will have low, medium, or high revenue based on the selected features.

Moving forward, we can do more hyperparameter tuning to improve the model even more. We can perform a selective GridSearchCV to understand what hyperparameters would be better. We could also do more feature selection to choose the most significant features and combine or drop features that aren't as significant.