# Modeling profitability (binary classification)

## Imports

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
import itertools

# Raise pandas' display limits (up to 100 rows, 100 columns, 100 characters
# per line) so the wide feature table is easier to inspect in the notebook.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

In [None]:
# Load the modeling dataset (clean_data_v2.csv) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where that exact directory exists; consider a repo-relative path.
df = pd.read_csv('/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/clean_data_v2.csv')

## Dropping non-numerical and unnecessary cols

In [None]:
# Preview the first rows to see which columns are available before choosing
# the feature columns.
df.head()

In [None]:
# Feature columns used for modeling. Competitor metrics are deliberately
# left out, for three reasons:
#   1. It should make the model easier to use: many end users who want to
#      predict their film's potential profitability would struggle to guess
#      competitor metrics to feed into the model.
#   2. In EDA, the relationship between competition and profit looked
#      random/non-existent.
#   3. When running logistic regression, including competitor metrics
#      actually LOWERED accuracy and did not meaningfully improve fallout.

cols_X = ['runtime', 'releases', 'cast_rev', 'cast_prof', 'cast_films', 'cast_prof_films', 'dir_rev',
       'dir_prof', 'dir_films', 'dir_prof_films', 'writ_rev', 'writ_prof', 'writ_films',
       'writ_prof_films', 'adj_budget', 'cast_dir_avg_rev', 
       'fall', 'spring', 'summer', 'winter', 'Action', 'Adventure', 'Animation',
       'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'None', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']

### Competitor metrics column names, kept for reference. To include them,
### append the following to the end of the cols_X list above:
# , 'compet_cast_rev', 'compet_cast_prof', 'compet_cast_films',
#        'compet_cast_prof_films', 'compet_dir_rev', 'compet_dir_prof', 'compet_dir_films',
#        'compet_dir_prof_films', 'compet_writ_rev', 'compet_writ_prof', 'compet_writ_films',
#        'compet_writ_prof_films']

In [None]:
# Name of the target column, kept in a one-element list so it can be used for
# column selection in the same way as cols_X.
col_y = ['made_money']

In [None]:
# Feature matrix: the selected predictor columns as a NumPy array.
X = df[cols_X].values

# Target vector. Selecting with a list of column names yields a 2-D,
# single-column array, so it is flattened to 1-D here; otherwise element-wise
# checks such as np.mean(y_pred == y_test) do not work properly later on.
y = df[col_y].values.ravel()

In [None]:
# Sanity check: shape of the feature matrix, then of the target vector,
# one per line.
print(X.shape, y.shape, sep='\n')

### Scaled data for use with logistic regression and SVM:

In [None]:
# Standardize the feature columns. The fitted scaler object is kept around
# under the name `scaler`.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split is made — confirm that this is acceptable for the evaluation below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Functions to show confusion matrix

#### source: https://www.kaggle.com/dstuerzer/optimized-logistic-regression

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum, so that both the
        colours and the cell annotations show fractions per true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The figure is displayed with `plt.show()`.
    """
    cm = np.asarray(cm)

    if normalize:
        # Normalize BEFORE drawing, so that the heat-map colours, the text
        # colour threshold and the cell annotations all refer to the same
        # numbers. Rows whose sum is zero stay all-zero instead of turning
        # into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")

    # Integer counts are printed as-is; fractions are rounded to 2 decimals
    # so the annotations stay readable inside the cells.
    cell_fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint of the colour range get white text,
    # lighter cells get black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the axis labels are set so they are included in the layout.
    plt.tight_layout()
    plt.show()

def show_data(cm, print_res = 0):
    """
    Derive summary metrics from a 2x2 confusion matrix.

    `cm` is indexed as cm[true, predicted] with class 1 as the positive
    class, i.e. cm[1,1] are the true positives and cm[0,1] the false
    positives.

    Parameters
    ----------
    cm : 2-D array supporting cm[i, j] indexing
    print_res : set to 1 to also print the four metrics

    Returns
    -------
    (accuracy, precision, recall, fallout) as a tuple.
    """
    tn, fp = cm[0,0], cm[0,1]
    fn, tp = cm[1,0], cm[1,1]

    accuracy = (tp+tn)/(tp+fp+tn+fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)       # true positive rate
    fallout = fp/(fp+tn)      # false positive rate

    if print_res == 1:
        report = (
            ('Accuracy =     {:.3f}', accuracy),
            ('Precision =     {:.3f}', precision),
            ('Recall (TPR) =  {:.3f}', recall),
            ('Fallout (FPR) = {:.3f}', fallout),
        )
        for template, value in report:
            print(template.format(value))

    return accuracy, precision, recall, fallout

## Logistic regression

#### Data is imbalanced (has mostly profitable films), so using stratified train-test split

In [None]:
# Hold out one stratified fold as the test set; the remaining four folds form
# the training set. The fixed random_state makes the split reproducible.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=313)

# Only the first of the five splits is needed.
train_index, test_index = next(skf.split(X_scaled, y))

# Take the rows from X_scaled (not from the raw X): this split feeds the
# logistic regression and SVM models, which are meant to be trained on the
# standardized features.
X_train, y_train = X_scaled[train_index], y[train_index]
X_test, y_test = X_scaled[test_index], y[test_index]

#### Training the model

In [None]:
# L1-penalised logistic regression on the held-out split.
# The solver is named explicitly: the L1 penalty is not supported by every
# solver, so relying on the library's default solver makes this cell depend
# on the installed scikit-learn version. 'liblinear' supports L1.
lrn = LogisticRegression(penalty='l1', solver='liblinear')  #l1 penalty performs MUCH better than l2 penalty

lrn.fit(X_train, y_train)
y_pred = lrn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# show_data expects class 1 in the second row/column; flip the matrix if the
# model lists class 1 first.
if lrn.classes_[0] == 1:
    cm = np.array([[cm[1,1], cm[1,0]], [cm[0,1], cm[0,0]]])

plot_confusion_matrix(cm, ['0', '1'], )
acc, pr, tpr, fpr = show_data(cm, print_res = 1)

#### Accuracy is the percentage of correct predictions (classifications) made by the model.
#### Precision is the probability that a film that is classified as profitable actually made money.
#### Recall (True Positive Rate) is the probability that a profitable film is classified as such.
#### Fallout (False Positive Rate) is the probability that an unprofitable film is wrongly classified as profitable. **Really important to minimize fallout if we don't want to make unprofitable movies.**

In [None]:
# Cross-check: the fraction of test rows whose prediction equals the true
# label should match the accuracy reported by show_data.
np.mean(y_pred == y_test) #accuracy checks out

### Parameter tuning - looking at ROC curves & confusion matrices

#### Functions to plot ROC. Source: https://www.kaggle.com/dstuerzer/optimized-logistic-regression

In [None]:
def ROC(X, y, c, r, n_splits=5, n_repeats=3):
    """
    Cross-validated ROC curve for an L2-penalised logistic regression.

    The model is fitted on every fold of `n_repeats` independently shuffled
    stratified `n_splits`-fold splits; the per-fold ROC curves are
    interpolated onto a common FPR grid and averaged.

    Parameters
    ----------
    X : 2-D array of features, indexable by integer index arrays
    y : 1-D array of 0/1 labels (np.sum(y) is used as the positive count)
    c : value passed to LogisticRegression as C
    r : ratio parameter used to build the class weights
    n_splits : number of folds per repetition (default 5)
    n_repeats : number of shuffled repetitions (default 3)

    Returns
    -------
    mean_fpr : the common FPR grid (50000 points from 0 to 1)
    mean_tpr : TPR averaged over all fits, pinned to 0 at the first grid
        point and to 1 at the last
    auc : ROC AUC averaged over all fits
    mean_thresh : decision thresholds interpolated onto the grid and averaged
    """
    # NOTE(review): the weight for class 0 becomes negative (or the division
    # fails) once r*sum(y) >= len(y) — confirm the r values that are passed in.
    n_pos = np.sum(y)
    dic_weight = {1:len(y)/(r*n_pos), 0:len(y)/(len(y)-r*n_pos)}
    #unfortunately this takes too long to run with l1 penalty, so looking at l2
    lrn = LogisticRegression(penalty = 'l2', C = c, class_weight = dic_weight,
                             solver='liblinear') #specifying the solver to avoid a zillion warnings printing

    mean_fpr = np.linspace(0, 1, 50000)
    mean_tpr = np.zeros_like(mean_fpr)
    mean_thresh = np.zeros_like(mean_fpr)
    aucs = []

    for _ in range(n_repeats):
        # No random_state: every repetition uses a different shuffle.
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True)
        for train_index, test_index in skf.split(X, y):
            lrn.fit(X[train_index], y[train_index])
            y_test = y[test_index]
            # Column 1 of predict_proba belongs to lrn.classes_[1]; select it
            # by position rather than by using the class label as an index.
            y_prob = lrn.predict_proba(X[test_index])[:, 1]

            fpr, tpr, thresholds = roc_curve(y_test, y_prob)
            mean_tpr += np.interp(mean_fpr, fpr, tpr)
            mean_thresh += np.interp(mean_fpr, fpr, thresholds)
            # Collect the AUC of every fit so the returned value is an
            # average, like the curve itself, and not just the last fold's.
            aucs.append(roc_auc_score(y_test, y_prob))

    n_fits = n_splits * n_repeats
    mean_tpr /= n_fits
    mean_thresh /= n_fits
    # Pin the end points of the averaged curve to (0, 0) and (1, 1).
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    return mean_fpr, mean_tpr, np.mean(aucs), mean_thresh

def plot_roc(X,y, list_par_1, par_1 = 'C', par_2 = 1):
    """
    Plot one cross-validated ROC curve per value in `list_par_1`.

    Parameters
    ----------
    X, y : data passed straight through to ROC()
    list_par_1 : values to sweep over
    par_1 : 'C' to sweep the C parameter; any other value sweeps r instead
    par_2 : fixed value used for the parameter that is NOT being swept

    Each curve's legend entry shows C, r, the TPR at index 10 of the curve
    returned by ROC(), and the AUC returned by ROC(). A dashed vertical line
    marks FPR = 0.2.
    """
    desired_fpr = 0.2
    sweeping_c = par_1 == 'C'

    plt.figure(figsize = (9,6))
    for value in list_par_1:
        # The swept parameter takes `value`; the other one is pinned to par_2.
        c, r = (value, par_2) if sweeping_c else (par_2, value)
        fpr_grid, tpr_curve, auc, _ = ROC(X, y, c, r)
        legend_entry = 'C = {}, r = {}, TPR(3e-4) = {:.4f}, AUC = {:.4f}'.format(
            c, r, tpr_curve[10], auc)
        plt.plot(fpr_grid, tpr_curve, label = legend_entry)

    plt.legend(title = 'values', loc='lower right')
    # Show the full [0, 1] range on both axes.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC detail')
    # Dashed line showing where the desired FPR occurs.
    plt.axvline(desired_fpr, color='b', linestyle='dashed', linewidth=2)
    plt.show()
    plt.close()

#### Casual/visual grid search 1: Look for optimal class_weight, 'r'

In [None]:
# Sweep the class-weight ratio r over several orders of magnitude, with C
# held at 1.
# NOTE(review): this passes the unscaled X rather than X_scaled — confirm
# that is intended for logistic regression.
plot_roc(X,y, [10, 1,.1,.01,.001], 'r', 1)
# Observation: r = 1 (the default) looks best.
# I think this makes sense because a stratified train-test split was used
# earlier.

#### Casual/visual grid search 2: Look for optimal 'C' (inverse regularization strength)

In [None]:
# Sweep C over several orders of magnitude, with r held at 1.
# NOTE(review): this passes the unscaled X rather than X_scaled — confirm
# that is intended for logistic regression.
plot_roc(X,y, [0.001, 0.01, 0.1, 1, 10], 'C', 1)
# Observation: the differences are so small that the choice hardly matters.
# Running this repeatedly shows no pattern of one C value standing out,
# so we'll stick with the default C = 1.0.

#### Casual/visual grid search 3: checking different decision boundaries - 60, 65, 70, 80%

In [None]:
# Decision boundary at 0.6: predict positive only when the probability in
# column 1 of predict_proba reaches the threshold.
prob_pos = lrn.predict_proba(X_test)[:,1]
y_pred = prob_pos >= 0.6   # already a boolean array

cm = confusion_matrix(y_test, y_pred)
# show_data expects class 1 in the second row/column; reverse both axes if
# the model lists class 1 first.
if lrn.classes_[0] == 1:
    cm = cm[::-1, ::-1].copy()

plot_confusion_matrix(cm, ['0', '1'])
acc, pr, tpr, fpr = show_data(cm, print_res = 1)

In [None]:
# Decision boundary at 0.7: predict positive only when the probability in
# column 1 of predict_proba reaches the threshold.
prob_pos = lrn.predict_proba(X_test)[:,1]
y_pred = prob_pos >= 0.7   # already a boolean array

cm = confusion_matrix(y_test, y_pred)
# show_data expects class 1 in the second row/column; reverse both axes if
# the model lists class 1 first.
if lrn.classes_[0] == 1:
    cm = cm[::-1, ::-1].copy()

plot_confusion_matrix(cm, ['0', '1'])
acc, pr, tpr, fpr = show_data(cm, print_res = 1)

In [None]:
# Decision boundary at 0.8: predict positive only when the probability in
# column 1 of predict_proba reaches the threshold.
prob_pos = lrn.predict_proba(X_test)[:,1]
y_pred = prob_pos >= 0.8   # already a boolean array

cm = confusion_matrix(y_test, y_pred)
# show_data expects class 1 in the second row/column; reverse both axes if
# the model lists class 1 first.
if lrn.classes_[0] == 1:
    cm = cm[::-1, ::-1].copy()

plot_confusion_matrix(cm, ['0', '1'])
acc, pr, tpr, fpr = show_data(cm, print_res = 1)

In [None]:
# Decision boundary at 0.65: predict positive only when the probability in
# column 1 of predict_proba reaches the threshold.
prob_pos = lrn.predict_proba(X_test)[:,1]
y_pred = prob_pos >= 0.65   # already a boolean array

cm = confusion_matrix(y_test, y_pred)
# show_data expects class 1 in the second row/column; reverse both axes if
# the model lists class 1 first.
if lrn.classes_[0] == 1:
    cm = cm[::-1, ::-1].copy()

plot_confusion_matrix(cm, ['0', '1'])
acc, pr, tpr, fpr = show_data(cm, print_res = 1)

#### **Hard to say which decision boundary is optimal. We want to minimize FPR, but accuracy is getting trashed at higher thresholds. Time to try other models: SVM, RF, GB...**

## Support Vector Classification

In [None]:
# Support vector classifier with default settings, evaluated on the same
# train/test split as the logistic regression.
svc = SVC()  #tried all kernels, SVC just doesn't do well here

svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Check the label order of the model fitted in THIS cell. The check used to
# read the logistic regression's classes_, which made the cell depend on an
# unrelated model having been fitted earlier.
if svc.classes_[0] == 1:
    cm = np.array([[cm[1,1], cm[1,0]], [cm[0,1], cm[0,0]]])

plot_confusion_matrix(cm, ['0', '1'], )
acc, pr, tpr, fpr = show_data(cm, print_res = 1)

## Random Forest

In [None]:
# using unscaled data (seems to do slightly better than with scaled data):

# Same stratified split settings as before, but rows are taken from the raw X.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=313)
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    break   # only the first split is used

# A random forest is not seeded here, so each fit gives slightly different
# results; fit 100 forests on the same split and average the metrics.
accs = []
prs = []
tprs = []
fprs = []
for i in range(100):
    rf = RandomForestClassifier() #

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # Check the label order of the forest itself (this used to read the
    # logistic regression's classes_).
    if rf.classes_[0] == 1:
        cm = np.array([[cm[1,1], cm[1,0]], [cm[0,1], cm[0,0]]])

#     plot_confusion_matrix(cm, ['0', '1'], )
    acc, pr, tpr, fpr = show_data(cm, print_res = 0)

    accs.append(acc)
    prs.append(pr)
    tprs.append(tpr)
    # Append this run's fallout value. Previously the list was appended to
    # itself (fprs.append(fprs)), so no fallout value was ever recorded and
    # the average below could not be computed correctly.
    fprs.append(fpr)

print("avg accuracy: ", np.mean(accs))
print("avg precision: ", np.mean(prs))
print("avg recall: ", np.mean(tprs))
print("avg fallout: ", np.mean(fprs))

## sklearn Gradient Boosting

In [None]:
# using unscaled data (seems to make no difference scaled vs. unscaled):

# Same stratified split settings as before, with rows taken from the raw X.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=313)
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    break   # only the first split is used

gb = GradientBoostingClassifier() #

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Check the label order of the model fitted in THIS cell (this used to read
# the logistic regression's classes_).
if gb.classes_[0] == 1:
    cm = np.array([[cm[1,1], cm[1,0]], [cm[0,1], cm[0,0]]])

plot_confusion_matrix(cm, ['0', '1'], )
acc, pr, tpr, fpr = show_data(cm, print_res = 1)