# Prediction Problems

## Loading the dataset

In [97]:
import os

import pandas as pd

import holcrawl.shared

In [2]:
# Resolve the directory holding the crawled dataset. The helper is
# underscore-prefixed, i.e. a non-public function of holcrawl.shared.
dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [3]:
# Full path of the movies CSV inside the dataset directory.
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')

In [4]:
# Load the movies CSV into a DataFrame; every later cell derives from `df`.
df = pd.read_csv(dataset_path)

## Feature Generation

In [5]:
# Product of the `avg_screens` and `num_weekends` columns.
# NOTE(review): 'total_screens' is not listed in FEAT_TO_KEEP, so it is dropped
# before modelling and never reaches either model.
df['total_screens'] = df['avg_screens'] * df['num_weekends'] 

In [6]:
# Gross income divided by budget. Used later both as the regression target and,
# thresholded at 1, as the "failed" classification label.
# NOTE(review): a zero budget would produce inf/NaN here — confirm such rows
# cannot occur in the CSV.
df['norm_gross'] = df['gross_income'] / df['budget']

In [72]:
# Lower-cased first character of each movie name (one-hot encoded further down).
df['starting_letter'] = [title[0].lower() for title in df['name']]

In [73]:
# Number of characters in each movie name.
# The column name is spelled 'name_lenth' (sic); FEAT_TO_KEEP refers to it by
# that exact spelling, so the two must stay in sync.
df['name_lenth'] = df['name'].map(len)

In [74]:
# df['opening_weekend_date']

In [75]:
# Columns of `df` retained as model features (genre columns are appended in the
# next cell). 'avg_mc_user_by_opening' is deliberately left out for now.
FEAT_TO_KEEP = [
    'duration',
    'starting_letter',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'name_lenth',
]

In [76]:
# Also keep every column whose name contains 'genres' (the genre indicator columns).
FEAT_TO_KEEP.extend(col for col in df.columns if 'genres' in col)

In [77]:
# Build the modelling table: a copy of `df` without any column that is not a
# selected feature. Row index is left untouched.
unwanted_cols = [col for col in df.columns if col not in FEAT_TO_KEEP]
dataset = df.drop(unwanted_cols, axis=1)

In [78]:
# One-hot encode the starting letter into 'fl_<char>' indicator columns.
# drop_first=True omits the first level, giving k-1 columns for k distinct letters.
letter_dummies = pd.get_dummies(dataset['starting_letter'], drop_first=True, prefix='fl')

In [79]:
# Attach every indicator column from `letter_dummies` to the feature table.
dataset = dataset.assign(**{col: letter_dummies[col] for col in letter_dummies.columns})

In [80]:
# The raw letter column is now redundant with its 'fl_*' indicators; remove it
# so only numeric features remain from this encoding.
dataset = dataset.drop('starting_letter', axis=1)

In [81]:
# Discard every row that has a missing value in any feature. The surviving rows
# keep their original index labels, which later cells use to pull the matching
# targets out of `df`.
dataset = dataset.dropna(axis=0)

In [None]:
# pd.options.display.max_columns = 999
# dataset

## Prediction

In [106]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from scipy.special import expit

### Predicting failed movies

In [99]:
# Boolean target: a movie "failed" when its gross/budget ratio is below 1.
# Rows are selected by label with `.loc` so the target lines up with the rows
# that survived `dropna` in `dataset`. (`.ix` is deprecated and no longer
# exists in pandas >= 1.0.)
failed = df['norm_gross'].loc[dataset.index] < 1

In [100]:
# Classification setup: feature table as X, boolean "failed" label as Y.
X = dataset
Y = failed

In [101]:
# Hold out 33% of the rows for evaluation; random_state is fixed so the split
# is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [102]:
# Logistic regression with scikit-learn's default hyper-parameters, fit on the
# training split.
# NOTE(review): features are passed unscaled (e.g. raw `budget` next to 0/1
# indicators) — consider standardising before fitting a regularised model.
logreg = linear_model.LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [103]:
# Classifier `score` is the mean accuracy on the held-out split.
mean_accuracy = logreg.score(X_test, y_test)
mean_accuracy

0.57446808510638303

In [108]:
# Hard class predictions for the held-out rows.
y_predict = logreg.predict(X_test)
# Raw decision-function scores (signed distance from the decision boundary).
confidence = logreg.decision_function(X_test)
# The same scores squashed through the logistic sigmoid into (0, 1).
expit_confidence = expit(confidence)
# Predicted probability of the second class (column 1 of predict_proba).
probablility = logreg.predict_proba(X_test)[:, 1]
# One fitted weight per feature, labelled with the feature names.
coef_df = pd.DataFrame(logreg.coef_[0], index=X_train.columns, columns=['coef'])

### Predicting return on investment

In [117]:
# Display the feature table.
# NOTE(review): the recorded output below is a column Index, which is what
# `dataset.columns` would display — the cell source may have been edited after
# it was last run; confirm which was intended.
dataset

Index(['budget', 'duration', 'year', 'genres.action', 'genres.adventure',
       'genres.animation', 'genres.biography', 'genres.comedy', 'genres.crime',
       'genres.documentary', 'genres.drama', 'genres.family', 'genres.fantasy',
       'genres.history', 'genres.horror', 'genres.music', 'genres.musical',
       'genres.mystery', 'genres.romance', 'genres.sci-fi', 'genres.sport',
       'genres.thriller', 'genres.war', 'genres.western',
       'avg_mc_critic_by_opening', 'opening_month', 'opening_day',
       'opening_day_of_year', 'name_lenth', 'fl_2', 'fl_3', 'fl_4', 'fl_5',
       'fl_a', 'fl_b', 'fl_c', 'fl_d', 'fl_e', 'fl_f', 'fl_g', 'fl_h', 'fl_i',
       'fl_j', 'fl_k', 'fl_l', 'fl_m', 'fl_n', 'fl_o', 'fl_p', 'fl_r', 'fl_s',
       'fl_t', 'fl_u', 'fl_v', 'fl_w'],
      dtype='object')

In [109]:
# Regression setup: same feature table, with the continuous gross/budget ratio
# as the target. Rows are selected by label with `.loc` so the target lines up
# with the rows kept in `dataset`. (`.ix` is deprecated and no longer exists in
# pandas >= 1.0.)
X = dataset
Y = df['norm_gross'].loc[dataset.index]

In [110]:
# Same 33% hold-out and seed as the classification split, now with the
# continuous target.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [112]:
# Ordinary least-squares regression with default settings, fit on the training split.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [115]:
# One fitted weight per feature, labelled with the feature names.
# For a 1-D target LinearRegression stores `coef_` as a 1-D array of shape
# (n_features,), so the whole array is used. Indexing it with `[0]` would yield
# a single scalar that pandas broadcasts to every row.
coef_df = pd.DataFrame({'coef': regr.coef_}, index=X_train.columns)
coef_df

In [116]:
# Regressor `score` is the coefficient of determination (R^2) on the held-out
# split. A negative value means the model does worse than always predicting
# the mean of y_test.
regr.score(X_test, y_test) 

-0.93319393276110452