# Modeling profitability (binary classification)

## Imports

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Widen pandas' display limits so wide frames (this dataset has many
# columns) are shown in full instead of being truncated with "...".
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 100

In [2]:
# Load the cleaned film dataset produced by the earlier cleaning step.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run elsewhere without editing it.
# NOTE(review): the loaded frame appears to carry 'Unnamed: 0' /
# 'Unnamed: 0.1' columns, presumably indexes written out by earlier
# to_csv calls -- confirm and consider dropping them upstream.
df = pd.read_csv(
    '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/clean_data_v2.csv'
)

## Dropping non-numerical and unnecessary columns

In [3]:
# Preview the first five rows to check which columns are available.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,budget,id,release_date,revenue,runtime,title,genre,profit,made_money,cast_0,cast_1,cast_2,cast_3,cast_4,director,writer,releases,cast_rev,cast_prof,cast_films,cast_prof_films,dir_rev,dir_prof,dir_films,dir_prof_films,writ_rev,writ_prof,writ_films,writ_prof_films,compet_cast_rev,compet_cast_prof,compet_cast_films,compet_cast_prof_films,compet_dir_rev,compet_dir_prof,compet_dir_films,compet_dir_prof_films,compet_writ_rev,compet_writ_prof,compet_writ_films,compet_writ_prof_films,adj_budget,cast_dir_avg_rev,month,year,decade,season,fall,spring,summer,winter,Action,Adventure,Animation,Comedy,Crime,Drama,Family,Fantasy,History,Horror,Music,Mystery,None,Romance,Science Fiction,Thriller,War,Western
0,0,4000000.0,5,1995-12-09,4300000.0,98.0,Four Rooms,Crime,300000.0,1,Tim Roth,Antonio Banderas,Jennifer Beals,Madonna,Marisa Tomei,Allison Anders,Allison Anders,30,52386490.0,35736490.0,9.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101749800.0,75626090.0,15.166667,9.333333,40462120.0,26476010.0,1.833333,1.0,17444690.0,8161356.0,1.333333,0.5,22193250.0,26193250.0,12,1995,1990,winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,11000000.0,11,1977-05-25,775398007.0,121.0,Star Wars,Adventure,764398007.0,1,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Mayhew,Anthony Daniels,George Lucas,George Lucas,67,4420000.0,2820000.0,1.0,1.0,71218500.0,70441500.0,2.0,2.0,71218500.0,70441500.0,2.0,2.0,18184020.0,14196520.0,5.0,4.0,0.0,0.0,0.0,0.0,15473330.0,-4526667.0,1.0,0.0,26819250.0,37819250.0,5,1977,1970,summer,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,94000000.0,12,2003-05-30,940335536.0,100.0,Finding Nemo,Animation,846335536.0,1,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,Brad Garrett,Andrew Stanton,Andrew Stanton,62,69915280.0,43896050.0,15.0,9.0,0.0,0.0,0.0,0.0,474480700.0,366147328.0,3.0,3.0,72575310.0,37616630.0,13.6,9.8,35387790.0,22661130.0,1.4,1.2,34690140.0,25801640.0,0.7,0.7,-59042360.0,34957640.0,5,2003,2000,summer,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,55000000.0,13,1994-07-06,677945399.0,142.0,Forrest Gump,Comedy,622945399.0,1,Tom Hanks,Robin Wright,Gary Sinise,Mykelti Williamson,Sally Field,Robert Zemeckis,Eric Roth,39,50778680.0,37076600.0,14.0,12.0,192273625.0,155107000.0,6.0,6.0,0.0,0.0,0.0,0.0,78070740.0,60592120.0,13.142857,10.857143,44212140.0,32179990.0,1.142857,1.142857,114531700.0,97073390.0,1.571429,1.571429,66526150.0,121526200.0,7,1994,1990,summer,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,15000000.0,14,1999-09-15,356296601.0,122.0,American Beauty,Drama,341296601.0,1,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,Mena Suvari,Sam Mendes,Alan Ball,54,57766900.0,20692830.0,16.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68815500.0,40229400.0,11.666667,8.166667,19884730.0,8995846.0,0.5,0.333333,77657960.0,63345460.0,0.666667,0.583333,13883450.0,28883450.0,9,1999,1990,fall,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Feature columns for the first modelling pass.
#
# Competitor ("compet_*") metrics are deliberately left out for now:
#   * end users who want to predict their own film's profitability would
#     probably find it difficult to come up with competitor metrics;
#   * in EDA the relationship between competition and profit looked
#     random / non-existent.
cols_X = (
    ['runtime', 'releases']
    # <role>_rev, <role>_prof, <role>_films, <role>_prof_films for each of
    # cast, director and writer (order matters: it fixes the columns of X).
    + [role + '_' + stat
       for role in ('cast', 'dir', 'writ')
       for stat in ('rev', 'prof', 'films', 'prof_films')]
    + ['adj_budget', 'cast_dir_avg_rev']
    # release-season indicator columns
    + ['fall', 'spring', 'summer', 'winter']
    # genre indicator columns ('None' is a literal column name in the data)
    + ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'None', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
)
# Competitor columns currently excluded (append to cols_X to include them):
#   'compet_cast_rev', 'compet_cast_prof', 'compet_cast_films',
#   'compet_cast_prof_films', 'compet_dir_rev', 'compet_dir_prof',
#   'compet_dir_films', 'compet_dir_prof_films', 'compet_writ_rev',
#   'compet_writ_prof', 'compet_writ_films', 'compet_writ_prof_films'

In [5]:
# Target column for the classifier. Kept as a one-element list, so
# selecting it from the frame yields a single-column DataFrame.
col_y = [
    'made_money',
]

In [31]:
# Pull the feature matrix and the target out of the frame as NumPy arrays.
X = df[cols_X].values
# NOTE(review): col_y is a list, so y comes out as a column vector of
# shape (n_rows, 1) rather than a flat (n_rows,) array -- confirm that
# downstream estimators are happy with that shape.
y = df[col_y].values

## Train-test split

#### The data is imbalanced (most movies are profitable), so we use a stratified train-test split

In [39]:
# Build a single stratified train/test split by taking only the first of
# the five shuffled folds; the remaining folds are never generated.
# The fixed random_state keeps the split reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=313)
train_index, test_index = next(skf.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]