In [0]:
import sys
in_colab = 'google.colab' in sys.modules

if in_colab:
    # Install packages in Colab
    !pip install -r requirements.txt
    !pip install category_encoders==2.0.0
    !pip install pandas-profiling==2.3.0
    !pip install plotly==4.1.1

In [0]:
#importing the dataframe
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/scottwmwork/datasets/master/tmdb_5000_movies.csv')

In [0]:
df.shape

(4803, 20)

Since there are so many features and rows, there should be plenty of data!

## Cleaning Data

In [0]:
#Create columns out of release date column
df['release_date'] = pd.to_datetime(df['release_date'],infer_datetime_format = True)
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_day'] = df['release_date'].dt.month
df = df.drop(columns = 'release_date')

# Visuals

In [0]:
import plotly.express as px
px.scatter(df, x = 'budget', y ='revenue', color = 'revenue', trendline = 'ols', title = 'Revenue of Movies 1920-2017 based on budget')


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



In [0]:
#3d Graph
px.scatter_3d(df, x = 'release_year',y = 'budget', z = 'revenue', color = 'revenue',opacity=0.7,size_max=8, title = 'Revenue of Movies 1916-2017')

# Baseline Prediction

In [0]:
from sklearn.metrics import r2_score

y = round(df['revenue'].mean(),2)

y_pred_base = [y] * len(df['revenue'])
target = df['revenue']
r2_score(target,y_pred_base)


-2.220446049250313e-16

In [0]:
from sklearn.metrics import mean_absolute_error
baseline_mean_abs_err = mean_absolute_error(target,y_pred_base)
baseline_mean_abs_err 

96403855.60307099

# Split Train/Test/Validate Sets

In [0]:
#Isolate the test set
test = df[df['release_year'] == 2016]
y_test = test['revenue']
X_test = test.drop(columns = 'revenue')

#Exclude test set from data
dfn = df[df['release_year'] != 2016]

In [0]:
#Create train and validation data 
from sklearn.model_selection import train_test_split
train, val = train_test_split(dfn, train_size = .80, test_size = 0.20, random_state = 42)

y_train = train.revenue
X_train = train.drop(columns = 'revenue')

y_val = val.revenue
X_val = val.drop(columns = 'revenue')


In [0]:
import ast
import numpy as np

def wrangle(X):
  
  X = X.copy()
  X = X.reset_index()

  #Make genres column usable
  genre = []
  for x in X['genres']:
    if x == '[]':
      genre.append(np.nan)
    else:
      temp = ast.literal_eval(x) 
      genre.append(temp[0]['name']) #grabs first genre in list of dictionaries
    

  #Features to not include:
  X = X.drop(columns = ['genres','homepage','keywords','overview','production_companies','production_countries','spoken_languages','tagline'])
  
  #Engineer features:
  
  #original title is same as title?    
  title_changed = []
  for x in range(0,len(X['title'])):
    if X['title'][x] == X['original_title'][x]:
       title_changed.append(0)
    else:
       title_changed.append(1)
  
  #length of title
  length_of_title = []
  for x in X['title']:
    length_of_title.append(len(x))
  
  #Add features to dataframe
  X['title_changed'] = title_changed
  X['length_of_title'] = length_of_title
  X['genre_first_listed'] = genre
   
  
  return X

In [0]:
#Wrangle data
X_test = wrangle(X_test)
X_val = wrangle(X_val)
X_train = wrangle(X_train)

# Pipeline

In [0]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor


#Create pipelines for different regressors
linear_regression = make_pipeline(
  ce.OneHotEncoder(use_cat_names = True),
  SimpleImputer(),  
  LinearRegression()
)

logistic_regression = make_pipeline(
  ce.OneHotEncoder(use_cat_names = True),
  SimpleImputer(),
  LogisticRegression()
)

forest_regression = make_pipeline(
  ce.OneHotEncoder(use_cat_names = True),
  SimpleImputer(),  
  RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
)


#Fit Models!
linear_regression.fit(X_train,y_train)
logistic_regression.fit(X_train,y_train)
forest_regression.fit(X_train,y_train)







Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['original_language', 'original_title',
                                     'status', 'title', 'genre_first_listed'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use_cat_names=True, verbose=0)),
                ('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan,...
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                             

### Linear Regression Scores

In [0]:
val_linear_score = linear_regression.score(X_val,y_val)
test_linear_score = linear_regression.score(X_test,y_test)

print('Validation set score:', val_linear_score)
print('Test set score:', test_linear_score)

Validation set score: 0.7357524630762733
Test set score: 0.8391775092274408


### Logistic Regression Scores

In [0]:
val_logistic_score = logistic_regression.score(X_val,y_val)
test_logistic_score = logistic_regression.score(X_test,y_test)

print('Validation set score:', val_logistic_score)
print('Test set score:', test_logistic_score)

Validation set score: 0.30957446808510636
Test set score: 0.2980769230769231


### Random Forest Scores

In [0]:
val_forest_score = forest_regression.score(X_val,y_val)
test_forest_score = forest_regression.score(X_test,y_test)

print('Validation set score:', val_forest_score)
print('Test set score:', test_forest_score)

Validation set score: 0.759255104337667
Test set score: 0.8111075596191704


### ToDo: Find the partial dependence so that you can understand the importance of each column