## Cleaning Data

In [1]:
## importing the dataframe
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/scottwmwork/datasets/master/tmdb_5000_movies.csv')

#Create release_year feature out of release date column.
#infer_datetime_format is not passed: it is deprecated since pandas 2.0, where
#consistent-format inference is already the default behaviour of to_datetime.
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
#The raw date column is no longer needed once the year has been extracted
df = df.drop(columns = 'release_date')

## Wrangle Data

In [2]:
### Wrangle Data function ###
import ast
import numpy as np

def _first_genre(raw):
  """Return the name of the first genre in a serialized genre list, or NaN.

  raw is expected to be the string form of a list of dictionaries that each
  carry a 'name' key. A value that is not a string (for example a missing
  value) or that parses to an empty list yields np.nan.
  """
  if not isinstance(raw, str):
    return np.nan
  parsed = ast.literal_eval(raw)
  if not parsed:
    return np.nan
  return parsed[0]['name'] #grabs first genre in list of dictionaries


def wrangle(X):
  """Engineer title/genre features and drop the columns the model must not see.

  Parameters
  ----------
  X : pandas.DataFrame
      Must contain every column named in the drop list below, including
      'genres', 'title' and 'original_title'. The input is not modified.

  Returns
  -------
  pandas.DataFrame
      A copy of X with a fresh 0..n-1 index, the listed columns removed and
      three columns appended:
      title_changed      - 0 when title equals original_title, otherwise 1
      length_of_title    - number of characters in title (NaN if title is missing)
      genre_first_listed - name of the first listed genre (NaN if none)

  Raises
  ------
  KeyError
      If one of the expected columns is absent from X.
  """
  #Work on a copy with a clean positional index; the old index is discarded
  X = X.copy().reset_index(drop = True)

  #Engineer features:

  #original title is same as title? (a missing title counts as changed)
  X['title_changed'] = (X['title'] != X['original_title']).astype('int64')

  #length of title; missing titles are skipped instead of crashing len()
  X['length_of_title'] = X['title'].map(len, na_action = 'ignore')

  #Make genres column usable
  X['genre_first_listed'] = [_first_genre(raw) for raw in X['genres']]

  #Features to not include:
  X = X.drop(columns = ['genres','homepage','keywords','overview','production_companies','production_countries',
                        'spoken_languages','tagline','popularity','id','vote_average','vote_count','original_title','title'])
  return X

## Split Train/Test/Validate Sets

In [None]:
#Hold out every 2016 release as the test set: features in X_test, target in y_test
test = df[df['release_year'] == 2016]
X_test, y_test = test.drop(columns = 'revenue'), test['revenue']

#Every row that is not a 2016 release stays available for training/validation
dfn = df[~(df['release_year'] == 2016)]

In [None]:
#Keep only the non-2016 rows (same filter the previous cell already applies to df)
dfn = df.loc[df['release_year'] != 2016]

In [None]:
#Create train and validation data 
from sklearn.model_selection import train_test_split
#80/20 split of the non-2016 rows; the fixed random_state keeps the split repeatable
train, val = train_test_split(dfn, test_size = 0.20, train_size = .80, random_state = 42)

#Separate the revenue target from the feature columns for each split
X_train, y_train = train.drop(columns = 'revenue'), train['revenue']
X_val, y_val = val.drop(columns = 'revenue'), val['revenue']

In [3]:
#Wrangle data
#Each feature frame is rebuilt from its untouched source split (test/val/train)
#so this cell can be re-run safely: passing an already-wrangled X_* back into
#wrangle() fails because the columns it reads and drops are gone by then.
X_test = wrangle(test.drop(columns = 'revenue'))
X_val = wrangle(val.drop(columns = 'revenue'))
X_train = wrangle(train.drop(columns = 'revenue'))

## Pipeline and Model

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LinearRegression
import category_encoders as ce

#Model pipeline, applied in order: one-hot encode the categorical columns,
#impute missing values with SimpleImputer's default settings, fit a linear regression
linear_regression = make_pipeline(ce.OneHotEncoder(use_cat_names = True), SimpleImputer(), LinearRegression())

#Train on the wrangled training split; the fitted pipeline is shown as the cell output
linear_regression.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['original_language', 'status',
                                     'genre_first_listed'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use_cat_names=True, verbose=0)),
                ('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

## Pickle model

In [5]:
from joblib import dump
#Serialize the fitted pipeline to linear_regression.joblib in the working directory
dump(value = linear_regression, filename = 'linear_regression.joblib')

['linear_regression.joblib']

In [6]:
#The saved model file belongs in the 'assets' folder so the Plotly Dash app can load it (the dump call above writes it to the current working directory).

## Model Score (R²)

In [7]:
#Score of the fitted pipeline on the held-out 2016 test set
linear_regression.score(X = X_test, y = y_test)

0.6532395810416742

In [8]:
#Score of the fitted pipeline on the validation split
linear_regression.score(X = X_val, y = y_val)

0.5347328840360128