# Exporting American Movie Box Office Hits 

### boxofficemojo.com regression model

### Model predicts a movie's international_total_gross revenue

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet
from sklearn.model_selection import (cross_val_score, train_test_split, KFold, GridSearchCV)
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline


## 1. [Scraped ](https://github.com/slp22/regression-project/blob/main/adaptation_movies_webscraping.ipynb) and [Cleaned](https://github.com/slp22/regression-project/blob/main/adaptation_movies_eda.ipynb) Movie Adaptations DataFrame

In [None]:
# Load the cleaned movie data from clean_df.csv and preview the first two rows.
movie_df = pd.read_csv('clean_df.csv')
movie_df.head(2)

In [None]:
# Summary statistics for movie_df; the note below records the observed range.
movie_df.describe()
# domestic_total_gross: min $742, max $543,638,043

## 2. Features Correlation

In [None]:
# pairplot
# Pairwise plots of the columns in movie_df; the trailing ';' hides the object repr.
sns.pairplot(movie_df, height=3, aspect=1.5);


In [None]:
# heatmap correlation matrix
# Restrict to numeric columns first: DataFrame.corr() no longer drops text
# columns silently (pandas >= 2.0 raises on them).
sns.heatmap(movie_df.select_dtypes(include="number").corr(),
            cmap="seismic", annot=True, vmin=-1, vmax=1);


### Correlation Summary

#### Correlation Features-Target
*target = `international_total_gross`*

* target correlated with (highest to lowest):
    * `domestic_total_gross`
    * `domestic_opening`
    * `budget`
    * `max_theaters`
    * `opening_theathers`
    
The target is highly correlated with `worldwide_total_gross`; this is a known dependency (multicollinearity) because:<br/>
`worldwide_total_gross` = `domestic_total_gross` + `international_total_gross`


####  Features-Features > Positive Correlation
* domestic_total_gross:
    * `domestic_opening`
    * `worldwide_total_gross`
    * `budget`
    * `max_theaters`
    * `opening_theathers`

* domestic_opening:
    * `budget`
    * `max_theaters`
    * `opening_theathers`

* max_theaters:
    * `opening_theathers`
    * `budget`
    * `domestic_opening`


####  Features-Features > Negative Correlation
* rank:
    * `domestic_total_gross`
    * `max_theaters`
    * `opening_theathers`
    * `domestic_opening`
    * `budget`
    


## #. Hold data for final testing

In [None]:
# Build the feature matrix and target from movie_df (the template's `cars` /
# 'price' names do not exist in this notebook).
# - only numeric columns can be fed to the regressions below
# - 'worldwide_total_gross' is dropped because it equals
#   domestic_total_gross + international_total_gross and would leak the target
X = (movie_df
     .select_dtypes(include='number')
     .drop(columns=['international_total_gross', 'worldwide_total_gross']))
y = movie_df['international_total_gross']

# hold out 20% of the data for final testing
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=10)

## #. Simple Linear Regression Model
**FEATURE:** `domestic_total_gross` <br/>
**TARGET:** `international_total_gross`

In [None]:
# 80/20 train-test split: single feature (domestic gross) -> target (international gross)
X_train, X_test, y_train, y_test = train_test_split(
    movie_df[['domestic_total_gross']],
    movie_df['international_total_gross'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares model on the training split, then report
# R^2 on both splits and the fitted coefficient(s).
movie_model = LinearRegression().fit(X_train, y_train)

print('train score:', round(movie_model.score(X_train, y_train), 3))
print('test score:', round(movie_model.score(X_test, y_test), 3))
print('\n')
print('m_1.coef', movie_model.coef_)

In [None]:
# ## FEATURE
# # 'domestic_total_gross'

# ## TARGET
# # 'international_total_gross'

# # y, X = patsy.dmatrices(' ~ domestic_total_gross + domestic_opening + budget + worldwide_total_gross', data=movie_df, return_type="dataframe")
# # y, X = patsy.dmatrices('international_total_gross ~ domestic_total_gross + domestic_opening + budget',                         data=movie_df, return_type="dataframe")
# # y, X = patsy.dmatrices('international_total_gross ~ domestic_total_gross + domestic_opening',                                  data=movie_df, return_type="dataframe")
# # y, X = patsy.dmatrices('international_total_gross ~ domestic_total_gross + budget',                                            data=movie_df, return_type="dataframe")
# y, X = patsy.dmatrices('international_total_gross ~ domestic_total_gross',                                                     data=movie_df, return_type="dataframe")

# # model
# model = sm.OLS(y, X)

# # fit model
# fit = model.fit()

# # model performance statistics
# fit.summary()



### Residuals Plot

In [None]:
# Residuals of international gross regressed on domestic gross.
# A cone shape indicates heteroskedasticity; likely due to the large range of
# domestic movie revenue.
f, ax = plt.subplots(figsize=(12, 8))
sns.residplot(
    data=movie_df,
    x='domestic_total_gross',
    y='international_total_gross',
    ax=ax,
);


## #. Dummy Variables

In [None]:
# get dummies (source: validation_workflow_and_utilities.ipynb)
# The template operated on a `cars` frame that does not exist in this notebook;
# apply the same one-hot encoding to movie_df instead. The result goes to a new
# name so movie_df itself stays unchanged for the cells that read it.
# NOTE(review): free-text columns (e.g. a title) would yield one dummy column per
# distinct value - drop such columns first; confirm against movie_df's columns.
movie_dummies_df = pd.get_dummies(movie_df, drop_first=True)  # can just apply it to the whole df
movie_dummies_df.head(3)


In [None]:
#validate
# Fits each of the three candidate models on its training matrix and prints the
# R^2 it achieves on the matching validation matrix.
# NOTE(review): lm, lm_reg, lm_poly and the *_scaled / *_poly matrices are created
# in the "Modeling" cell that comes AFTER this one, and X_val / y_val are not
# assigned anywhere in the visible notebook, so this cell cannot run
# top-to-bottom where it currently sits.

lm.fit(X_train, y_train)
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')

lm_reg.fit(X_train_scaled, y_train)
print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}')

lm_poly.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val):.3f}')

# Modeling

In [None]:
#set up the 3 models we're choosing from:
# This cell only constructs the estimators and the transformed feature matrices;
# nothing is fitted to the target here.
# NOTE(review): X_val is not assigned anywhere in the visible notebook - a
# train/validation split has to be created before this cell can run.

#1 simple lin reg
#2 ridge
#3 poly


#1 simple lin reg
lm = LinearRegression()

#Feature scaling for train, val, and test so that we can run our ridge model on each
scaler = StandardScaler()

# the scaler is fitted on the training rows only and then reused for val / test
X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

#2 ridge
lm_reg = Ridge(alpha=1)

#Feature transforms for train, val, and test so that we can run our poly model on each
poly = PolynomialFeatures(degree=2) 

# same pattern as the scaler: fit on train, apply to val / test
X_train_poly = poly.fit_transform(X_train.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test.values)

#3 poly
lm_poly = LinearRegression()



## #. Cross-Validation `KFold`

In [None]:
## K-fold, in a less manual way with sk-learn (source: validation_workflow_and_utilities.ipynb)

# The cross-validation and grid-search cells further down refer to the splitter
# as `kfold`, so define it under that name; `kf` is kept as an alias.
kfold = KFold(n_splits=5, shuffle=True, random_state=71)
kf = kfold

# cross-val-score
cross_val_score(lm, X, y, cv=kfold, scoring='r2')

In [None]:
# build model  (source: regression_lasso_solution.ipynb)
# Cross-validate a plain linear regression on the training split, report the
# per-fold and mean scores, then fit it on the full training split.
# NOTE(review): confirm `kfold` is defined before this cell runs.
lin_reg_est = LinearRegression()

scores = cross_val_score(estimator=lin_reg_est, X=X_train, y=y_train, cv=kfold)
print(scores)
print("Linear Reg Mean Score: ", scores.mean())

# Build the Model
lin_reg_est.fit(X_train, y_train)

In [None]:
# evaluate model  (source: regression_lasso_solution.ipynb)

# # Fitted vs. Actual
# y_train_pred = lin_reg_est.predict(X_train)

# plt.scatter(y_train, y_train_pred, alpha=0.2)
# plt.plot([0, 400], [0, 400])

In [None]:
# # Fitted vs. Actual  (source: regression_lasso_solution.ipynb)
# y_test_pred = lin_reg_est.predict(X_holdout)

# plt.scatter(y_holdout, y_test_pred)
# plt.plot([0, 400], [0, 400])

In [None]:
# # Plot Residuals vs. predicted  (source: regression_lasso_solution.ipynb)

# lin_reg_residuals = y_train - y_train_pred

# plt.scatter(y_train_pred, lin_reg_residuals)
# plt.plot([0,400], [0, 0])
# plt.title("Residuals vs. Predictions")

## #. Model Tuning 
`StandardScaler()` <br/>
Take log of some features <br/>
Elastic Net <br/>
`GridSearchCV()`


In [None]:
# print("Lasso Model:")  (source: regression_lasso_solution.ipynb)
# params = {
#     "alpha": np.logspace(-4, -.1, 20)
# }

# grid_est = GridSearchCV(Lasso(), param_grid=params, cv=kfold, 
#                         return_train_score=False)
# grid_est.fit(X_train, y_train)
# df = pd.DataFrame(grid_est.cv_results_)
# df["alpha"] = df.params.apply(lambda val: val["alpha"])
# plt.plot(np.log(df.alpha), df.mean_test_score);

In [None]:
# NOTE(review): `grid_est` is only assigned inside cells that are currently
# commented out, so this raises NameError unless one of them is re-enabled.
grid_est.best_estimator_

In [None]:
# NOTE(review): `params` is only assigned inside cells that are currently
# commented out, so this raises NameError unless one of them is re-enabled.
params['alpha']

In [None]:
# print("Ridge Model:") (source: regression_lasso_solution.ipynb)
# params = {
#     "alpha": np.logspace(-4, -.1, 20)
# }

# grid_est = GridSearchCV(Ridge(), param_grid=params, cv=kfold, 
#                         return_train_score=False)
# grid_est.fit(X_train, y_train)
# df = pd.DataFrame(grid_est.cv_results_)
# df["alpha"] = df.params.apply(lambda val: val["alpha"])
# plt.plot(np.log(df.alpha), df.mean_test_score);

In [None]:
def build_grid_search_est(model, X, y, cv=5, **params):
    """Grid-search ``model`` over ``params`` and plot the mean CV score.

    (source: regression_lasso_solution.ipynb)

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator handed to ``GridSearchCV``.
    X, y : array-like
        Features and target the search is fitted on.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``GridSearchCV``.
    **params
        One keyword per hyperparameter, each a sequence of candidate values,
        e.g. ``alpha=np.logspace(-4, -1, 30)``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # use the caller's `cv` rather than a notebook-level global
    grid_est = GridSearchCV(model, param_grid=params, cv=cv,
                            return_train_score=False)
    grid_est.fit(X, y)
    df = pd.DataFrame(grid_est.cv_results_)
    for param in params:
        # pull this hyperparameter's value out of each candidate's params dict
        df[param] = [candidate[param] for candidate in df["params"]]
        # plot against the parameter actually searched, not a hard-coded `alpha`
        plt.semilogx(df[param], df["mean_test_score"])
    return grid_est

In [None]:
# print("Lasso Grid Search") (source: regression_lasso_solution.ipynb)
# lasso_grid_est = build_grid_search_est(Lasso(), X_train, y_train, cv=kfold,
#                                        alpha=np.logspace(-4, -1, 30))

In [None]:
# print("Ridge Grid Search") (source: regression_lasso_solution.ipynb)
# ridge_grid_est = build_grid_search_est(Ridge(), X_train, y_train, cv=kfold,
#                                        alpha=np.logspace(-4, -1, 10))

In [None]:
# print("Elastic Net Grid Search") (source: regression_lasso_solution.ipynb)
# elastic_net_grid_est = build_grid_search_est(ElasticNet(), X_train, y_train, cv=kfold,
#                                              alpha=np.logspace(-4, 0.1, 10))

## Linear Regression Assumptions:

1. Remove multicollinearity
2. Transform some features
3. Look at QQ plots of residuals
4. Check for independence of errors
5. Check for heteroskedasticity in residuals!


## #. Features Engineering

In [None]:
# square x5
# NOTE(review): 'x5' is a template column name; the X_train built in the
# train-test split cell only contains 'domestic_total_gross' - replace 'x5'
# with a real column before running (TODO confirm).

# Add the engineered column BEFORE fitting, so the model is trained and scored
# on the same set of features (fitting first makes score() fail on the extra column).
X_train['x5^2'] = X_train['x5']**2
X_test['x5^2'] = X_test['x5']**2

m_1 = LinearRegression()
m_1.fit(X_train, y_train)

print('train score with x5^2:', round(m_1.score(X_train, y_train), 3))
print('test score with x5^2:', round(m_1.score(X_test, y_test), 3))
print('\n')
print('m_1.coef  with x5^2', m_1.coef_)

In [None]:
# interaction x2 - x3
# NOTE(review): 'x2' / 'x3' are template column names; the X_train built in the
# train-test split cell only contains 'domestic_total_gross' - replace them
# with real columns before running (TODO confirm).

# Add the engineered column BEFORE fitting, so the model is trained and scored
# on the same set of features (fitting first makes score() fail on the extra column).
X_train['x2_-_x3'] = (X_train['x2'] - X_train['x3'])
X_test['x2_-_x3'] = (X_test['x2'] - X_test['x3'])

m_1 = LinearRegression()
m_1.fit(X_train, y_train)

print('train score with x2_-_x3:', round(m_1.score(X_train, y_train), 3))
print('test score with x2_-_x3:', round(m_1.score(X_test, y_test), 3))
print('\n')
print('m_1.coef  with x2_-_x3', m_1.coef_)

In [None]:
# x1 * x2
# NOTE(review): 'x1' / 'x2' are template column names; the X_train built in the
# train-test split cell only contains 'domestic_total_gross' - replace them
# with real columns before running (TODO confirm).

# Add the engineered column BEFORE fitting, so the model is trained and scored
# on the same set of features (fitting first makes score() fail on the extra column).
X_train['x1_*_x2'] = (X_train['x1'] * X_train['x2'])
X_test['x1_*_x2'] = (X_test['x1'] * X_test['x2'])

m_3 = LinearRegression()
m_3.fit(X_train, y_train)

print('train score with x1_*_x2:', round(m_3.score(X_train, y_train), 3))
print('test score with x1_*_x2:', round(m_3.score(X_test, y_test), 3))
print('\n')
# label now names the model whose coefficients are printed
print('m_3.coef  with x1_*_x2', m_3.coef_)

## #. Best Model 
Fit best model on (train + val), score on test!

In [None]:
# (source: regression_lasso_solution.ipynb)

# Score every fitted estimator on the held-out split. In this notebook the
# hold-out data is X_test / y_test (the template's X_holdout / y_holdout are
# never defined here).
# NOTE(review): lasso_grid_est, ridge_grid_est and elastic_net_grid_est are only
# assigned in grid-search cells that are currently commented out; re-enable
# those cells before running this one.

y_pred = lin_reg_est.predict(X_test)
print("Linear Regression:", r2_score(y_test, y_pred))

y_pred = lasso_grid_est.predict(X_test)
print("Lasso Regression:", r2_score(y_test, y_pred))

y_pred = ridge_grid_est.predict(X_test)
print("Ridge Regression:", r2_score(y_test, y_pred))

y_pred = elastic_net_grid_est.predict(X_test)
print("ElasticNet Regression:", r2_score(y_test, y_pred))

In [None]:
# pd.DataFrame(list(zip(range(10), lasso_grid_est.best_estimator_.coef_)))

## #. Model Visualization


In [None]:
# Scatter of international vs. domestic gross with the fitted regression line.
f, ax = plt.subplots(figsize=(12, 8))
sns.regplot(
    data=movie_df,
    x='domestic_total_gross',
    y='international_total_gross',
    ax=ax,
);


## #. Interpret Results
**Interpretability**: Coefficients, what are the top predictors <br/>
**Predictions**: Make a prediction for a new value, does it make sense?