In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LinearRegression, ElasticNetCV, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MultiLabelBinarizer
import matplotlib.pyplot as plt
import ast
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [None]:
# Load the competition's training and test tables from the Kaggle input folder.
train = pd.read_csv("../input/tmdb-box-office-prediction/train.csv")
test = pd.read_csv("../input/tmdb-box-office-prediction/test.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# List the column names available in the training table.
train.columns

In [None]:
# Replace missing values in the list-like text columns with the string "[]"
# so that every row can later be parsed with ast.literal_eval.
for list_col in ["Keywords", "spoken_languages", "cast",
                 "production_companies", "genres", "production_countries"]:
    train[list_col] = train[list_col].fillna("[]")

In [None]:
# Derive calendar features (month, weekday, ISO week number) from the release date.
# NOTE(review): if the raw dates use two-digit years, pd.to_datetime may place
# old films in the wrong century, which would shift weekday/week - confirm.
train["release_date"] = pd.to_datetime(train["release_date"])
train["month"] = train["release_date"].dt.month
train["weekday"] = train["release_date"].dt.weekday
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar()
# is its replacement. Cast to float so a missing date yields NaN (as .dt.week
# did) rather than pd.NA in a nullable UInt32 column.
train["week"] = train["release_date"].dt.isocalendar().week.astype("float64")

In [None]:
# Keep an independent copy of the regression target before "revenue" is
# dropped from the feature table.
y = train["revenue"].copy()

In [None]:
# Remove identifier and free-text columns that are not used as features.
unused_columns = ["belongs_to_collection", "id", "imdb_id", "poster_path",
                  "crew", "overview", "status", "original_title"]
train = train.drop(columns=unused_columns)

In [None]:
def get_keywords(kw):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    kw : str
        Text holding a Python literal such as ``"[{'id': 1, 'name': 'x'}]"``.

    Returns
    -------
    list
        The ``name`` values in order, or ``[]`` when ``kw`` cannot be parsed
        or an entry has no ``name`` key.
    """
    try:
        return [entry["name"] for entry in ast.literal_eval(kw)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Catch only parsing/shape problems; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        print("Excepted")
        return []

In [None]:
# Turn each stringified list-of-dicts column into a plain list of names.
for list_col in ["Keywords", "spoken_languages", "cast",
                 "production_companies", "genres", "production_countries"]:
    train[list_col] = train[list_col].apply(get_keywords)

In [None]:
# Set the parsed list columns aside (as copies) for multi-hot encoding, then
# remove them - together with the target - from the feature table.
list_columns = ["Keywords", "spoken_languages", "cast",
                "production_companies", "genres", "production_countries"]
kw, sp, cast, pc, genres, countries = [train[name].copy() for name in list_columns]

train = train.drop(columns=list_columns + ["revenue"])

In [None]:
# One shared binarizer; it is re-fitted for every list column below.
mlb = MultiLabelBinarizer()

In [None]:
def _multi_hot(labels):
    """Re-fit the shared binarizer on ``labels`` and return a 0/1 frame with one column per label."""
    encoded = mlb.fit_transform(labels)
    return pd.DataFrame(encoded, columns=mlb.classes_, index=train.index)


# The keyword literally called "popularity" is renamed, presumably so it does
# not clash with a feature column of the same name.
keywords = _multi_hot(kw).rename(columns={'popularity': 'popularity_keyword'})
sp_lang = _multi_hot(sp)
casts = _multi_hot(cast)
pro_comp = _multi_hot(pc)
genre = _multi_hot(genres)
cntry = _multi_hot(countries)

In [None]:
# Remember the training-time column layout of every multi-hot block so the
# test set can be aligned to it later.
k_cols, sp_lang_cols, casts_cols = keywords.columns, sp_lang.columns, casts.columns
pro_comp_cols, genre_cols, cntry_cols = pro_comp.columns, genre.columns, cntry.columns

In [None]:
# Replace any missing entries in the multi-hot frames with 0.
keywords, sp_lang, casts, pro_comp, genre, cntry = [
    frame.fillna(0)
    for frame in (keywords, sp_lang, casts, pro_comp, genre, cntry)
]

In [None]:
# Replace each title by its character count.
train["title"] = train["title"].map(len)

In [None]:
# Split the remaining columns by dtype: object columns vs. everything else.
categorical_features = train.select_dtypes(include="object").columns
numerical_features = train.select_dtypes(exclude="object").columns

In [None]:
# Inspect which columns were classified as non-object (numerical).
numerical_features

In [None]:
# Exclude these three columns from the set that will be standard-scaled;
# Index.drop raises KeyError if any of the labels is absent.
numerical_features = numerical_features.drop(['month', 'release_date', 'weekday'])

In [None]:
# Display the columns that will be scaled.
train[numerical_features]

In [None]:
# The raw date is no longer needed once month/weekday/week have been derived.
train = train.drop(columns=["release_date"])

In [None]:
"popularity" in keywords.columns.to_list()

In [None]:
# Append every multi-hot block to the right of the base features, then
# display the columns that will be scaled.
feature_blocks = [train, keywords, sp_lang, casts, pro_comp, genre, cntry]
train = pd.concat(feature_blocks, axis="columns")
train[numerical_features]

In [None]:
def has_homepage(homepage):
    """Return 1 when ``homepage`` is a string (i.e. a value is present), otherwise 0."""
    return int(isinstance(homepage, str))

In [None]:
# Reduce homepage and tagline to presence flags (1 = a string value is present).
for flag_col in ("homepage", "tagline"):
    train[flag_col] = train[flag_col].apply(has_homepage)

In [None]:
# Display the feature table after the presence flags were added.
train

In [None]:
def english(ol):
    """Return 1 if the language code ``ol`` equals "en", otherwise 0."""
    return 1 if ol == "en" else 0

In [None]:
# Collapse the original language to a binary "is English" flag.
train["original_language"] = train["original_language"].map(english)

In [None]:
# Replace every remaining missing value with 0 (this happens before scaling).
train = train.fillna(0)

In [None]:
# Display the to-be-scaled columns after missing values were filled.
train[numerical_features]

In [None]:
# Hold out 20% of the training data for validation. The split is seeded so
# that re-running the notebook reproduces the same scores.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=SPLIT_SEED
)
# Work on explicit copies so the column assignments below cannot trigger
# SettingWithCopyWarning or write through to `train`.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training part only and reuse it for the hold-out part,
# so no hold-out statistics leak into the scaling.
train_num = X_train[numerical_features]
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(train_num)
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [None]:
#lr = LinearRegression()
#lr.fit(X_train, y_train)

In [None]:
#pred = lr.predict(X_test)
#for i in range(len(pred)):
#    if pred[i] < 0:
#        pred[i] = 0


In [None]:
def get_rmse(y_test, pred):
    """Return the square root of the mean squared logarithmic error.

    Despite the name, this is the log-based error (RMSLE), not plain RMSE.
    """
    msle = mean_squared_log_error(y_test, pred)
    return np.sqrt(msle)

In [None]:
#get_rmse(y_test, pred)

In [None]:
# Prepare the test set with the same steps that were applied to train.
test = pd.read_csv("../input/tmdb-box-office-prediction/test.csv")

list_columns = ["Keywords", "spoken_languages", "cast",
                "production_companies", "genres", "production_countries"]

# Missing list-like columns become "[]" so ast.literal_eval can parse them.
for list_col in list_columns:
    test[list_col] = test[list_col].fillna("[]")

# Calendar features derived from the release date.
test["release_date"] = pd.to_datetime(test["release_date"])
test["month"] = test["release_date"].dt.month
test["weekday"] = test["release_date"].dt.weekday
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar()
# is its replacement. Cast to float so a missing date yields NaN (as .dt.week
# did) rather than pd.NA in a nullable UInt32 column.
test["week"] = test["release_date"].dt.isocalendar().week.astype("float64")

test = test.drop(["belongs_to_collection", "id", "imdb_id", "poster_path", "crew", "overview", "status", "original_title", "release_date"], axis=1)

# Parse each stringified list-of-dicts column into a plain list of names.
for list_col in list_columns:
    test[list_col] = test[list_col].apply(get_keywords)

kw = test["Keywords"].copy()
sp = test["spoken_languages"].copy()
cast = test["cast"].copy()
pc = test["production_companies"].copy()
genres = test["genres"].copy()
countries = test["production_countries"].copy()

test = test.drop(list_columns, axis=1)

# The binarizer is re-fitted on the test labels here, so the resulting columns
# differ from the training ones and must be aligned to them afterwards.
mlb = MultiLabelBinarizer()

keywords = pd.DataFrame(mlb.fit_transform(kw), columns=mlb.classes_, index=test.index)
keywords.rename(columns={'popularity': 'popularity_keyword'}, inplace=True)
sp_lang = pd.DataFrame(mlb.fit_transform(sp), columns=mlb.classes_, index=test.index)
casts = pd.DataFrame(mlb.fit_transform(cast), columns=mlb.classes_, index=test.index)
pro_comp = pd.DataFrame(mlb.fit_transform(pc), columns=mlb.classes_, index=test.index)
genre = pd.DataFrame(mlb.fit_transform(genres), columns=mlb.classes_, index=test.index)
cntry = pd.DataFrame(mlb.fit_transform(countries), columns=mlb.classes_, index=test.index)

keywords = keywords.fillna(0)
sp_lang = sp_lang.fillna(0)
casts = casts.fillna(0)
pro_comp = pro_comp.fillna(0)
genre = genre.fillna(0)
cntry = cntry.fillna(0)



In [None]:
# Align the test keyword dummies with the training keyword columns: drop
# keywords never seen in training and add an all-zero column for every training
# keyword missing from the test set. reindex does both in one vectorised step,
# replacing the per-column Python loops (linear list lookups inside a loop and
# one DataFrame insert per missing column).
keywords = keywords.reindex(columns=k_cols, fill_value=0)
        



In [None]:
# Put the test keyword columns in exactly the training column order.
keywords = keywords[k_cols]

In [None]:
def _align_to_train_columns(dummies, train_columns):
    """Return ``dummies`` restricted to, and ordered like, ``train_columns``.

    Columns not present in ``train_columns`` are dropped; columns of
    ``train_columns`` missing from ``dummies`` are added filled with 0.
    """
    return dummies.reindex(columns=train_columns, fill_value=0)


# Stringify first so non-string titles (e.g. NaN) do not break len() below.
# Note that a missing title becomes the text "nan" and so gets length 3.
test["title"] = test["title"].apply(str)

# Align every multi-hot block with its training layout. The spoken-language
# block previously computed the reordered frame but discarded it, leaving its
# columns in a different order than the model was trained on.
sp_lang = _align_to_train_columns(sp_lang, sp_lang_cols)
casts = _align_to_train_columns(casts, casts_cols)
pro_comp = _align_to_train_columns(pro_comp, pro_comp_cols)
genre = _align_to_train_columns(genre, genre_cols)
cntry = _align_to_train_columns(cntry, cntry_cols)

# Replace each title by its character count, as was done for train.
test["title"] = test["title"].apply(len)




In [None]:
# Fill missing numeric values with 0 *before* scaling, mirroring the training
# pipeline (train.fillna(0) ran before the scaler was fitted). Scaling first
# and filling afterwards would map a missing value to the scaled mean instead
# of to the scaled value of 0 that the model saw during training.
test_num = test[numerical_features].fillna(0)
test[numerical_features] = scaler.transform(test_num)

# Append the aligned multi-hot blocks in the same order used for train.
test = pd.concat([test, keywords, sp_lang, casts, pro_comp, genre, cntry], axis=1)

# Presence flags and the "is English" flag, exactly as for train.
test["homepage"] = test["homepage"].apply(has_homepage)
test["tagline"] = test["tagline"].apply(has_homepage)
test["original_language"] = test["original_language"].apply(english)

test = test.fillna(0)

In [None]:
#lr = LinearRegression() #train on whole training data
#lr.fit(train, y)

In [None]:
#sub = pd.read_csv("../input/tmdb-box-office-prediction/sample_submission.csv")
#preds = lr.predict(test)
#sub["revenue"] = preds
#sub.to_csv("submission.csv", index=False)

In [None]:
# Disabled experiment: coarse LassoCV search. Every line of the call is
# commented out; previously only the first line was, which left dangling
# continuation lines and made the cell fail with an IndentationError.
#lasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
#                          0.3, 0.6, 1],
#                max_iter = 50000, cv = 5)
#lasso.fit(X_train, y_train)

In [None]:
#best_alpha = lasso.alpha_
#alpha = best_alpha
#print("Best alpha :", best_alpha)

In [None]:
# Disabled experiment: refined LassoCV search around the best alpha. Every line
# of the call is commented out; previously the continuation lines were not,
# which made the cell fail with an IndentationError.
#print("Try again for more precision with alphas centered around " + str(alpha))
#lasso = LassoCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
#                          alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05,
#                          alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35,
#                          alpha * 1.4],
#                max_iter = 50000, cv = 10)
#lasso.fit(X_train, y_train)
#best_alpha = lasso.alpha_
#print("Best alpha :", best_alpha)

In [None]:
#preds = lasso.predict(X_test)
#for prediction in range(len(preds)):
#    if preds[prediction] < 0:
#        preds[prediction] = 0
#rmse = get_rmse(y_test, preds)
#print("RMSE: {}".format(rmse))

In [None]:
# Scatter of budget against revenue on the training data, with a title and
# axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train["budget"], y)
ax.set(title="Revenue vs. budget (training set)", xlabel="budget", ylabel="revenue")
plt.show()

In [None]:
# seaborn is already imported in the import cell; this repeat is redundant but harmless.
import seaborn as sns

# Set the default figure size (in inches) for subsequent plots.
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [None]:
# Coarse cross-validated search over the L1/L2 mix (l1_ratio) and the penalty
# strength (alpha). The comma after the l1_ratio list was missing, which made
# this cell a SyntaxError.
elasticNet = ElasticNetCV(l1_ratio = [0.1, 0.3, 0.6, 1],
                          alphas = [0.0005, 0.001, 0.005,
                                    0.01, 0.05, 0.1, 0.5, 1, 5],
                          max_iter = 50000, cv = 5)
elasticNet.fit(X_train, y_train)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )

In [None]:
print("Try again for more precision with l1_ratio centered around " + str(ratio))
# l1_ratio must lie in [0, 1]. Cap the candidates *before* fitting: clamping
# elasticNet.l1_ratio_ after the fit (as was done before) is too late, because
# an out-of-range candidate is already invalid when the model is fitted.
l1_candidates = sorted({min(ratio * factor, 1.0) for factor in (.9, 1, 1.1, 1.15)})
elasticNet = ElasticNetCV(l1_ratio = l1_candidates,
                          alphas = [0.0005, 0.001, 0.005,
                                    0.01, 0.05, 0.1, 0.5, 1, 5],
                          max_iter = 50000, cv = 5)
elasticNet.fit(X_train, y_train)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )

print("Now try again for more precision on alpha, with l1_ratio fixed at " + str(ratio) +
      " and alpha centered around " + str(alpha))
# Final pass: keep l1_ratio fixed and search a finer alpha grid around the
# best value found so far.
elasticNet = ElasticNetCV(l1_ratio = ratio,
                          alphas = [alpha * .6, alpha * .7, alpha * .8, alpha * .9, alpha, alpha * 1.1, alpha * 1.25, alpha * 1.3, alpha * 1.4],
                          max_iter = 50000, cv = 5)
elasticNet.fit(X_train, y_train)
best_alpha = elasticNet.alpha_
best_ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", best_ratio)
print("Best alpha :", best_alpha )

In [None]:
#

In [None]:
#

In [None]:
# Write the submission file from the final elastic-net model.
sub = pd.read_csv("../input/tmdb-box-office-prediction/sample_submission.csv")
# A linear model can predict negative revenue, which is meaningless and breaks
# the log-based error used in this notebook; clip predictions at 0.
preds = np.clip(elasticNet.predict(test), 0, None)
sub["revenue"] = preds
sub.to_csv("elasticnet.csv", index=False)