****“Give them pleasure. The same pleasure they have when they wake up from a nightmare.” ****
                                                                    
                                                                    Alfred Hitchcock

**Import the Libraries and load the datasets**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
sns.set(rc={'figure.figsize':(15,5)})

In [None]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

Let's have the first look at the datasets

In [None]:
print(train.shape)
print(test.shape)
print(train.info())

In [None]:
print(train.head(1).T)

Let's first combine the train and test datasets so that the EDA and feature engineering is done only once.

In [None]:
combined = pd.concat([train,test],axis=0,sort=False)
print(combined.shape)
combined.index = range(len(combined))

Ok. So there are many columns with dictionary datatype represented as strings. Let's fix that. 
Thanks to [Andrew's Kernel](https://www.kaggle.com/artgor/eda-feature-engineering-and-model-interpretation), from where I picked up the below function.

In [None]:

dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df
        
combined = text_to_dict(combined)

Now we extract the weekday, month and year features from the release date. 

In [None]:
t = pd.DatetimeIndex(combined['release_date'])
combined['release_date'] = t
combined['release_year'] = t.year
combined['release_month'] = t.month
combined['release_day_of_week'] = t.weekday

Let's now look at the count of movies released in every decade.

In [None]:
combined.groupby([combined.release_year // 10 * 10,'status'])['id'].count().unstack().fillna(0)


There is something fishy here. There are movies showing up for 2060s, albeit the counts are very low. Since most of the movies in each decade are already released, we are misrepresenting the dates. Python converted 1960 to 2060 and so on. Let's look at the count of movies released in 2010-2020 in each month.

In [None]:
combined[(combined.release_year > 2010) & (combined.release_year < 2020)].groupby(['release_year','release_month'])['id'].count().unstack().plot(kind='bar')

So the data seems to be updated upto August 2017 and for the movies released from then to 2060s ,we can subtract 100 years. We will then fix the variables created earlier - Month, Year and Weekday.

In [None]:
combined.loc[combined.release_date > '2017-08-31','release_date'] = combined.loc[combined.release_date > '2017-08-31','release_date'].apply(lambda x: x - pd.DateOffset(years=100))

In [None]:
t = pd.DatetimeIndex(combined['release_date'])
combined['release_date'] = t
combined['release_year'] = t.year
combined['release_month'] = t.month
combined['release_day_of_week'] = t.weekday
combined['profit'] = combined.revenue - combined.budget

In [None]:
combined.groupby([combined.release_year // 10 * 10,'status'])['id'].count().unstack().fillna(0)

Let's plot decade wise revenue, budget and profits!

In [None]:

combined.groupby(combined.release_year // 10 * 10)[['revenue','budget','profit']].mean().plot(kind='bar',title='Revenue,Budget and Profit - Decade Wise')

The trend looks surprizing. After 1970s, the revenue and profit from movies went down in 1980s. We will revisit this later. The budget seems to be in line with expectations.

Let's now look at dictionary columns and create meaningful features out of them, starting with genres column.
Some movies have multiple genres(as seen in combined.head() above) so let's create a flag column for each genre.

In [None]:
# get list of unique genres and create columns for each, i.e. one hot encoding
def dict_to_cols(colname):
     for i in range(len(combined)):
        #running for each row
         for j in range(len(combined[colname][i])):
            #creating and filling values for each genre column based on current value
             combined.loc[i,str(colname) + "_" + str(combined.loc[i,colname][j]['name'])] = 1
    
    #fill 0 value where a genre doesn't exist for a movie
     t_col = combined.columns.str.startswith(colname)
     combined.iloc[:,t_col] = combined.iloc[:,t_col].fillna(0)

dict_to_cols('genres')
combined.drop('genres',axis=1,inplace=True)


In [None]:
combined.head().T

In [None]:
# get list of unique languages and create columns for each, i.e. one hot encoding
def dict_to_cols(colname,value):
     for i in range(len(combined)):
        #running for each row
         for j in range(len(combined[colname][i])):
            #creating and filling values for each genre column based on current value
             combined.loc[i,str(colname) + "_" + str(combined.loc[i,colname][j][value])] = 1
    
    #fill 0 value where a genre doesn't exist for a movie
     t_col = combined.columns.str.startswith(colname)
     combined.iloc[:,t_col] = combined.iloc[:,t_col].fillna(0)

dict_to_cols('spoken_languages','iso_639_1')
combined.drop('spoken_languages',axis=1,inplace=True)


In [None]:
combined.head(10).T

For production countries, we just take the count of production countries for each movie. 

In [None]:
combined['prod_countries_count'] = combined.production_countries.apply(lambda x:len(x))
combined.drop('production_countries',axis=1,inplace=True)

Belongs to collection doesn't look very useful, but has null values for a few movies. Let's create a flag variable to denote whether it's populated or not.

In [None]:
combined['belongs_to_collection_flag'] = combined['belongs_to_collection'].apply(lambda x: 1 if len(x) > 0 else 0)


Let's explore the homepage column. Since the internet became popular in mid 90's, I wouldn't expect the movies released before then to have it populated. Let's create a flag columns to check whether it exists or not. And look at whether it has any impact on revenue.

In [None]:
combined['homepage_exists'] = combined['homepage'].notnull().astype(int)
combined.drop('homepage',axis=1,inplace=True)

In [None]:
combined.groupby([combined.release_year // 10 * 10,'homepage_exists'])['revenue'].mean().unstack().fillna(0).plot(kind='bar',title='Decade wise counts')

This makes sense now! The movies which were released before the advent of internet, and still have a homepage are the ones which are still in popular culture. The homepages were probably created by the fanbase. But let's leave it at that. The combination of release year and homepage_exists will probably take care of revenue projections.

Let's now look at the overview_length. It's a summary of the movie plot and I don't think it would be very helpful in revenue prediction per say. But let's also not ignore it completely. Movies with longer description might indicate a more complex plot, or just better attention to detail. Let's create a feature of the length of this field.

In [None]:
combined['overview_length'] = combined.overview.apply(lambda x:len(str(x)))

In [None]:
# Let's drop the unwanted columns
combined.drop(['id','belongs_to_collection','release_date','overview','original_title','poster_path','tagline','title'],axis=1,inplace=True)


In [None]:
combined['production_companies'].head(10)

We have the following variables left to explore. Let's count the number of entries in dictionary datatypes for them.
* production_companies
* Keywords
* cast
* crew



In [None]:
combined.Keywords.apply(lambda x: [x[i]['name'] for i in range(len(x))])

We will revist the below code later

In [None]:
# %%time
# def get_names_from_dict(column):
#     df = pd.DataFrame(columns=['id','year','name','revenue','budget'])
#     for i in range(len(combined)):
#     #running for each row
#         for j in range(len(combined[column][i])):
#         #creating and filling values for the column based on current value
#             df = df.append({'name':combined.loc[i,column][j]['name'],
#                             'revenue':combined.loc[i,'revenue'],
#                             'budget':combined.loc[i,'budget'],
#                            'year':combined.loc[i,'release_year'],
#                            'id':combined.loc[i,'imdb_id']},
#                            ignore_index = True)
#     return(df)
# df = get_names_from_dict('production_companies')

# try:
#     df['production_roi'] = df.revenue / df.budget
# except:
#     df['production_roi'] = 0.00


In [None]:
# df[['budget','revenue']].head().T

# df.groupby(['name',df.year // 10 * 10])['revenue'].mean().fillna(0).sort_values(ascending=False)
# We now have the information that the movies from a given production house in a given decade earned what revenues on average
# For movies with multiple production houses, we can take the average of revenues in that decade and create a proxy for it
# This gives us an idea of whether the production house is big or small


In [None]:
combined.head().T

In [None]:
combined.drop(['imdb_id','original_language','production_companies','status','Keywords','cast','crew','profit'] \
              ,axis=1,inplace=True)


In [None]:
combined1 = combined.copy()


In [None]:
combined1.loc[combined1.runtime.isna(),'runtime'] = combined1.runtime.mean()

In [None]:
print(combined1.loc[combined.release_year.isna(),:])
print(combined1.loc[combined.release_year.isna(),:])
print(combined1.loc[combined.release_year.isna(),:])

In [None]:
combined1.loc[combined1.release_year.isna(),'release_year'] = 2000
combined1.loc[-combined1.release_year.isna(),'release_month'] = 5
combined1.loc[combined1.release_day_of_week.isna(),'release_day_of_week'] = 4

In [None]:
train_final = combined1.loc[-combined1.revenue.isna()].copy()
test_final = combined1.loc[combined1.revenue.isna()].copy()
print(train_final.shape)
print(test_final.shape)

In [None]:
test_final.drop('revenue',axis=1,inplace=True)
y = train_final['revenue']
X = train_final.drop('revenue',axis=1,inplace=True)

print(test_final.shape)
print(X.shape)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_holdout, y_train, y_holdout = train_test_split(train_final, y, test_size=0.33, random_state=42)

In [None]:
print(X_train.shape)
print(X_holdout.shape)
print(y_train.head())
print(y_holdout.head())
print(train_final.shape)

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


In [None]:
forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=17)

forest_params = {
#     'max_depth': range(10, 21,5)
'max_features': range(20, 120,10)
                }


In [None]:
forest_grid = GridSearchCV(forest, forest_params,
                           cv=4, n_jobs=-1, verbose=True,scoring='neg_mean_squared_error')
forest_grid.fit(X_train, y_train)


In [None]:
forest_grid.best_params_, forest_grid.best_score_

In [None]:
holdout_pred = forest_grid.predict(X_holdout)
# print(holdout_pred)
from sklearn.metrics import mean_squared_error
100*mean_squared_error(y_holdout, holdout_pred)

In [None]:
test_pred = forest_grid.predict(test_final)

In [None]:
pred = pd.read_csv('../input/sample_submission.csv')
pred['revenue'] = test_pred
pred.to_csv("RFR.csv", index=False)


In [150]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# from sklearn import cross_validation, metrics   
from sklearn.model_selection import GridSearchCV


In [156]:


xgb_params = {
    'learning_rate': [.01,.02,.03,.04,.05,.1,.15,.22,.3]}

xgb1 = XGBRegressor(n_estimators=100, gamma=0, subsample=0.75,
                           colsample_bytree=1,eval_metric='rmse',objective= 'reg:linear',
                      seed=27)

xgb_grid = GridSearchCV(xgb1, xgb_params,
                           cv=5, n_jobs=-1, verbose=True,scoring='neg_mean_squared_error')
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   18.5s finished
  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='rmse', gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.75),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 0.22, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=True)

In [157]:
print(xgb_grid.best_params_, xgb_grid.best_score_)
holdout_pred = xgb_grid.predict(X_holdout)
mean_squared_error(y_holdout, holdout_pred)



{'learning_rate': 0.05} -6723073473742575.0


4676300390803395.0

In [158]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_holdout, holdout_pred)
test_pred = xgb_grid.predict(test_final)
pred_XG = pd.read_csv('../input/sample_submission.csv')
pred['revenue'] = test_pred
pred.to_csv("XGBR.csv", index=False)
