In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
# Let the notebook show up to 100 columns and 100 rows when rendering a frame.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# train.csv is used for fitting/validation below; test.csv is scored for the submission file.
trainDF=pd.read_csv('../input/train.csv')
unseenTestDF=pd.read_csv('../input/test.csv')

# Feature Engineering

The following columns are of type object. However the actual type is <strong>"list of dictionaries"</strong>

* belongs_to_collection
* genres
* production_companies
* production_countries
* spoken_languages
* Keywords
* cast
* crew

 The following function takes an object column whose values are actually a 'list of dictionaries'
 <br>and converts each value into a semicolon-separated string.

Eg. 
<br><strong>Input</strong>
<br>[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'de', 'name': 'Deutsch'}]
<br><strong>Output</strong> will be the semicolon-separated values
<br>en;de

The language id is fetched instead of the name because some names contain non-ASCII (unicode) characters: <br>
{'iso_639_1': 'ar', 'name': 'العربية'}

Parameters to function <strong>GetCSVFromListOfDict(keyNameToFetch,column,columnName)</strong>

<strong>keyNameToFetch :</strong> what value do you want to fetch (e.g. iso_639_1 in above case )
<br><strong>column :</strong> The actual column
<br><strong>columnName :</strong> name given to the returned Series (normally the name of the column being processed)

In [None]:
def GetCSVFromListOfDict(keyNameToFetch,column,columnName):
    """Flatten a column of stringified list-of-dict values into ';'-joined strings.

    Each cell is expected to hold the text of a Python literal such as
    "[{'iso_639_1': 'en', 'name': 'English'}, ...]". For every dictionary that
    contains `keyNameToFetch`, the value is converted with str() and the values
    are joined with ';'. Missing cells (NaN/None) become the empty string.

    Parameters
    ----------
    keyNameToFetch : key to read from each dictionary (e.g. 'name').
    column : pandas Series holding the stringified lists. It is not modified.
    columnName : name given to the returned Series.

    Returns
    -------
    pandas Series of strings with the same index as `column`.

    Raises
    ------
    ValueError / SyntaxError from ast.literal_eval when a cell is not a valid
    Python literal.
    """
    flattenedRows=[]
    # fillna returns a new Series, so the caller's column is left untouched.
    # '[{}]' parses to a list holding one empty dict, which yields ''.
    # Iterating the Series directly avoids Series.iteritems(), which was
    # removed in pandas 2.0.
    for rawValue in column.fillna('[{}]'):
        listofDict=ast.literal_eval(rawValue)
        # join() only puts separators between values, so a ';' that belongs to
        # a value itself is never trimmed away.
        flattenedRows.append(
            ';'.join(str(dic[keyNameToFetch]) for dic in listofDict if keyNameToFetch in dic)
        )

    # Keep the input index so assigning the result back to the source frame
    # aligns row-for-row even when that frame does not have a default index.
    return pd.Series(flattenedRows,index=column.index,name=columnName)


#GetCSVFromListOfDict('iso_639_1',trainDF.spoken_languages,'spoken_languages')

In [None]:
# (target column, dictionary key to extract, source column) for every flattened field.
# The crew column is expanded into four separate ';'-joined columns; every other
# source column is overwritten with its flattened form.
flattenSpecs=[
    ('belongs_to_collection','name','belongs_to_collection'),
    ('genres','name','genres'),
    ('production_companies','name','production_companies'),
    ('production_countries','name','production_countries'),
    ('spoken_languages','iso_639_1','spoken_languages'),
    ('Keywords','name','Keywords'),
    ('Crew_Dept','department','crew'),
    ('Crew_Job','job','crew'),
    ('Crew_Name','name','crew'),
    ('Crew_Gender','gender','crew'),
]

# Apply the same flattening to the training frame first, then to the test frame.
for frame in (trainDF,unseenTestDF):
    for targetColumn,keyName,sourceColumn in flattenSpecs:
        frame[targetColumn]=GetCSVFromListOfDict(keyName,frame[sourceColumn],sourceColumn)






In [None]:
# Show the first row of each frame to eyeball the flattened columns.
display(trainDF.head(1))
display(unseenTestDF.head(1))


# belongs_to_collection
Observation and analysis

In [None]:

# Row count, then frequency of each collection name. Missing collections show up
# as the empty string because GetCSVFromListOfDict maps NaN to ''.
print(len(trainDF.belongs_to_collection))
trainDF.belongs_to_collection.value_counts()
# Author's reading of the output: 2396 of 3000 rows are missing, i.e. about 79%.
# Next step: check whether a missing vs. present value has an effect on revenue.

In [None]:
# 1 when the flattened collection name is blank (no collection), 0 otherwise.
for frame in (trainDF,unseenTestDF):
    isBlank=frame['belongs_to_collection'].str.strip().eq('')
    frame['belongs_to_collection_ISMISSING']=isBlank.astype(int)


In [None]:
# Correlation matrix between the missing-collection flag and revenue.
trainDF[['belongs_to_collection_ISMISSING','revenue']].corr()


A coefficient of -0.33 is a medium-strength (negative) correlation according to the table below. Hence we will take this column into consideration.

Hypothesis - A missing value in belongs_to_collection goes with low revenue, while a value present in this column goes with high revenue.


Coefficient, r
<br>Strength of Association	
<br>Small	.1 to .3	-0.1 to -0.3
<br>Medium	.3 to .5	-0.3 to -0.5
<br>Large	.5 to 1.0	-0.5 to -1.0


In [None]:
# Only the ISMISSING flag is kept; the collection name itself is discarded.
trainDF=trainDF.drop(columns=['belongs_to_collection'])
unseenTestDF=unseenTestDF.drop(columns=['belongs_to_collection'])

# genres

In [None]:
# GetCSVFromListOfDict turns missing entries into '' rather than NaN, so an
# isna() count on this column is always 0. Count blank strings instead to see
# how many rows really have no genre.
print(len(trainDF.genres))
print((trainDF.genres.str.strip()=='').sum())
trainDF.genres.value_counts().head()

 We can split these values into separate columns (genres_drama, genres_comedy) and provide boolean values (0:absent 1:present)

In [None]:
# Prepare genres for CountVectorizer: glue the words of a multi-word genre with
# '_' so it stays a single token, then turn the ';' separators into spaces.
# The test frame must be normalised exactly like the training frame; otherwise a
# multi-word genre in the test data is split into separate words and never
# matches the vocabulary learned from the training data.
for frame in (trainDF,unseenTestDF):
    frame['genres']=frame['genres'].str.replace(' ','_').str.replace(';',' ')


In [None]:


from sklearn.feature_extraction.text import CountVectorizer

# Learn the genre vocabulary (capped at 10 tokens via max_features) from the training frame only.
vectFeatures = CountVectorizer(max_features=10)
vectFeatures.fit(trainDF['genres'])

# Encode both frames with that same vocabulary; the results are sparse count matrices.
featuresTrainSplit=vectFeatures.transform(trainDF['genres'])
featuresUnseenTestSplit=vectFeatures.transform(unseenTestDF['genres'])



In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old method only on releases
# that predate it.
genreFeatureNames=list(
    vectFeatures.get_feature_names_out()
    if hasattr(vectFeatures,'get_feature_names_out')
    else vectFeatures.get_feature_names()
)

# Densify the sparse count matrices into frames with one column per vocabulary token.
featuresTrainDF=pd.DataFrame(featuresTrainSplit.toarray(),columns=genreFeatureNames)
featuresUnseenTestDF=pd.DataFrame(featuresUnseenTestSplit.toarray(),columns=genreFeatureNames)


In [None]:
# Prefix every token column so its origin stays visible after merging into the main frames.
featuresTrainDF.columns=['genres_'+token for token in featuresTrainDF.columns]
featuresUnseenTestDF.columns=['genres_'+token for token in featuresUnseenTestDF.columns]

In [None]:
# Append the genre columns side by side. axis=1 concatenation aligns on the row
# index, so this presumes both sides still share the same index — TODO confirm
# if rows are ever filtered or re-indexed before this point.
trainDF=pd.concat([trainDF,featuresTrainDF],axis=1)
unseenTestDF=pd.concat([unseenTestDF,featuresUnseenTestDF],axis=1)

In [None]:
# The text column has been replaced by the genres_* columns.
trainDF=trainDF.drop(columns=['genres'])
unseenTestDF=unseenTestDF.drop(columns=['genres'])

# production_companies

In [None]:
# Row count, then the 20 most frequent flattened company strings
# (rows without companies appear as the empty string).
print(len(trainDF.production_companies))
trainDF.production_companies.value_counts().head(20)
# Author's reading of the output: 156 missing values out of 3000.

# production_countries

I think this column is important.
As with genres, the separated country names are normalised first; from them we derive a boolean column telling whether the production is from the USA.

In [None]:
# Row count, then the 20 most frequent flattened country strings
# (rows without countries appear as the empty string).
print(len(trainDF.production_countries))
trainDF.production_countries.value_counts().head(20)
# Author's reading of the output: 55 missing values.

In [None]:
# Glue the words of each country name with '_' (so a multi-word name stays one
# token), then turn the ';' separators between countries into spaces.
for frame in (trainDF,unseenTestDF):
    underscored=frame['production_countries'].str.replace(' ','_')
    frame['production_countries']=underscored.str.replace(';',' ')


In [None]:
# The country names are never lower-cased before this point, so comparing them
# to a lower-case literal only matches if the raw data happens to be lower-case.
# Normalise the case first so the flag is not silently all zeros.
# Note: this is an exact match, so only rows whose sole production country is
# the USA are flagged; co-productions stay 0.
usaToken='united_states_of_america'
for frame in (trainDF,unseenTestDF):
    frame['IsProductionFromUSA']=(frame['production_countries'].str.lower()==usaToken).astype(int)

In [None]:
# Only the IsProductionFromUSA flag is kept from the country information.
trainDF=trainDF.drop(columns=['production_countries'])
unseenTestDF=unseenTestDF.drop(columns=['production_countries'])

# original_language and spoken_languages

In [None]:
# 1 only when 'en' appears among the spoken-language codes AND the original
# language is 'en'; 0 otherwise.
for frame in (trainDF,unseenTestDF):
    speaksEnglish=frame['spoken_languages'].str.contains('en')
    originallyEnglish=frame['original_language'].eq('en')
    frame['IsEnglishLanguage']=(speaksEnglish & originallyEnglish).astype(int)

In [None]:
# Correlation matrix between the English-language flag and revenue.
trainDF[['IsEnglishLanguage','revenue']].corr()

In [None]:
# Both language columns are summarised by IsEnglishLanguage and can go.
languageColumns=['spoken_languages','original_language']
trainDF=trainDF.drop(columns=languageColumns)
unseenTestDF=unseenTestDF.drop(columns=languageColumns)

# Keywords

In [None]:
# Same preparation as for the other text columns: multi-word keywords become a
# single '_'-joined token, ';' separators become spaces, and everything is lower-cased.
for frame in (trainDF,unseenTestDF):
    frame['Keywords']=(
        frame['Keywords']
        .str.replace(' ','_')
        .str.replace(';',' ')
        .str.lower()
    )


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the keyword vocabulary (capped at 20 tokens via max_features) from the
# training frame only. The column was already lower-cased in the previous cell,
# so no extra .str.lower() is needed here.
vectFeatures = CountVectorizer(max_features=20)
vectFeatures.fit(trainDF['Keywords'])

# Encode both frames with the vocabulary learned from the training frame.
featuresTrainSplit=vectFeatures.transform(trainDF['Keywords'])
featuresUnseenTestSplit=vectFeatures.transform(unseenTestDF['Keywords'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old method only on releases
# that predate it.
keywordTokens=list(
    vectFeatures.get_feature_names_out()
    if hasattr(vectFeatures,'get_feature_names_out')
    else vectFeatures.get_feature_names()
)

# Prefix with 'Keywords_' (underscore included) to match the 'genres_' naming
# used for the genre columns.
keywordColumns=['Keywords_'+token for token in keywordTokens]

featuresTrainDF=pd.DataFrame(featuresTrainSplit.toarray(),columns=keywordColumns)
featuresUnseenTestDF=pd.DataFrame(featuresUnseenTestSplit.toarray(),columns=keywordColumns)




In [None]:
# Attach the keyword count columns and discard the text column in one step per frame.
trainDF=pd.concat([trainDF,featuresTrainDF],axis=1).drop(columns=['Keywords'])
unseenTestDF=pd.concat([unseenTestDF,featuresUnseenTestDF],axis=1).drop(columns=['Keywords'])


# homepage

In [None]:
# Number of training rows without a homepage value.
trainDF.homepage.isna().sum()

In [None]:
# Look at the raw homepage values.
trainDF.homepage

In [None]:
# 1 when a homepage value is present, 0 when it is missing.
for frame in (trainDF,unseenTestDF):
    frame['IsHomePageAvailable']=frame['homepage'].notna().astype(int)

In [None]:
# Correlation matrix between the homepage flag and revenue.
trainDF[['IsHomePageAvailable','revenue']].corr()

# Date

In [None]:
# Pull the three slash-separated numbers out of release_date; they are named
# month, day-of-month and year, in that order.
dateSplit=trainDF['release_date'].str.extract('([0-9]+)/([0-9]+)/([0-9]+)')
dateSplit.columns=['ReleaseMonth','ReleaseDate','ReleaseYear']

# Expand the short year: values above 20 get a '19' prefix, all others a '20' prefix.
shortYear=dateSplit['ReleaseYear'].astype(int)
centuryPrefix=shortYear.gt(20).map({True:'19',False:'20'})
dateSplit['ReleaseYear']=centuryPrefix+dateSplit['ReleaseYear']

# Swap the text date for the three integer columns.
trainDF=pd.concat([trainDF.drop(columns=['release_date']),dateSplit.astype(int)],axis=1)

In [None]:
print(unseenTestDF.release_date.mode())
# Fill missing release dates with a fixed date (presumably the mode printed
# above — verify against the output). The result is assigned back instead of
# calling fillna(..., inplace=True) on the column selection: with pandas
# Copy-on-Write that chained in-place call updates a temporary object and
# leaves the frame unchanged.
unseenTestDF['release_date']=unseenTestDF['release_date'].fillna('9/9/11')

In [None]:
# Sanity check: number of release dates still missing in the test frame.
unseenTestDF['release_date'].isna().sum()

In [None]:
# Pull the three slash-separated numbers out of release_date; they are named
# month, day-of-month and year, in that order.
dateSplit=unseenTestDF['release_date'].str.extract('([0-9]+)/([0-9]+)/([0-9]+)')
dateSplit.columns=['ReleaseMonth','ReleaseDate','ReleaseYear']

# Expand the short year: values above 20 get a '19' prefix, all others a '20' prefix.
shortYear=dateSplit['ReleaseYear'].astype(int)
centuryPrefix=shortYear.gt(20).map({True:'19',False:'20'})
dateSplit['ReleaseYear']=centuryPrefix+dateSplit['ReleaseYear']

# Swap the text date for the three integer columns.
unseenTestDF=pd.concat([unseenTestDF.drop(columns=['release_date']),dateSplit.astype(int)],axis=1)


In [None]:
## Month --> SeasonEnd feature engineering

In [None]:
# One-hot encode the release month and correlate each month indicator with revenue.
pd.concat([pd.get_dummies(trainDF['ReleaseMonth'].astype(str)),trainDF.revenue],axis=1).corr()['revenue']

In [None]:
# Mean revenue per release month.
trainDF.groupby(by='ReleaseMonth')['revenue'].mean()

In [None]:


# Try a combined indicator for June, July and December and correlate it with revenue.
isSeasonEndMonth=trainDF['ReleaseMonth'].isin([6,7,12]).astype(int)
pd.concat([isSeasonEndMonth,trainDF['revenue']],axis=1).corr()['revenue']






In [None]:
# 1 when the film was released in June, July or December, 0 otherwise.
seasonEndMonths=[6,7,12]
trainDF['IsReleaseMonthSeasonEnd']=trainDF['ReleaseMonth'].isin(seasonEndMonths).astype(int)
unseenTestDF['IsReleaseMonthSeasonEnd']=unseenTestDF['ReleaseMonth'].isin(seasonEndMonths).astype(int)

# The raw month is replaced by the flag.
trainDF=trainDF.drop(columns=['ReleaseMonth'])
unseenTestDF=unseenTestDF.drop(columns=['ReleaseMonth'])

In [None]:
# The day-of-month column is not used as a feature.
trainDF=trainDF.drop(columns=['ReleaseDate'])
unseenTestDF=unseenTestDF.drop(columns=['ReleaseDate'])

# Log Scaling

In [None]:
# Replace the target and two predictors by log(1 + x).
# Careful: the columns are overwritten, so running this cell twice applies the
# transform twice.
trainDF['revenue']=np.log1p(trainDF['revenue'])

for frame in (trainDF,unseenTestDF):
    for scaledColumn in ('budget','popularity'):
        frame[scaledColumn]=np.log1p(frame[scaledColumn])

# Model

In [None]:

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Keep only columns with one of these numeric dtypes as model inputs; object
# columns (for example the ';'-joined Crew_* strings) are left out.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
trainDFNum=trainDF.select_dtypes(include=numerics)
unseenTestDFNum=unseenTestDF.select_dtypes(include=numerics)


In [None]:
# The row identifier is not a feature.
trainDFNum=trainDFNum.drop(columns=['id'])
unseenTestDFNum=unseenTestDFNum.drop(columns=['id'])

In [None]:
# Impute every remaining gap with the training-set median of that column; the
# test frame is filled with the training medians as well. Filling a column's
# gaps with its own median leaves that median unchanged, so computing the
# medians once up front gives the same values for both frames.
trainMedians=trainDFNum.median()
trainDFNum=trainDFNum.fillna(trainMedians)
unseenTestDFNum=unseenTestDFNum.fillna(trainMedians)

In [None]:
from sklearn import model_selection # for splitting into train and test
import sklearn

# Hold out 20% of the training rows for validation, with a fixed seed for repeatability.
seed = 100
validation_size = 0.2

# Target is the revenue column; every other numeric column is a predictor.
Y = trainDFNum['revenue']
X = trainDFNum.drop(columns=['revenue'])

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)

## XGBoost

In [None]:
import xgboost
# Gradient-boosted regressor with library-default hyperparameters, fitted on the training split.
model_XG = xgboost.XGBRegressor() 
model_XG.fit(X_train, Y_train)

In [None]:

# Predict on the training split, the held-out validation split and the unseen test frame.
# NOTE(review): unseenTestDFNum is assumed to carry the same feature columns, in
# the same order, as X_train — confirm before relying on the predictions.

trainResult_XG = model_XG.predict(X_train)
testResult_XG = model_XG.predict(X_test)
unseenTestResult_XG=model_XG.predict(unseenTestDFNum)

In [None]:

    

# Report the same two metrics for the training split and the held-out split.
# The first value is the square root of the mean squared error, so it is
# labelled as RMSE; the second is r2_score, i.e. the coefficient of
# determination (1 is a perfect prediction).
for splitLabel,actual,predicted in (
    ('TRAIN',Y_train,trainResult_XG),
    ('TEST',Y_test,testResult_XG),
):
    print('---------- %s DATA RESULT ----------'%splitLabel)
    print("Root mean squared error: %.5f"%np.sqrt(mean_squared_error(actual, predicted)))
    print('R^2 score: %.4f' % r2_score(actual, predicted))






# Submission

In [None]:
# Undo the log1p applied to revenue so predictions are back on the original scale.
# Careful: the variable is overwritten, so running this cell twice applies expm1 twice.
unseenTestResult_XG=np.expm1(unseenTestResult_XG)

In [None]:
# Build the submission column by column instead of stacking a Series and an
# array as rows and transposing: ids stay integers throughout (no float
# round-trip) and nothing depends on how pandas handles a mixed list.
# to_numpy() sidesteps index alignment; predictions are in test-frame row order.
submission=pd.DataFrame({
    'id':unseenTestDF['id'].astype(int).to_numpy(),
    'revenue':np.asarray(unseenTestResult_XG,dtype='float64'),
})

submission.to_csv('submission.csv',index=False)
