**Import all necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
plt.style.use('fast')
!pip install missingno
import missingno as msngno 
%matplotlib inline
import ast
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [None]:
train = pd.read_csv('../input/tmdb-box-office-prediction/train.csv')
test = pd.read_csv('../input/tmdb-box-office-prediction/test.csv')
train_extra = pd.read_csv('../input/tmdb-competition-additional-features/TrainAdditionalFeatures.csv')
test_extra = pd.read_csv('../input/tmdb-competition-additional-features/TestAdditionalFeatures.csv')

train = pd.merge(train, train_extra, how='left', on=['imdb_id'])
test = pd.merge(test, test_extra, how='left', on=['imdb_id'])



**Loading the dataset (train and test)**
I have also added additional features from an external source (for both train and test).

**Note:** When the data is on the local machine, it is easier to load. The data was, however, loaded through a GitHub link, so it might take a while :)


In [None]:
train.shape, test.shape

In [None]:
train_extra.shape, test_extra.shape

In [None]:
train.head()

**Checking the duplicates

In [None]:
sum(train.duplicated()), sum(test.duplicated())

**Fixing the values that were missing/wrong for the complete dataset
These were pre-released in the discussion section

In [None]:
# Manual corrections for the train set, keyed by the movie's `id`.
# Where the original list assigned the same id twice, only the value that
# ended up in effect (the later assignment) is kept here.
train_revenue_fixes = {
    16: 192864,        # Skinning
    313: 12000000,     # The Cookout
    451: 12000000,     # Chasing Liberty
    1865: 25000000,    # Scooby-Doo 2: Monsters Unleashed
}

train_budget_fixes = {
    90: 30000000,      # Sommersby
    118: 60000000,     # Wild Hogs
    149: 18000000,     # Beethoven
    464: 20000000,     # Parenthood
    470: 13000000,     # The Karate Kid, Part II
    513: 1100000,      # From Prada to Nada (first listed as 930000, then overridden)
    797: 8000000,      # Welcome to Dongmakgol
    819: 90000000,     # Alvin and the Chipmunks: The Road Chip
    850: 1500000,      # Modern Times (first listed as 90000000, then overridden)
    1007: 2,           # Zyzzyx Road
    1112: 7500000,     # An Officer and a Gentleman
    1131: 4300000,     # Smokey and the Bandit
    1359: 10000000,    # Stir Crazy
    1542: 1,           # All at Once
    1570: 15800000,    # Crocodile Dundee II
    1571: 4000000,     # Lady and the Tramp
    1714: 46000000,    # The Recruit
    1721: 17500000,    # Cocoon
    1885: 12,          # In the Cut
    2091: 10,          # Deadfall
    2268: 17500000,    # Madea Goes to Jail budget
    2491: 6,           # Never Talk to Strangers
    2602: 31000000,    # Mr. Holland's Opus
    2612: 15000000,    # Field of Dreams
    2696: 10000000,    # Nurse 3-D
    2801: 10000000,    # Fracture
    335: 2,
    348: 12,
    640: 6,
    696: 1,
    1199: 5,
    1282: 9,           # Death at a Funeral
    1347: 1,
    1755: 2,
    1801: 5,
    1918: 592,
    2033: 4,
    2118: 344,
    2252: 130,
    2256: 1,
}

for movie_id, amount in train_revenue_fixes.items():
    train.loc[train['id'] == movie_id, 'revenue'] = amount

for movie_id, amount in train_budget_fixes.items():
    train.loc[train['id'] == movie_id, 'budget'] = amount


# Manual budget corrections for the test set, keyed by the movie's `id`.
test_budget_fixes = {
    6733: 5000000, 3889: 15000000, 6683: 50000000, 5704: 4300000,
    6109: 281756, 7242: 10000000,
    7021: 17540562,    # Two Is a Family
    5591: 4000000,     # The Orphanage
    4282: 20000000,    # Big Top Pee-wee
    3033: 250, 3051: 50, 3084: 337, 3224: 4, 3594: 25, 3619: 500,
    3831: 3, 3935: 500, 4049: 995946, 4424: 3, 4460: 8, 4555: 1200000,
    4624: 30, 4645: 500, 4709: 450, 4839: 7,
    3125: 25, 3142: 1, 3201: 450, 3222: 6, 3545: 38, 3670: 18,
    3792: 19, 3881: 7, 3969: 400, 4196: 6, 4221: 11, 4222: 500,
    4285: 11, 4319: 1, 4639: 10, 4719: 45, 4822: 22, 4829: 20, 4969: 20,
    5021: 40, 5035: 1, 5063: 14, 5119: 2, 5214: 30, 5221: 50,
    4903: 15, 4983: 3, 5102: 28, 5217: 75, 5224: 3, 5469: 20,
    5840: 1, 5960: 30,
    # NOTE(review): id 6506 was listed twice in the original (11, then 118).
    # The later value is what took effect, so it is kept; confirm which one
    # was intended.
    6506: 118,
    6553: 280, 6561: 7, 6582: 218, 6638: 5, 6749: 8, 6759: 50,
    6856: 10, 6858: 100, 6876: 250, 6972: 1, 7079: 8000000, 7150: 118,
    7225: 6, 7231: 85,
    5222: 5, 5322: 90, 5350: 70, 5378: 10, 5545: 80, 5810: 8,
    5926: 300, 5927: 4, 5986: 1, 6053: 20, 6104: 1, 6130: 30,
    6301: 150, 6276: 100, 6473: 100, 6842: 30,
}

for movie_id, amount in test_budget_fixes.items():
    test.loc[test['id'] == movie_id, 'budget'] = amount

**Checking for null values for both train and test dataset**

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

**Plotting the missing values to get a clear picture of how many variables have missing values

In [None]:
# Ten columns with the most missing values, for train and then for test.
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments.
missing_values = train.isna().sum().sort_values(ascending=False)
sns.barplot(x=missing_values[:10], y=missing_values[:10].index, palette="rocket")
plt.show()

missing_values1 = test.isna().sum().sort_values(ascending=False)
sns.barplot(x=missing_values1[:10], y=missing_values1[:10].index, palette="rocket")
plt.show()

**Mapping the plot for the number of non missing values 

In [None]:
msngno.bar(train)

**Visualising correlation between missing values in different columns
(might be missing values in the same row)

In [None]:
msngno.heatmap(train)

In [None]:
train.describe()

**Various columns in the dataset are in JSON format; converting them to dictionaries**

In [None]:
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']


def _parse_literal_cell(value):
    """Turn one raw cell into a Python object.

    Strings are parsed with ``ast.literal_eval``; missing values become the
    empty-dict placeholder ``{}``; cells that are already a list or dict are
    returned unchanged so that the conversion can safely be run twice.
    """
    if isinstance(value, (list, dict)):
        return value
    if pd.isna(value):
        return {}
    return ast.literal_eval(value)


def text_to_dict(df, columns=None):
    """Parse the stringified list/dict columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold Python-literal strings (or NaN).
    columns : iterable of str, optional
        Columns to convert. Defaults to the module-level ``dict_columns``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns converted.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid literal.
    """
    if columns is None:
        columns = dict_columns
    for column in columns:
        df[column] = df[column].apply(_parse_literal_cell)
    return df
        
train = text_to_dict(train)
test = text_to_dict(test)

**List out the different values in the columns
These columns were all in JSON format and to have a clear picture I needed to look through the details of what information each column has that will be used for our modelling phase

In [None]:
# Print the first two parsed entries of each dictionary-style column.
# Every preview comes from train, except one extra look at test['genres'].
previews = [
    (train, 'belongs_to_collection'),
    (test, 'genres'),
    (train, 'genres'),
    (train, 'spoken_languages'),
    (train, 'Keywords'),
    (train, 'production_countries'),
    (train, 'production_companies'),
    (train, 'cast'),
    (train, 'crew'),
]

for frame, column in previews:
    for count, value in enumerate(frame[column][:2]):
        print(count, value)

**The release date column was in an unstructured format. To analyse the data and its features, I have split the column into three separate columns giving the year, the month and the day of the week respectively.**

In [None]:
# Split release_date into year, month and day-of-week columns for both
# frames. The date string is parsed once per frame and reused.
# Note: release_day holds the day of the week (.dt.dayofweek), not the
# day of the month.
for frame in (train, test):
    parsed_dates = pd.to_datetime(frame['release_date'])
    frame['release_year'] = parsed_dates.dt.year
    frame['release_month'] = parsed_dates.dt.month
    frame['release_day'] = parsed_dates.dt.dayofweek


**Imputing the missing values for Rating, TotalVotes, and Runtime with the average values in the dataset**
NOTE: The votes can't be decimal, hence I have rounded them to the nearest integer.

In [None]:
# Fill missing rating / totalVotes / runtime values with the mean of the
# same column in the same frame (train uses train means, test uses test
# means). totalVotes is then rounded so that it stays a whole number.
for frame in (train, test):
    for column in ('rating', 'totalVotes', 'runtime'):
        column_mean = frame[column].mean()
        frame[column] = frame[column].fillna(column_mean)
    frame['totalVotes'] = frame['totalVotes'].round()

In [None]:
train.head()

**Extracting the required data from the converted dictionary columns 

In [None]:
def _first_entry_name(entries):
    """Return the 'name' of the first entry, or 0 for the empty-dict placeholder."""
    return entries[0]['name'] if entries != {} else 0


# Number of genres per movie (0 when the genres cell is the {} placeholder).
train['numberofgenres'] = train['genres'].apply(
    lambda entries: 0 if entries == {} else len(entries))

# New column -> source column; each new column keeps only the first entry's name.
train_first_name_columns = {
    'nameofcollection': 'belongs_to_collection',
    'genres_info': 'genres',
    'language': 'spoken_languages',
    'production_country': 'production_countries',
    'production_company': 'production_companies',
}
for new_column, source_column in train_first_name_columns.items():
    train[new_column] = train[source_column].apply(_first_entry_name)

In [None]:
# Same derived columns as for train, computed from the test frame itself.
# The genre count previously read train['genres'], so test rows received the
# counts of unrelated train rows (and NaN where the indexes did not overlap).
# With the correct source no NaN is produced, so the NaN -> 0 replacement
# that followed is no longer needed.
test['numberofgenres'] = test['genres'].apply(lambda x: len(x) if x != {} else 0)
# For each list-valued column keep the first entry's name, or 0 when the
# cell is the {} placeholder used for missing values.
test['nameofcollection'] = test['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else 0)
test['genres_info'] = test['genres'].apply(lambda x: x[0]['name'] if x != {} else 0)
test['language'] = test['spoken_languages'].apply(lambda x: x[0]['name'] if x != {} else 0)
test['production_country'] = test['production_countries'].apply(lambda x: x[0]['name'] if x != {} else 0)
test['production_company'] = test['production_companies'].apply(lambda x: x[0]['name'] if x != {} else 0)

**Exploring skewness of the Budget

In [None]:
train = train.astype({"budget":'Int32',})
fig, ax = plt.subplots(1,2,figsize=(12,5))
sns.distplot(train['budget'],ax=ax[0],color='black')
sns.distplot(np.log1p(train['budget']),ax=ax[1],color='black')


**Exploring the skewness of the target variable "Revenue"

In [None]:
# Revenue distribution, raw (left) and log-transformed (right).
# log1p is used so the plot shows the same transform that is later applied
# to the target, and so a zero revenue would not produce -inf.
fig, ax = plt.subplots(1,2,figsize=(12,5))
sns.distplot(train['revenue'],ax=ax[0],color='black')
sns.distplot(np.log1p(train['revenue']),ax=ax[1],color='black')


**Feature Engineering (Test and Train)

In [None]:
# 1 when the original language is English, otherwise 0.
train['lang_english'] = (train['original_language'].astype(str) == "en").astype('int64')

# Ratio features: (numerator + 1) / (denominator + 1).
train_ratio_features = {
    'bud_runtime': ('budget', 'runtime'),
    'bud_year': ('budget', 'release_year'),
    'bud_popularity': ('budget', 'popularity'),
    'runtime_year': ('runtime', 'release_year'),
    'popularity_year': ('popularity', 'release_year'),
}
for feature, (numerator, denominator) in train_ratio_features.items():
    train[feature] = (train[numerator] + 1) / (train[denominator] + 1)

In [None]:
# 1 when the original language is English, otherwise 0.
test['lang_english'] = (test['original_language'].astype(str) == "en").astype('int64')

# Ratio features: (numerator + 1) / (denominator + 1).
test_ratio_features = {
    'bud_runtime': ('budget', 'runtime'),
    'bud_year': ('budget', 'release_year'),
    'bud_popularity': ('budget', 'popularity'),
    'runtime_year': ('runtime', 'release_year'),
    'popularity_year': ('popularity', 'release_year'),
}
for feature, (numerator, denominator) in test_ratio_features.items():
    test[feature] = (test[numerator] + 1) / (test[denominator] + 1)

**Encoding of some variables as per our requirement (Train and Test)

In [None]:
# Replace each text column with a presence flag: 0 when the value was
# missing, 1 otherwise.
for column in ('homepage', 'overview', 'poster_path', 'tagline'):
    filled = train[column].fillna(0)
    train[column] = filled.apply(lambda value: 0 if value == 0 else 1)


In [None]:
# Replace each text column with a presence flag: 0 when the value was
# missing, 1 otherwise.
# Each flag is derived from its own column. Previously poster_path was
# filled from the already-encoded 'overview' column, so it merely copied
# the overview flag.
for column in ('homepage', 'overview', 'poster_path', 'tagline'):
    test[column] = test[column].fillna(0)
    test[column] = test[column].apply(lambda x: 0 if x==0 else 1)

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

**There were a few missing values in the test set, as seen above, in the date columns**
Imputing year, month and day with the mode, and the continuous variables with the mean

In [None]:
# Date parts are filled with the most frequent value of the column.
for column in ('release_year', 'release_month', 'release_day'):
    most_common = test[column].mode()[0]
    test[column] = test[column].fillna(most_common)

# Ratio features that were built from release_year are filled with the
# column mean.
for column in ('bud_year', 'popularity_year', 'runtime_year'):
    test[column] = test[column].fillna(test[column].mean())

**Release Year VS Revenue

In [None]:
train.groupby('release_year')['revenue'].mean().plot(color='black')
plt.xlabel('Release_year',fontsize = 10)
plt.ylabel('Revenue',fontsize = 10);
plt.title("Release Year vs Revenue")

**We have revenue for movies up to 2060!! Fishy :)**

In [None]:
train =train.loc[(train['release_year']<=2020)]

In [None]:
train.groupby('release_year')['revenue'].mean().plot(color='black')
plt.xticks(np.arange(1970,2020,4),rotation=45)
plt.xlabel('Release_year',fontsize = 10)
plt.ylabel('Revenue',fontsize = 10);
plt.title("Release Year vs Revenue")

**Release Month VS Revenue

In [None]:
train.groupby('release_month')['revenue'].mean().plot(color='black')
plt.xlabel('Release_month',fontsize = 10)
plt.ylabel('Revenue',fontsize = 10)
plt.title("Release Month vs Revenue")

**Release Day VS Revenue

In [None]:
train.groupby('release_day')['revenue'].mean().plot(color='black')
plt.xlabel('Release_day',fontsize = 10)
plt.ylabel('Revenue',fontsize = 10)
plt.title("Release Day vs Revenue")

**Movie Released VS Year

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(x='release_year',data=train,palette="rocket")
plt.title("Movie Released VS Year",fontsize=15)
loc, labels = plt.xticks()
plt.xlabel('Year',fontsize = 10)
plt.ylabel('Number of Movies',fontsize = 10)
plt.xticks(fontsize=15,rotation='vertical')
plt.show()

**Movie Released VS Day of week

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(x='release_day',data=train,palette="rocket")
loc, labels = plt.xticks()
loc, labels = loc, ['Monday','Tuesday','Wednesday','Thursday',"Friday","Saturday","Sunday"]
plt.xticks(loc, labels,fontsize=15)
plt.xlabel('Release_day',fontsize = 10)
plt.ylabel('Number of Movies',fontsize = 10)
plt.title("Movies released VS Day of week",fontsize=15)
plt.show()

**Revenue VS Budget

In [None]:
train[['budget','revenue']].plot(kind='scatter',x='budget',y='revenue',figsize=(12,8))
plt.title("Revenue VS Budget",fontsize=15)
plt.xlabel('Budget',fontsize = 10)
plt.ylabel('Revenue',fontsize = 10)
plt.show()

**Revenue of movies VS Status

In [None]:
# catplot is a figure-level function and creates its own figure, so a
# preceding plt.figure() call only leaves an empty figure behind. The
# 10x5 size is requested through height/aspect instead.
sns.catplot(x='status', y='revenue', data=train, palette='rocket', height=5, aspect=2)
plt.title('Revenue of movies VS Status',fontsize=13);
plt.show()

In [None]:
train.describe()

**Dropping down the columns

In [None]:
# Columns removed before modelling. The same list is dropped from both
# frames so that they keep matching feature columns.
columns_to_drop = [
    'belongs_to_collection', 'genres', 'production_companies', 'nameofcollection',
    'imdb_id', 'overview', 'poster_path', 'tagline', 'original_title',
    'original_language', 'release_date', 'language', 'production_countries',
    'cast', 'spoken_languages', 'Keywords', 'status', 'genres_info',
    'production_company', 'production_country', 'title', 'crew', 'popularity2',
]

train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)


In [None]:
# Pairwise view of revenue, totalVotes and popularity: scatter plots above
# the diagonal, histograms on it, KDE contours below.
# `height` replaces the old `size` argument, which seaborn renamed.
gridpl = sns.PairGrid(data= train,vars = ['revenue', 'totalVotes', 'popularity'], height = 3)

gridpl = gridpl.map_upper(plt.scatter, color = 'black')
gridpl = gridpl.map_diag(plt.hist, bins = 15, color = 'black', edgecolor = 'k')
gridpl = gridpl.map_lower(sns.kdeplot, cmap = 'Reds')


**Taking the log values of "Budget" and "Revenue" in the train section and "Budget" in the test section as they were skewed and not normally distributed

In [None]:
# Replace budget (train and test) and the target revenue (train only) with
# log1p(value) = log(1 + value).
# Caution: the columns are overwritten in place, so running this cell a
# second time applies the transform twice.
test['budget']=np.log1p(test['budget'])
train['budget']=np.log1p(train['budget'])

# Predictions for the log1p-transformed target are mapped back to revenue
# with np.expm1.
train['revenue']=np.log1p(train['revenue'])

In [None]:
train.head()

In [None]:
test.head()

**Checking the correlation of the variables with the target variable "Revenue"

In [None]:
train.corr().revenue

**MODELLING

In [None]:
y=train.revenue
X=train.drop( columns= ['revenue'])

**Splitting the data in 80:20 (Train and Validation)

In [None]:
X.shape,y.shape,test.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val= train_test_split(X,y,test_size=0.2,random_state=39)

**LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
lr_model=LinearRegression()
lr_model.fit(X_train,y_train)
lr_pred=lr_model.predict(X_val)

In [None]:
# Validation RMSE on the log1p revenue scale. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated in newer scikit-learn releases.
lr_rmse = sqrt(mean_squared_error(y_val, lr_pred))
lr_rmse

In [None]:
corr = train.corr()
corr.style.background_gradient(cmap='coolwarm')

Fitting the model on the entire dataset

In [None]:
# Refit on all training rows and predict the test set.
lr_model.fit(X,y)
lr_pred_full = lr_model.predict(test)
# The target was transformed with np.log1p, so the matching inverse is
# np.expm1 (np.exp would overstate every prediction by 1).
lr_pred_full = np.expm1(lr_pred_full)
rand_lr = pd.DataFrame(lr_pred_full)

# Pair each prediction with its test id and write the submission file.
linear_regression = pd.concat([test["id"],rand_lr], axis =1)
linear_regression.columns = ['id', 'revenue']
linear_regression.to_csv(r'linear_regression.csv',index = False)


In [None]:
# Print each fitted coefficient next to the feature it belongs to.
# The names come from X (the frame the model was fitted on). train.columns
# still contains the target 'revenue', which shifted every name after it
# by one position.
reg = LinearRegression().fit(X, y)
for i,j in zip(reg.coef_, X.columns):
    print(i,j)

**RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# max_features=1.0 (use all features at every split) is what 'auto' meant
# for a random-forest regressor; the 'auto' string has since been removed
# from scikit-learn.
rfr_model = RandomForestRegressor(random_state=42, 
                                  max_features=1.0, 
                                  n_estimators= 50, 
                                  min_samples_leaf=2)
rfr_model.fit(X_train,y_train)
rfr_pred = rfr_model.predict(X_val)

In [None]:
# Validation RMSE on the log1p revenue scale. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated in newer scikit-learn releases.
rfr_rmse = sqrt(mean_squared_error(y_val, rfr_pred))
rfr_rmse

Fitting the model on the entire dataset

In [None]:
# Refit on all training rows and predict the test set.
rfr_model.fit(X,y)
rfr_pred_full = rfr_model.predict(test)
# The target was transformed with np.log1p, so the matching inverse is
# np.expm1 (np.exp would overstate every prediction by 1).
rfr_pred_full = np.expm1(rfr_pred_full)
rand_rf = pd.DataFrame(rfr_pred_full)

# Pair each prediction with its test id and write the submission file.
random_forest = pd.concat([test["id"],rand_rf], axis =1)
random_forest.columns = ['id', 'revenue']
random_forest.to_csv(r'random_forest.csv',index = False)


**XGBoost

In [None]:
import xgboost as xgb
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xgb_model = xgb.XGBRegressor(objective = 'reg:squarederror',
                                  eta = 0.01,
                                  max_depth = 3,
                                  min_child_weight = 3,
                                  subsample = 0.8,
                                  gamma = 1.45,
                                  colsample_bytree = 0.7,
                                  eval_metric = 'rmse',
                                  seed = 42,
                                  n_estimators = 3000)

In [None]:
# Fit on the training split and score on the validation split.
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_val)
# RMSE on the log1p revenue scale. The square root is taken explicitly
# because the `squared=False` argument of mean_squared_error is deprecated
# in newer scikit-learn releases.
xgb_rmse = sqrt(mean_squared_error(y_val, xgb_pred))
xgb_rmse

**Hyperparameter Tuning - GRID Search

In [None]:
# from sklearn.model_selection import GridSearchCV
# from xgboost.sklearn import XGBRegressor


# xgb = XGBRegressor()
# parameters = {
#               'objective':['reg:linear'],
#               'eta': [0.01, 0.03, 0.05, 0.07], 
#               'max_depth': [3, 5, 6, 7],
#               'min_child_weight': [3, 4, 5],
#               'subsample': [0.6, 0.7, 0.8],
#               'colsample_bytree': [0.5, 0.6, 0.7],
#               'eval_metric': ['rmse'],
#               'n_estimators': [1000,2000,3000,3500] }

# xgb_grid = GridSearchCV(xgb,parameters,cv = 2,n_jobs = 5,verbose=True)

# xgb_grid.fit(X,y)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

Fitting the model on the entire dataset

In [None]:
# Refit on all training rows and predict the test set.
xgb_model.fit(X, y)
xgb_pred_full = xgb_model.predict(test)
# The target was transformed with np.log1p, so the matching inverse is
# np.expm1 (np.exp would overstate every prediction by 1).
xgb_pred_full = np.expm1(xgb_pred_full)
rand_xgb = pd.DataFrame(xgb_pred_full)

# The submission frame gets its own name. Binding it to `xgb` would replace
# the imported xgboost module, and the model-definition cell would then
# fail if it were run again.
xgb_submission = pd.concat([test["id"],rand_xgb], axis =1)
xgb_submission.columns =['id', 'revenue']
xgb_submission.to_csv(r'xgb.csv',index = False)

**LightGBM**

In [None]:
import lightgbm as lgb
lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=1023,
                              learning_rate=0.005, n_estimators=650,
                              max_bin=58, bagging_fraction=0.80,max_depth=10,
                              bagging_freq=5, feature_fraction=0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf=7, min_sum_hessian_in_leaf=11)


In [None]:
# Fit on the training split and score on the validation split.
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_val)
# RMSE on the log1p revenue scale. The square root is taken explicitly
# because the `squared=False` argument of mean_squared_error is deprecated
# in newer scikit-learn releases.
lgb_rmse = sqrt(mean_squared_error(y_val, lgb_pred))
lgb_rmse

Fitting the model on the entire dataset

In [None]:
# Refit on all training rows and predict the test set.
lgb_model.fit(X, y)
lgb_pred_full = lgb_model.predict(test)
# The target was transformed with np.log1p, so the matching inverse is
# np.expm1 (np.exp would overstate every prediction by 1).
lgb_pred_full = np.expm1(lgb_pred_full)
rand_lgb = pd.DataFrame(lgb_pred_full)

# The submission frame gets its own name. Binding it to `lgb` would replace
# the imported lightgbm module, and the model-definition cell would then
# fail if it were run again.
lgb_submission = pd.concat([test["id"],rand_lgb], axis =1)
lgb_submission.columns =['id', 'revenue']
lgb_submission.to_csv(r'lgb.csv',index = False)
