In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import ast
from wordcloud import WordCloud

In [None]:
train = pd.read_csv('../input/tmdb-box-office-prediction/train.csv')
test = pd.read_csv('../input/tmdb-box-office-prediction/test.csv')

In [None]:
train.info()

In [None]:
# Give the test rows a missing target so train and test can be stacked into a
# single frame and separated again later through revenue.isnull().
test["revenue"] = np.nan
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)

In [None]:
data.describe()

In [None]:
# Unique-value count of every object (string) column
Object_col = data.select_dtypes('object').columns

# Keep the counts numeric. Packing names and counts into one np.array coerces
# the counts to strings, and matplotlib then draws them as categories rather
# than as bar heights.
Analysis = data[Object_col].nunique()

plt.figure(figsize = (10,10))
plt.bar(x = Analysis.index.astype(str),height = Analysis.values,width = 0.6,color = "red",edgecolor = "black")
plt.gca().invert_xaxis()
plt.xlabel("Features")
plt.ylabel("Unique Count")
plt.title("Distribution of Object Column")
plt.xticks(rotation = 50)
print("Total Size of dataset: {}".format(len(data)))


In [None]:
# Lets Deal with Every category one by one
object_columns = data.select_dtypes('object').columns
print(object_columns)

**Lets Deal with "belongs_to_collection" first.**

In [None]:
for idx,instance in enumerate(data.loc[:3,'belongs_to_collection']):
    print(idx,instance)

In [None]:
# Turn every raw 'belongs_to_collection' entry into the collection's name.
# String entries hold a Python-literal list of dicts; anything else (NaN is a
# float) means the movie is not part of a collection.
Value = [
    ast.literal_eval(raw)[0]['name'] if isinstance(raw, str) else np.nan
    for raw in data['belongs_to_collection']
]

In [None]:
data['belongs_to_collection'] = Value

In [None]:
data['belongs_to_collection'].head(5)

In [None]:
x, y = np.ogrid[:1000, :1000]

mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)
Count = data['belongs_to_collection'].value_counts()
plt.figure(figsize = (15,15))
wordcloud = WordCloud(background_color="white",width=1920, height=1080,mask = mask).generate_from_frequencies(Count)
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis('off')
plt.show()

**Lets deal with 'Genre' Now:**

In [None]:
for idx,instance in enumerate(data.loc[:10,'genres']):
    print(idx,instance)

In [None]:
# Collect every genre name (Value) and the number of genres per movie (Count)
Value, Count = [], []
for raw in data['genres']:
    # Missing entries are NaN (float), so they contribute zero genres
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else []
    Count.append(len(parsed))
    Value.extend(str(entry['name']) for entry in parsed)

In [None]:
# Circular mask: pixels outside a radius-400 circle are set to 255 (masked out)
x, y = np.ogrid[:1000, :1000]
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)

# Count the genre names with a Series so each count keeps its own label.
# Overwriting the index with a hand-typed list would silently mislabel the
# counts whenever the frequency order differs from the typed order.
Total_Genre = pd.Series(Value).value_counts()

plt.figure(figsize = (15,15))
plt.subplot(121)
wordcloud = WordCloud(background_color="white",mask=mask,width=2000, height=1180).generate_from_frequencies(Total_Genre)
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis('off')

plt.subplot(122)
Total_Genre.plot(kind = 'bar',color = 'blue',edgecolor = 'black')
plt.show()

# Drama And Comedy are the most Repeating ones

In [None]:
data['genres'] = Count

**Lets Deal with "homepage" now.**

In [None]:
for idx,instance in enumerate(data.loc[:10,'homepage']):
    print(idx,instance)

In [None]:
data['homepage'].value_counts()   # This we can Remove. Because its given in dataset description that
# even though two different movies have same homepage they must be considered as two different movies.

In [None]:
data.drop(['homepage'],axis = 1,inplace = True)

**Let's deal with "imdb_id" now:**

In [None]:
for idx,instance in enumerate(data.loc[:10,'imdb_id']):
    print(idx,instance)

In [None]:
# Prints True only when every non-null imdb_id occurs exactly once.
# np.any(counts == 1) would already be True if a single id were unique, so it
# cannot justify dropping the column as an identifier.
print(data['imdb_id'].dropna().is_unique)

In [None]:
data.drop(['imdb_id'],axis = 1,inplace = True)

**Lets Deal with "Original language" now:**

In [None]:
for idx,instance in enumerate(data.loc[:5,'original_language']):
    print(idx,instance)

In [None]:
data['original_language'].value_counts().head(5)    # We can simply use Linear Encoding in here

In [None]:
for idx,instance in enumerate(data.loc[:10,'original_title']):
    print(idx,instance)

In [None]:
data['original_title'].value_counts()  # I can remove this same reason for homepage

In [None]:
data.drop(['original_title'],axis = 1,inplace = True)

In [None]:
for idx,instance in enumerate(data.loc[:1,'overview']):
    print(idx,instance)

In [None]:
data.drop(['overview'],axis = 1,inplace = True)

**Lets see "poster_path" now:**

In [None]:
for idx,instance in enumerate(data.loc[:1,'poster_path']):
    print(idx,instance)

In [None]:
# Prints True only when every non-null poster_path occurs exactly once.
# np.any(counts == 1) would be True as soon as one path is unique, which does
# not show that the whole column is unique.
print(data['poster_path'].dropna().is_unique)

In [None]:
data.drop(['poster_path'],axis = 1,inplace = True)

In [None]:
for idx,instance in enumerate(data.loc[:10,'production_companies']):
    print(idx,instance)

In [None]:
# Gather every production-company name (Value) and, per movie, how many
# companies are listed (Count); non-string entries (NaN) count as zero.
Value = []
Count = []
for raw in data['production_companies']:
    if not isinstance(raw, str):
        Count.append(0)
        continue
    companies_of_movie = ast.literal_eval(raw)
    Count.append(len(companies_of_movie))
    Value += [str(company['name']) for company in companies_of_movie]

In [None]:
pd.DataFrame(Value).value_counts().index[:5] # We have to work on it and bring in right format

In [None]:
Correct_index = []
for index in pd.DataFrame(Value).value_counts().index:
    Correct_index.append(index[0])
print(Correct_index[:5])

In [None]:
# Circular mask: pixels outside a radius-400 circle are set to 255 (masked out)
x, y = np.ogrid[:1000, :1000]
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)

# Series.value_counts keeps plain string labels, so the counts need no index
# patch-up from a list built in another cell.
companies = pd.Series(Value).value_counts()

plt.figure(figsize = (15,15))
wordcloud = WordCloud(background_color="white",mask=mask,width=2000, height=1180).generate_from_frequencies(companies)
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis('off')

# Most pictures are from Paramount Pictures, Warner Bros. and Universal Pictures

In [None]:
data['production_companies'] = Count

**Lets See "production_countries" now**

In [None]:
for idx,instance in enumerate(data.loc[:10,'production_countries']):
    print(idx,instance)

In [None]:
# Same treatment as for the production companies:
# Value collects every country name, Count the number of countries per movie.
Value, Count = [], []
for raw in data['production_countries']:
    countries_of_movie = ast.literal_eval(raw) if isinstance(raw, str) else ()
    Count.append(len(countries_of_movie))
    for country in countries_of_movie:
        Value.append(str(country['name']))

In [None]:
# One row per (movie, country) mention, in a column called "Name"
Value = pd.DataFrame(Value).rename(columns = {0:"Name"})
# Count on the column so the result is indexed by plain country names.
# DataFrame.value_counts would return 1-tuples, which show up as "(name,)"
# tick labels on the bar chart.
df = Value["Name"].value_counts()

In [None]:
df.loc[df > 50].plot(kind = "bar",color = "blue",edgecolor = "black",figsize = (20,5)) 
plt.xticks(rotation = 50)
plt.xlabel("Production Countries")
plt.ylabel("Count of Total Movies")
# Plotting the the Countries who created more than 50 movies

In [None]:
data['production_countries'] = Count

**Dealing with "release_date" attribute:**

In [None]:
for idx,instance in enumerate(data.loc[:20,'release_date']):
    print(idx,instance)

In [None]:
data['release_date'] = pd.to_datetime(data['release_date'],format="%m/%d/%y")

# %y follows the POSIX pivot: two-digit years 69-99 become 19xx but 00-68
# become 20xx, so a film released in e.g. 1955 is parsed as 2055.
# Any date later than the newest plausible release is moved back one century.
# NOTE(review): assumes no film in the data was released after 2019 - confirm.
LATEST_RELEASE_YEAR = 2019
wrong_century = data['release_date'].dt.year > LATEST_RELEASE_YEAR
data.loc[wrong_century,'release_date'] = data.loc[wrong_century,'release_date'] - pd.DateOffset(years=100)

# format ==> https://strftime.org/

In [None]:
data['release_date'].head(10)

In [None]:
for idx,instance in enumerate(data.loc[:5,'spoken_languages']):
    print(idx,instance)

In [None]:
# Same treatment as before:
# Value collects every spoken-language name, Count the number of languages per movie.
Value = []
Count = []
for raw in data['spoken_languages']:
    names = [str(lang['name']) for lang in ast.literal_eval(raw)] if isinstance(raw, str) else []
    Count.append(len(names))
    Value.extend(names)

In [None]:
Value[:5]

In [None]:
Correct_index = []
for index in pd.DataFrame(Value).value_counts().index:
    Correct_index.append(index[0])
print(Correct_index[:5])

In [None]:
# Circular mask: pixels outside a radius-400 circle are set to 255 (masked out)
x, y = np.ogrid[:1000, :1000]
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)

# Series.value_counts keeps plain string labels, so the counts need no index
# patch-up from a list built in another cell.
Lang = pd.Series(Value).value_counts()

plt.figure(figsize = (7,7))
wordcloud = WordCloud(background_color="white",mask=mask,width=2000, height=1180).generate_from_frequencies(Lang)
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis('off')

# Most pictures are in English,Francais,Espanol usw

In [None]:
data['spoken_languages'] = Count

In [None]:
for idx,instance in enumerate(data.loc[:5,'status']):
    print(idx,instance)         # We can simply do Label Encoding in it

In [None]:
for idx,instance in enumerate(data.loc[:5,'tagline']):
    print(idx,instance)

In [None]:
data.drop(['tagline'],inplace = True,axis = 1)

In [None]:
for idx,instance in enumerate(data.loc[:5,'title']):
    print(idx,instance)

In [None]:
data['title'].value_counts() 
# We can remove it for the reason given earlier: even though "Ghost" appears 3 times, each instance must be treated as a different movie, so this attribute behaves like a unique identifier.

In [None]:
data.drop(['title'],inplace = True,axis = 1)

In [None]:
for idx,instance in enumerate(data.loc[:5,'Keywords']):
    print(idx,instance)

In [None]:
# Same treatment as before: keep only the number of keywords per movie
# (0 when the entry is missing, i.e. not a string).
Count = [
    len(ast.literal_eval(raw)) if isinstance(raw, str) else 0
    for raw in data['Keywords']
]

In [None]:
data['Keywords'] = Count

In [None]:
for idx,instance in enumerate(data.loc[:1,'cast']):
    print(idx,instance)

In [None]:
# Same treatment as before: keep only the number of cast members per movie
# (0 when the entry is missing, i.e. not a string).
Count = [
    len(ast.literal_eval(raw)) if isinstance(raw, str) else 0
    for raw in data['cast']
]

In [None]:
data['cast'] = Count

In [None]:
for idx,instance in enumerate(data.loc[:1,'crew']):
    print(idx,instance)

In [None]:
# Same treatment as before: keep only the number of crew members per movie
# (0 when the entry is missing, i.e. not a string).
Count = [
    len(ast.literal_eval(raw)) if isinstance(raw, str) else 0
    for raw in data['crew']
]

In [None]:
data['crew'] = Count

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Lets see Null values Now.
Null_Values = pd.DataFrame(data.isnull().sum()).rename(columns = {0:'Total'})
Null_Values['Percent'] = Null_Values['Total']/len(data)
Null_Values.sort_values('Percent',ascending=False).head()

In [None]:
# Remember, for every feature column that has missing values, which rows are
# missing - label encoding would otherwise turn the nulls into a regular class.
index = [col for col in Null_Values.index[Null_Values['Total'] >= 1]]
index.remove('revenue')   # the target is missing for all test rows by construction
Position = [[col, data.index[data[col].isnull()]] for col in index]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Lets Apply Label Encoding to object Columns
Object_Columns = data.select_dtypes('object').columns
data[Object_Columns] = data.select_dtypes('object').apply(LabelEncoder().fit_transform)

In [None]:
for instance in Position:
    data.loc[instance[1],instance[0]] = np.nan

In [None]:
# Extracting day,month,year,and quarter of the year
data['Release_month'] = data['release_date'].dt.month
data['Release_Year'] = data['release_date'].dt.year
data['Release_Day'] = data['release_date'].dt.day
data['Release_Quarter'] = data['release_date'].dt.quarter
data.drop(['release_date'],axis = 1,inplace = True)

In [None]:
# First Lets Compress our dataset
def Reduce_Me(dataset, allow_float16=True):
    """Downcast the numeric columns of ``dataset`` to the smallest dtype that holds them.

    The frame is modified in place and also returned. Memory usage before and
    after is printed in MB.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose integer and float columns should be shrunk. Columns with
        any other dtype (object, bool, datetime, extension dtypes) are left
        untouched.
    allow_float16 : bool, default True
        When True, float columns whose values fit are stored as float16
        (about 3 significant decimal digits). Pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    Initial = dataset.memory_usage().sum() / 1024**2
    print("Initial Memory : {:.2f} MB".format(Initial))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for column in dataset.columns:
        current = dataset[column].dtype
        if not isinstance(current, np.dtype):
            continue                      # pandas extension dtype: leave as is
        if current.kind in 'iu':
            candidates, limits = int_candidates, np.iinfo
        elif current.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            continue                      # not a plain numeric column

        # min/max skip NaN; for an all-NaN column they are NaN, every
        # comparison below is False and the column keeps its dtype.
        min_ = dataset[column].min()
        max_ = dataset[column].max()

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= current.itemsize:
                break                     # never widen a column
            bounds = limits(candidate)
            if min_ >= bounds.min and max_ <= bounds.max:
                dataset[column] = dataset[column].astype(candidate)
                break

    Final = dataset.memory_usage().sum() / 1024**2
    print("Final Memory : {:.2f} MB".format(Final))
    if Initial:
        print("Reduced By: {:.2f}%".format((Initial-Final)/Initial * 100))
    return dataset


In [None]:
data.drop(['id'],inplace = True,axis = 1)
data = Reduce_Me(data)

In [None]:
# Lets see how many Unique Values are there in Integer columns
Integer_Columns = [col for col in data if 'int' in str(data[col].dtypes)]
data[Integer_Columns].nunique().value_counts().sort_index().plot(kind = 'bar',figsize = (8,8),edgecolor = 'blue',color = 'red')
plt.xlabel('Unique Count if the Feature')
plt.ylabel('Number of Feature')
plt.show()

In [None]:
import warnings
warnings.filterwarnings('ignore',category = FutureWarning)

# Lets See the Distribution of Each of them
import seaborn as sns
plt.figure(figsize=(20,20))
color = ['red','green','blue','orange','pink','yellow']
# Three plots per row; at least the original 4 rows, more if there are more
# than 12 integer columns (a fixed 4x3 grid would raise on the 13th plot).
n_rows = max(4, -(-len(Integer_Columns) // 3))
for idx,col in enumerate(Integer_Columns):
    ax = plt.subplot(n_rows,3,idx+1)
    color_idx = np.random.randint(0,len(color))
    # Pass the values by keyword: recent seaborn releases reserve the first
    # positional argument for `data`.
    sns.violinplot(x=data[col],ax=ax,color=color[color_idx],linewidth=2)

    
# Its clear that the datapoints are highly skewed. We will see later if we should apply some transformations on it

In [None]:
# Lets see how they are related to our target Variable
import warnings
warnings.filterwarnings('ignore',category = FutureWarning)

# Scatter of every integer feature against revenue
import seaborn as sns
plt.figure(figsize=(20,20))
color = ['red','green','blue','orange','pink','yellow']
# Three plots per row; at least the original 4 rows, more if there are more
# than 12 integer columns (a fixed 4x3 grid would raise on the 13th plot).
n_rows = max(4, -(-len(Integer_Columns) // 3))
for idx,col in enumerate(Integer_Columns):
    ax = plt.subplot(n_rows,3,idx+1)
    color_idx = np.random.randint(0,len(color))
    # x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
    sns.scatterplot(x=data[col],y=data['revenue'],ax=ax,color=color[color_idx],linewidth=2)

    
# No Clear Correlation are Visible. Budget and revenue seems to be correlated

In [None]:
import warnings
warnings.filterwarnings('ignore',category = FutureWarning)

# Every float column except the target
Float_columns = [col for col in data if 'float' in str(data[col].dtypes)]
Float_columns.remove('revenue')
# Lets See the Distribution of Float columns Each of them
import seaborn as sns
plt.figure(figsize=(20,20))
color = ['red','green','blue','orange','pink','yellow']
# Three plots per row; at least the original 4 rows, more if needed.
n_rows = max(4, -(-len(Float_columns) // 3))
for idx,col in enumerate(Float_columns):
    ax = plt.subplot(n_rows,3,idx+1)
    color_idx = np.random.randint(0,len(color))
    # The values go in by keyword: recent seaborn releases reserve the first
    # positional argument for `data`. (The mean/median that used to be computed
    # here were never used and have been removed.)
    sns.kdeplot(x=data[col],ax=ax,color=color[color_idx],linewidth=2)

    
# Some of them are Skewed. Some of them are almost gaussian centered at their respective mean

In [None]:
# Lets see how they are related to our target Variable
import warnings
warnings.filterwarnings('ignore',category = FutureWarning)

# Scatter of every float feature against revenue
import seaborn as sns
plt.figure(figsize=(20,20))
color = ['red','green','blue','orange','pink','yellow']
# Three plots per row; at least the original 4 rows, more if needed.
n_rows = max(4, -(-len(Float_columns) // 3))
for idx,col in enumerate(Float_columns):
    ax = plt.subplot(n_rows,3,idx+1)
    color_idx = np.random.randint(0,len(color))
    # x and y must be keywords: seaborn >= 0.12 rejects them as positionals.
    sns.scatterplot(x=data[col],y=data['revenue'],ax=ax,color=color[color_idx],linewidth=2)

    
# No Clear Correlation are Visible. Budget and revenue seems to be correlated

In [None]:
figure,ax = plt.subplots()
data.plot(x = "revenue",y = "runtime",ax = ax)
data.plot(x = "revenue",y = "Release_Year",ax = ax,secondary_y = True)

# It simply says at almost fixed runtime the revenue tends to increase similar argument for Release Year

In [None]:
figure,ax = plt.subplots()
data.plot(x = "revenue",y = "budget",ax = ax)
data.plot(x = "revenue",y = "cast",ax = ax,secondary_y = True)

In [None]:
# Lets see Null values Again.
Null_Values = pd.DataFrame(data.isnull().sum()).rename(columns = {0:'Total'})
Null_Values['Percent'] = Null_Values['Total']/len(data)
Null_Values.sort_values('Percent',ascending=False).head(10)

In [None]:
# Lets see what values we should Put in these
Null_Columns = list(Null_Values[Null_Values['Total'] >= 1].index)
Null_Columns.remove('revenue')
import warnings
warnings.filterwarnings('ignore',category = FutureWarning)

# Lets See the Distribution of Each of them
import seaborn as sns
plt.figure(figsize=(20,20))
color = ['red','green','blue','orange','pink','yellow']
for idx,col in enumerate(Null_Columns):
    ax = plt.subplot(3,3,idx+1)
    color_idx = np.random.randint(0,len(color))
    sns.histplot(data[col],ax=ax,color=color[color_idx],linewidth=2)

# Lets fill them with the median Value 

In [None]:
# Lets Start Feature Engineering
# Lets first remove any redundant feature having correlation greater than 95%
# The target is left out of the matrix so it can never be flagged as redundant
# and dropped together with the features.
Correlation_Matrix = data.drop(columns=['revenue']).corr()
# Keep only the strict upper triangle so each pair is inspected once.
# (np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.)
Triu = Correlation_Matrix.where(np.triu(np.ones(Correlation_Matrix.shape),k = 1).astype(bool))
# A column is redundant when it correlates >= 0.95 with a column listed before it
Redundant_col = [col for col in Triu if (Triu[col].abs() >= 0.95).any()]

In [None]:
data.drop(Redundant_col,axis = 1,inplace = True)

In [None]:
data.info()

In [None]:
from sklearn.preprocessing import StandardScaler
OriginalLabel = data.loc[data['revenue'].notnull(),'revenue']
Columns = data.columns

In [None]:
# Standardize every column of `data` and rebuild a DataFrame with the column
# names saved in `Columns` (fit_transform returns a plain array).
# NOTE(review): the scaler is fitted on the combined train+test rows and on
# the 'revenue' column too; the unscaled target is kept in OriginalLabel.
# Confirm that fitting on the test rows is acceptable here.
scaler = StandardScaler()  # Null values will remain np.nan so no problem there
data = scaler.fit_transform(data)
data = pd.DataFrame(data,columns=list(Columns))

In [None]:
# Generate additional features automatically with featuretools.
# The scaled frame is registered as a single entity "data_tmdb"; make_index=True
# asks featuretools to create the index column 'TMDB_id'.
# NOTE(review): entity_from_dataframe looks like the pre-1.0 featuretools API
# (later releases appear to use add_dataframe) - confirm the installed version.
import featuretools as ft
es = ft.EntitySet(id = 'TMDB')
es = es.entity_from_dataframe(entity_id="data_tmdb",dataframe=data,make_index=True,index = 'TMDB_id')

In [None]:
es

In [None]:
es['data_tmdb'].variables

In [None]:
feature_matrix,feature_dfs = ft.dfs(entityset=es,target_entity="data_tmdb",trans_primitives=["add_numeric","cum_mean","cum_sum","percentile"],max_depth=2)

In [None]:
feature_matrix.shape

In [None]:
Label = feature_matrix["revenue"]
Remove_col = []
for col in feature_matrix:
    if "revenue" in str(col):
        Remove_col.append(col)
Remove_col[:5]

In [None]:
feature_matrix.drop(Remove_col,axis = 1,inplace = True)

In [None]:
feature_matrix.head()

In [None]:
# Flag generated features that correlate >= 0.95 with an earlier column
Correlation_Matrix = feature_matrix.corr()
# Keep only the strict upper triangle so each pair is inspected once.
# (np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.)
Triu = Correlation_Matrix.where(np.triu(np.ones(Correlation_Matrix.shape),k = 1).astype(bool))
Redundant_col = [col for col in Triu if (Triu[col].abs() >= 0.95).any()]

In [None]:
feature_matrix.drop(Redundant_col,axis = 1,inplace = True)

In [None]:
feature_matrix.head()

In [None]:
feature_matrix['revenue'] = Label
del Label

In [None]:
Train = feature_matrix.loc[feature_matrix['revenue'].notnull(),:]
Test = feature_matrix.loc[feature_matrix['revenue'].isnull(),:]

In [None]:
# Use the unscaled revenue saved before standardization as the training target
Label = OriginalLabel
# Train and Test are slices of feature_matrix; dropping in place on a slice
# triggers SettingWithCopyWarning, so build new frames instead.
# errors='ignore' keeps the cell re-runnable once the column is gone.
Train = Train.drop(columns=['revenue'],errors='ignore')
Test = Test.drop(columns=['revenue'],errors='ignore')

In [None]:
Features = Train.columns

In [None]:
# Lets start Machine Learning From Here
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score,make_scorer
#Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).
#A constant model that always predicts the expected value of y, disregarding the input features,
#would get a R^2 score of 0.0.

imputer = SimpleImputer(strategy="median")
score_fn = make_scorer(r2_score,greater_is_better=True)

In [None]:
Train = imputer.fit_transform(Train)
Test = imputer.transform(Test)

In [None]:
Label = Label.values

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.ensemble import RandomForestRegressor   # Our Base Model 
from sklearn.model_selection import cross_val_score

RandomForest = RandomForestRegressor(n_estimators=100,n_jobs=-1,random_state=42)
cv_score = cross_val_score(RandomForest,Train,Label,cv = 10,scoring=score_fn)

In [None]:
print("10 Fold Cross Validation Mean R2_SCORE: {0} with Deviation: {1}".format(round(np.mean(cv_score),3),round(cv_score.std(),3)))

In [None]:
RandomForest.fit(Train,Label)
feature_importance = pd.DataFrame({'Features':Features,'Importance':RandomForest.feature_importances_})

In [None]:
def Plot_Importance(df,count = 5,threshold = 0.95):
    """Plot normalized feature importances and their cumulative sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'Features' and an 'Importance' column. The frame passed in is
        not modified.
    count : int, default 5
        Number of most important features shown in the bar chart.
    threshold : float or None, default 0.95
        When truthy, a second chart shows the cumulative importance and marks
        how many features are needed to exceed this share.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by descending importance, with 'Importance'
        normalized to sum to 1 and an added 'cum_sum' column.
    """
    df = df.copy()   # do not normalize the caller's frame behind its back
    df['Importance'] = df['Importance']/df['Importance'].sum()
    df = df.sort_values('Importance',ascending = False).reset_index(drop = True)

    df['cum_sum'] = np.cumsum(df['Importance'])

    # head(count) shows exactly `count` bars; label-based .loc[:count] is
    # inclusive and would show one more.
    top = df.head(count)
    if not top.empty:
        top.plot(kind = 'barh',x = 'Features',y = 'Importance',color = 'green',
                 edgecolor = 'blue',figsize = (5,5),linewidth = 2)
        plt.xlabel("Normalized Importance")
        plt.xscale("log")
        plt.gca().invert_yaxis()

        plt.show()

    if threshold and len(df):
        above = np.flatnonzero(df['cum_sum'].to_numpy() > threshold)
        # If rounding keeps the cumulative sum from ever exceeding the
        # threshold, every feature is required.
        min_ = above[0] if above.size else len(df) - 1
        plt.xlabel('# Features', size = 10)
        plt.ylabel('Cumulative Importance', size = 10)
        plt.title('Cumulative Importance of Features', size = 10)
        plt.plot(np.arange(len(df)),df['cum_sum'],color = 'red')
        plt.vlines(min_ + 1,ymin = 0,ymax = 1,color = 'black',linestyles='dotted')
        plt.show()
        print('Number of Columns required for {0} threshold is: {1}'.format(threshold,min_+1))
    return df

In [None]:
feature_importance = Plot_Importance(feature_importance,count=10)

In [None]:
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore',category=ConvergenceWarning)
warnings.filterwarnings('ignore',category=UserWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)

Result = pd.DataFrame(columns = ['Model','CV_Mean','CV_Std'])

def Check_Model(Model,Name,cv = 10,WantResult = True):
    """Cross-validate ``Model`` on Train/Label and optionally log the score.

    Parameters
    ----------
    Model : estimator
        Unfitted scikit-learn style estimator.
    Name : str
        Label stored in the 'Model' column of the global ``Result`` table.
    cv : int or splitter, default 10
        Passed to ``cross_val_score``.
    WantResult : bool, default True
        When True, a row with the mean and standard deviation of the fold
        scores is added to the global ``Result`` frame.

    Returns
    -------
    pandas.DataFrame
        The global ``Result`` table (updated when ``WantResult`` is True), so
        ``Result = Check_Model(...)`` never replaces the table with None.
    """
    global Result
    cv_score = cross_val_score(Model,Train,Label,cv = cv,scoring=score_fn)
    cv_mean = round(np.mean(cv_score),3)
    cv_std = round(np.std(cv_score),3)
    # Report the number of folds actually run instead of a fixed "10"
    print("{0} Fold Cross Validation Mean R2_SCORE: {1} with Deviation: {2}".format(len(cv_score),cv_mean,cv_std))

    if WantResult:
        row = pd.DataFrame({'Model':Name,'CV_Mean':cv_mean,'CV_Std':cv_std},index = [0])
        # DataFrame.append was removed in pandas 2.0. The first row replaces
        # the empty table directly so no all-empty frame is concatenated.
        Result = row if Result.empty else pd.concat([Result,row],ignore_index=True)
    return Result

In [None]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import LinearSVR,SVR
from sklearn.neural_network import MLPRegressor

# Candidate models that the following cells score with Check_Model.
forest_regressor = RandomForestRegressor(n_estimators = 100,n_jobs = -1,random_state=42)
L_Regress = LinearRegression()
# NOTE(review): LogisticRegression is a classifier, while Check_Model fits the
# models on Label, the continuous revenue. This looks unintended - consider a
# regressor here (and rename the variable / result label accordingly).
Log_Regress = LogisticRegression(n_jobs = -1)
L_SVR = LinearSVR()
N_SVR = SVR()
# Three hidden layers of 100 units; early_stopping holds out part of the
# training data to decide when to stop.
MLP_Regressor = MLPRegressor(hidden_layer_sizes=[100,100,100],early_stopping=True)

In [None]:
Result = Check_Model(forest_regressor,"RandomForest",WantResult=True)

In [None]:
Result = Check_Model(L_Regress,"LinearRegression",WantResult=True)

In [None]:
Result = Check_Model(Log_Regress,"LogistikRegression",WantResult=True)

In [None]:
Result = Check_Model(L_SVR,"LinearSVR",WantResult=True)

In [None]:
Result = Check_Model(N_SVR,"SVR",WantResult=True)

In [None]:
Result = Check_Model(MLP_Regressor,"MLPRegressor",WantResult=True)

In [None]:
Result.set_index("Model",inplace = True)

In [None]:
Result["CV_Mean"].plot(kind = "bar",edgecolor = "black",color = "yellow",yerr = list(Result["CV_Std"]))
plt.hlines(0,xmin = -2,xmax = 10,color = "red",linestyles="dotted")
plt.axis([-2,10,-2,10])

# Only the Random Forest and the MLP regressor seem to perform well

In [None]:
#Hyperparameter optimization for Random Forest

from hyperopt import STATUS_OK

def Objective_function(params):
    """Hyperopt objective for the random forest.

    Builds a RandomForestRegressor from ``params``, scores it with 10-fold
    cross-validation on Train/Label and returns the dict hyperopt expects,
    with ``loss = 1 - mean R2`` so that a better fit means a smaller loss.
    """
    forest = RandomForestRegressor(n_jobs = -1,**params)
    fold_scores = cross_val_score(forest,Train,Label,cv = 10,scoring=score_fn)
    return {"loss":1 - np.mean(fold_scores),'params':params,"status":STATUS_OK}

In [None]:
# Domain Space
from hyperopt import hp
space = {
    'n_estimators':hp.choice('n_estimators',range(10,1000)),
    'max_depth':hp.choice('max_depth',range(1,50)),
    # 'auto' (meaning "all features" for a regressor) was removed in
    # scikit-learn 1.3; 1.0 is the equivalent fraction and works in old and
    # new releases alike.
    'max_features':hp.choice('max_features',[1.0,'sqrt','log2']),
    'min_samples_split':hp.choice('min_samples_split',range(2,100)),
    # The dict used to list 'max_leaf_nodes' twice; only the last entry of a
    # dict literal survives, so this is the one that was in effect.
    'max_leaf_nodes':hp.choice('max_leaf_nodes',[None,2,5,10,15,20,25,50,100])
}

In [None]:
from hyperopt import Trials

bayes_trials = Trials()

In [None]:
from hyperopt import fmin,tpe

best = fmin(Objective_function,space,max_evals=50,trials=bayes_trials,algo = tpe.suggest)

In [None]:
# Let's optimize the MLP regressor

def MLPObjectiveFunction(params):
    """Hyperopt objective for the MLP regressor.

    Builds an MLPRegressor (early stopping enabled) from ``params``, scores it
    with 10-fold cross-validation on Train/Label and returns the dict hyperopt
    expects, with ``loss = 1 - mean R2``.
    """
    network = MLPRegressor(early_stopping=True,**params)
    fold_scores = cross_val_score(network,Train,Label,cv = 10,scoring=score_fn)
    return {"loss":1 - np.mean(fold_scores),'params':params,"status":STATUS_OK}

In [None]:
MLP_space = {
    'hidden_layer_sizes':hp.choice('hidden_layer_sizes',[100,[100,100],[100,100,100],[100,100,100,100]]),
    'activation':hp.choice('activation',['identity', 'logistic', 'tanh', 'relu']),
    'alpha':hp.choice('alpha',np.linspace(0.0001,1,num=1000)),
    'learning_rate':hp.choice('learning_rate',['constant','adaptive','invscaling']),
    'learning_rate_init':hp.choice('learning_rate_init',np.linspace(0.001,1,1000))
}

In [None]:
MLP_Trial = Trials()

In [None]:
MLP_best = fmin(MLPObjectiveFunction,MLP_space,max_evals=50,trials=MLP_Trial,algo = tpe.suggest)

In [None]:
from hyperopt import space_eval
params = space_eval(space,best)    # These are the best parameters for Random Forest

In [None]:
params

In [None]:
Best_model = RandomForestRegressor(**params,n_jobs = -1)

In [None]:
# Cross-validate the tuned forest (default scorer of a regressor is R2)
cvscore = cross_val_score(Best_model,Train,Label,cv = 10,n_jobs = -1)
# Report the scores just computed; `cv_score` (with underscore) still holds
# the baseline forest's results from an earlier cell.
print("10 Fold Cross Validation Mean R2_SCORE: {0} with Deviation: {1}".format(round(np.mean(cvscore),3),round(cvscore.std(),3)))

In [None]:
# Lets use Gradient boosting as see how it works we will use lightgbm implementation

import lightgbm as lgb
import numpy as np

def Objective_function_lgn(params):
    """Hyperopt objective for the LightGBM regressor.

    ``params['boosting_type']`` may be either the plain boosting name or a
    dict of the form ``{'boosting_type': name}`` (the nested form produced by
    the search space). The caller's dict is left untouched.

    Returns the dict hyperopt expects, with ``loss = 1 - mean R2`` over
    10-fold cross-validation on Train/Label and the flattened parameters.
    """
    params = dict(params)   # work on a copy instead of editing the caller's dict

    boost_type = params['boosting_type']
    if isinstance(boost_type, dict):
        boost_type = boost_type['boosting_type']
    params['boosting_type'] = boost_type

    model = lgb.LGBMRegressor(**params,n_jobs = -1)
    score = cross_val_score(model,Train,Label,cv = 10,scoring=score_fn)
    loss = 1 - np.mean(score)
    return {'loss':loss,'params':params,'status':STATUS_OK}

In [None]:
from hyperopt import hp,Trials,fmin,tpe
# Search space for the LightGBM regressor.
# Changes from the earlier version of this space:
#  * fractions are drawn with hp.uniform. hp.choice over np.random.rand(1,1000)
#    has a single option - the whole 1000-element row - so the model received
#    an array instead of a number.
#  * each LightGBM parameter is sampled once under its scikit-learn name
#    (reg_alpha = lambda_l1, reg_lambda = lambda_l2,
#     colsample_bytree = feature_fraction, subsample = bagging_fraction);
#    sampling both aliases lets one silently override the other.
#  * class_weight is dropped: it only makes sense for classification targets.
space = {
        # Nested dict kept on purpose: Objective_function_lgn unwraps
        # params['boosting_type']['boosting_type'].
        'boosting_type': hp.choice('boosting_type',
                                   [{'boosting_type': 'gbdt'},
                                    {'boosting_type': 'dart'},
                                    {'boosting_type': 'goss'}]),
        'num_leaves': hp.choice('num_leaves', np.arange(30, 150)),
        'subsample_for_bin': hp.choice('subsample_for_bin', np.arange(20000, 300000)),
        'min_child_samples': hp.choice('min_child_samples', np.arange(20, 500)),
        # Either no regularization or a log-uniform positive strength
        'reg_alpha': hp.choice('reg_alpha', [0, hp.loguniform('reg_alpha_positive', -16, 2)]),
        'reg_lambda': hp.choice('reg_lambda', [0, hp.loguniform('reg_lambda_positive', -16, 2)]),
        # Lower bound 0.1 keeps the fractions strictly positive, as LightGBM requires
        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0),
        'subsample': hp.uniform('subsample', 0.1, 1.0),
        'objective': "regression",
        'n_estimators': hp.choice('n_estimators',np.arange(10,1000)),
    }

In [None]:
lgbm_trial = Trials()

In [None]:
GBM_best = fmin(Objective_function_lgn,space,max_evals=200,trials=lgbm_trial,algo = tpe.suggest)