In [1]:
import json
import datetime
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean
import statsmodels.api as sm

In [2]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor

In [3]:
path = "ECE219_tweet_data/"

files = ["tweets_#gohawks.txt", "tweets_#gopatriots.txt", \
        "tweets_#nfl.txt", "tweets_#patriots.txt", \
        "tweets_#sb49.txt", "tweets_#superbowl.txt"]
topics = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]

for i, fl in enumerate(files):
    print("files[" + str(i) + "] => " + fl)

files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [4]:
def prase_dataset(file):
    """
    Prase x conponent of the dataset into pandas DataFrame including columns of:
    tweets, retweets, total_followers, max_followers, hour of the day
    with lines of hours,
    Prase y of dataset as number of tweets in the next hour.
    """
   
    # extract raw features
    pst_tz = pytz.timezone('America/Los_Angeles')
    data_raw = []
    for line in open(path + file, 'r') :
        row_tmp = []
        a = json.loads(line)
        time = a['citation_date']
        retweet = a['metrics']['citations']['total']
        foll = a['author']['followers']        
        row_tmp.append(time)
        row_tmp.append(retweet)
        row_tmp.append(foll)        
        data_raw.append(row_tmp)
    
    # sort according to time
    pddata_raw = pd.DataFrame(data_raw,columns=['time','retweets','followers'])
    pddata_raw = pddata_raw.sort_values(by = 'time')
    pddata_raw = pddata_raw.reset_index(drop=True)
    pddata_raw['tweets'] = 1    
#     print(pddata_raw)

    # reset time to hour index
    hour_accu = []
    hour_day = []
    for index, row in pddata_raw.iterrows():  
        p = datetime.datetime.fromtimestamp(row["time"], pst_tz)  
        hour_accu.append(((p.month-1)*31+p.day-14)*24+p.hour)
        hour_day.append(p.hour)    
    pddata_raw["time"] = hour_accu
    pddata_raw["hour of day"] = hour_day
    
    # create a new dataframe with desired form
    df = pd.DataFrame([],columns=['hour index','tweets','retweets','followers sum','followers max','hour of day'])
    df['hour index'] = range(pddata_raw.iloc[len(pddata_raw.index)-1,0]+1)
    df['tweets'] = pddata_raw.groupby("time")['tweets'].sum()
    df['retweets'] = pddata_raw.groupby("time")['retweets'].sum()
    df['followers sum'] = pddata_raw.groupby("time")['followers'].sum()
    df['followers max'] = pddata_raw.groupby("time")["followers"].max()
    df['hour of day'] = pddata_raw.groupby("time")['hour of day'].mean()
    df = df.drop([0]).fillna(0).reset_index(drop=True)

    # assign number of tweets of the next hour to be the target value
    df_y = df.iloc[1:,1].reset_index(drop=True)
    df = df[:len(df_y)]
    
    return df.iloc[:,1:],df_y

In [5]:
def train_lr(i):
    """
    This function gets data and train using a linear regression model
    """    
    df, df_y = prase_dataset(files[i])

    reg = LinearRegression().fit(df, df_y)
    pred_y = reg.predict(df)
    MSE = mean_squared_error(df_y, pred_y)
    R2 = r2_score(df_y, pred_y)
    
    print(topics[i])
    print('MSE for test data = ',MSE)
    print('R2 score for test data = ',R2)   
    
    return df, df_y, pred_y

In [6]:
def plot_recipe(df_y, pred_y):
    """
    This function plots fitted values vs true values
    """
    plt.figure()
    area = np.pi * (4)**2/4
    plt.scatter(df_y, pred_y, s = area)
    plt.plot([df_y.min(), df_y.max()], [df_y.min(), df_y.max()], 'k--', lw = 1)
    plt.xlabel('true values')
    plt.ylabel('fitted values')
    plt.show()

In [7]:

def ols_regression(df, df_y):
    X2 = sm.add_constant(df)
    y = df_y.as_matrix()
    lm = sm.OLS(y, X2).fit()
    print(lm.summary())
    print(list(df))

In [8]:
df, df_y = prase_dataset(files[0])
for i in range(1,6):
    df_temp,df_temp_y=prase_dataset(files[i])
    df=df.append(df_temp,ignore_index=True)
    df_y=df_y.append(df_temp_y,ignore_index=True)
    print(df_temp.shape,df.shape)

(574, 5) (1151, 5)
(585, 5) (1736, 5)
(585, 5) (2321, 5)
(585, 5) (2906, 5)
(585, 5) (3491, 5)


In [9]:
param_grid={
'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [200, 400, 600, 800, 1000,
1200, 1400, 1600, 1800, 2000]
}
kf = KFold(n_splits=5,random_state=42,shuffle=True)

In [10]:
#GradientBoostingRegressor

In [12]:


avg_RMSE1_train=np.zeros((8,2,3,3,10))
avg_RMSE1_test=np.zeros((8,2,3,3,10))
for n1,max_depth in enumerate(param_grid['max_depth']):
    for n2,max_features in enumerate(param_grid['max_features']):
        for n3,min_samples_leaf in enumerate(param_grid['min_samples_leaf']):
            for n4,min_samples_split in enumerate(param_grid['min_samples_split']):
                for n5,n_estimators in enumerate(param_grid['n_estimators']):
                    MSE_train=[]
                    MSE_test=[]
                    oob_error=[]
                    total_train=0
                    total_test=0
                    for train_index, test_index in kf.split(df):
                        X_train= df.iloc[train_index]
                        y_train= df_y.iloc[train_index]
                        X_test= df.iloc[test_index]
                        y_test= df_y.iloc[test_index]
                        reg = GradientBoostingRegressor(n_estimators=n_estimators,max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split, max_features=max_features,random_state=42)
                        reg.fit(X_train,y_train)
                        pred_train = reg.predict(X_train)
                        pred_test = reg.predict(X_test)
                        MSE_train.append(mean_squared_error(y_train, pred_train)*len(train_index))
                        MSE_test.append(mean_squared_error(y_test, pred_test)*len(test_index))
                        total_train=total_train+len(train_index)
                        total_test=total_test+len(test_index)
                        
                    avg_RMSE1_test[n1,n2,n3,n4,n5]=np.sqrt(sum(MSE_test)/total_test)
                    avg_RMSE1_train[n1,n2,n3,n4,n5]=np.sqrt(sum(MSE_train)/total_train)
                    print("max_depth=",max_depth,"max_features=",max_features,"min_samples_leaf=",min_samples_leaf,"min_samples_split=",min_samples_split,"n_estimators=",n_estimators)
                    print("RMSE_train=",avg_RMSE1_train[n1,n2,n3,n4,n5],"RMSE_test=",avg_RMSE1_test[n1,n2,n3,n4,n5])
                    print("")

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 5.434509605085224 RMSE_test= 5739.7674133289065

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.8689189533183477 RMSE_test= 5739.774353066769

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3910610136301954 RMSE_test= 5739.774653218022

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.36128737433237595 RMSE_test= 5739.774186226752

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3598360659122803 RMSE_test= 5739.774210820761

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.3597606337342153 RMSE_test= 5739.774209187793

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_est

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.8507576038794517 RMSE_test= 5687.027741523853

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.49197233538891016 RMSE_test= 5687.034623607951

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.39292556055777744 RMSE_test= 5687.037024032263

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.36768931673110417 RMSE_test= 5687.0384654979935

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3617684835216491 RMSE_test= 5687.039164835266

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.36029906421404334 RMSE_test= 5687.039483750899

max_depth= 10 max_features= auto min_samples_leaf= 4 min_samples_

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35975558821524756 RMSE_test= 5039.642285571695

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.3597555549282824 RMSE_test= 5039.642285983389

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 29.391564007792233 RMSE_test= 4924.359180294276

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 5.293820984819998 RMSE_test= 4924.43035157491

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 1.5490233157941042 RMSE_test= 4924.436089906007

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.6141128615598503 RMSE_test= 4924.438223455339

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 64.90989304310018 RMSE_test= 6248.969466750587

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 29.120957674075186 RMSE_test= 6262.580436890782

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 15.061068778867071 RMSE_test= 6266.168396054188

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 8.206393578474609 RMSE_test= 6267.393626925018

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 4.693983031286479 RMSE_test= 6268.136923047943

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 2.8354967757822025 RMSE_test= 6268.563901497239

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estim

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.35975548469399804 RMSE_test= 5777.357900945414

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.35975547320382767 RMSE_test= 5777.357901263832

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.3597554661606487 RMSE_test= 5777.357901795318

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.35975546141159137 RMSE_test= 5777.357902141608

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 14.503422249469986 RMSE_test= 5853.213339753343

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 0.46636525967437015 RMSE_test= 5857.007475795074

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 5 

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.35975555591876396 RMSE_test= 4604.345626299735

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.35975550672431006 RMSE_test= 4604.3456306044345

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3597554849527186 RMSE_test= 4604.345631648178

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.35975547285699844 RMSE_test= 4604.345632064071

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.35975546526374913 RMSE_test= 4604.34563227438

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.3597554607070574 RMSE_test= 4604.3456323597975

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.3598771693386437 RMSE_test= 5640.64085567377

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.3597609137127181 RMSE_test= 5640.641260620274

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.35975590002789604 RMSE_test= 5640.641277037688

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.35975556786992186 RMSE_test= 5640.641278173104

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3597555237212021 RMSE_test= 5640.641278645873

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.3597555016737429 RMSE_test= 5640.64127899746

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_split

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.3597554516501459 RMSE_test= 5761.985290549786

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.359755451535196 RMSE_test= 5761.985290562651

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 0.3905034089251966 RMSE_test= 5320.187714765032

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.35975778395280855 RMSE_test= 5320.1899117035455

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.35975553420003126 RMSE_test= 5320.1899221871545

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.3597554829591421 RMSE_test= 5320.189921980833

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 1

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 27.55776788180906 RMSE_test= 6605.14276444652

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 7.541138165271074 RMSE_test= 6610.109900871217

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 2.1510885559167434 RMSE_test= 6610.717630761395

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.7913374671382695 RMSE_test= 6610.827289464108

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.5008857736030798 RMSE_test= 6610.897103904083

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.4214552091143093 RMSE_test= 6610.945854562889

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estim

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.3597554574612758 RMSE_test= 5362.207356344364

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.3597554549368455 RMSE_test= 5362.207356771584

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.35975545348739457 RMSE_test= 5362.20735693059

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.3597554526411462 RMSE_test= 5362.207357037537

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 37.97724032959834 RMSE_test= 5376.527404573199

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 1.5355938492939436 RMSE_test= 5403.172180835548

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_est

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.35975550647089066 RMSE_test= 5727.472330149421

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.35975546408852654 RMSE_test= 5727.472334352784

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3597554554925943 RMSE_test= 5727.472334810708

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3597554529653409 RMSE_test= 5727.472335017894

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3597554521061986 RMSE_test= 5727.472335200223

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.35975545176443097 RMSE_test= 5727.472335248744

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_e

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.35975547610209263 RMSE_test= 5743.095240558188

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.35975546020254395 RMSE_test= 5743.095238572329

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.3597554551756412 RMSE_test= 5743.095238536943

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.35975545319518176 RMSE_test= 5743.0952385144

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3597554522836765 RMSE_test= 5743.095238536717

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.359755451854641 RMSE_test= 5743.095238576071

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35975545158255245 RMSE_test= 4875.624764026277

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.35975545149933297 RMSE_test= 4875.62476402098

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 0.4539483812253052 RMSE_test= 4968.759399831014

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.3597555823282853 RMSE_test= 4968.792810406064

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.35975548642265776 RMSE_test= 4968.7928116993935

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.35975546499012895 RMSE_test= 4968.792811869385

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 45.488892633993984 RMSE_test= 6152.8308798778435

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 16.757857167521657 RMSE_test= 6166.639464906848

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 6.654737093907334 RMSE_test= 6170.219501301611

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 3.1294997967928864 RMSE_test= 6171.044493042333

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 1.5511294716450337 RMSE_test= 6171.670336900135

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.8961469702208372 RMSE_test= 6172.038993121624

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_es

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.35975545165701034 RMSE_test= 5769.630042499043

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.3597554515166057 RMSE_test= 5769.630042530265

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.3597554514570259 RMSE_test= 5769.6300425362315

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.3597554514283809 RMSE_test= 5769.630042532599

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 13.946292513347144 RMSE_test= 5805.917370886311

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 0.4276298026022887 RMSE_test= 5810.493159178107

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.35975551938881667 RMSE_test= 5100.104778174412

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.35975546550142795 RMSE_test= 5100.104782609651

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.359755456474232 RMSE_test= 5100.104782985246

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3597554534612027 RMSE_test= 5100.104783080847

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3597554523351125 RMSE_test= 5100.104783206611

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.3597554518225896 RMSE_test= 5100.1047832550175

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_es

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.359761054326114 RMSE_test= 5754.851639833789

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.3597554934903986 RMSE_test= 5754.851951511294

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.3597554543275205 RMSE_test= 5754.851972167493

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.35975545280721255 RMSE_test= 5754.851973143433

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3597554521353487 RMSE_test= 5754.8519733248095

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.35975545178250945 RMSE_test= 5754.851973385118

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_spl

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.3597554514158323 RMSE_test= 5762.940273040596

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.3597554514080869 RMSE_test= 5762.940273041902

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 0.36111520779536305 RMSE_test= 5321.813989497353

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.35975550143901036 RMSE_test= 5321.815559853729

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.3597554611919984 RMSE_test= 5321.815559776204

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.35975545456830094 RMSE_test= 5321.815559713456

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_s

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 400
RMSE_train= 94.1552062971764 RMSE_test= 6574.0730295940275

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 25.168223902027368 RMSE_test= 6597.957832293069

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 7.255162906633742 RMSE_test= 6602.681351984229

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 2.1282518070063916 RMSE_test= 6603.378165511955

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.7992801352223351 RMSE_test= 6603.495810829193

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.515035340977998 RMSE_test= 6603.566881960387

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.35975545397394426 RMSE_test= 5437.701754992974

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.3597554525414903 RMSE_test= 5437.701755415499

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.35975545196771275 RMSE_test= 5437.701755527459

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.35975545171517603 RMSE_test= 5437.701755551625

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.3597554515699288 RMSE_test= 5437.701755582879

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.359755451497218 RMSE_test= 5437.701755606722

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_sp

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 2.142361872503413 RMSE_test= 6154.194516332041

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1800
RMSE_train= 1.1589712947223079 RMSE_test= 6154.426245027725

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.6882226114323649 RMSE_test= 6154.557217532217

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.3597554975982607 RMSE_test= 5727.472095195647

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.3597554558893746 RMSE_test= 5727.472099988764

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.35975545273706205 RMSE_test= 5727.472100449868

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_spli

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 30.88918441479124 RMSE_test= 5706.236243375119

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.7390230942656986 RMSE_test= 5722.10154713371

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.36010985217335945 RMSE_test= 5722.419559719877

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.3597557996503513 RMSE_test= 5722.425279633919

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.35975545271754217 RMSE_test= 5722.425405521511

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.3597554516596509 RMSE_test= 5722.425402399949

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_sp

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 800
RMSE_train= 0.35975545250465607 RMSE_test= 4875.806176902593

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1000
RMSE_train= 0.3597554519173992 RMSE_test= 4875.806176903881

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.3597554516449696 RMSE_test= 4875.806176923413

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.359755451521651 RMSE_test= 4875.806176928607

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.3597554514607481 RMSE_test= 4875.806176937231

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35975545143124354 RMSE_test= 4875.806176917927

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_spli

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1400
RMSE_train= 1.425985770618491 RMSE_test= 6135.870006410812

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.8617222044032511 RMSE_test= 6136.226805835202

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.6144119737763384 RMSE_test= 6136.369766753023

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.4984793673797844 RMSE_test= 6136.479244915765

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 200
RMSE_train= 484.24441350912235 RMSE_test= 5888.589795364822

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 400
RMSE_train= 135.5956392687939 RMSE_test= 6073.4826461522

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.35975545140369863 RMSE_test= 5321.629444113596

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 200
RMSE_train= 7.773683984795851 RMSE_test= 5774.549084815504

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.36424385322646347 RMSE_test= 5776.071163444232

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.35975580471074864 RMSE_test= 5776.0752023765335

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3597554523476387 RMSE_test= 5776.075235289043

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3597554517140038 RMSE_test= 5776.075235577959

max_depth= None max_features= auto min_samples_leaf= 2 min_sampl

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 600
RMSE_train= 64.7117501309452 RMSE_test= 6400.4518850356435

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 800
RMSE_train= 22.069257122096108 RMSE_test= 6410.75720555098

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1000
RMSE_train= 8.934278671674667 RMSE_test= 6413.777829656321

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1200
RMSE_train= 3.816483655083502 RMSE_test= 6414.09533861013

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1400
RMSE_train= 1.749106116204355 RMSE_test= 6414.265926850968

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.848849854742872 RMSE_test= 6414.366223996677

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.3597554522641322 RMSE_test= 5324.716274435602

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.35975545182842117 RMSE_test= 5324.716276252057

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.3597554516421 RMSE_test= 5324.716276261805

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35975545153665817 RMSE_test= 5324.716276316646

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.35975545147962573 RMSE_test= 5324.716276347868

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 81.11970266161171 RMSE_test= 5701.86136158785

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples

In [13]:
print("min RMSE in testset=",np.min(avg_RMSE1_test))
print("parameters:")
n1,n2,n3,n4,n5= np.unravel_index(np.argmin(avg_RMSE1_test), avg_RMSE1_test.shape)
print("max_depth=",param_grid["max_depth"][n1],"max_features=",param_grid["max_features"][n2],"min_samples_leaf=",param_grid["min_samples_leaf"][n3],"min_samples_split=",param_grid["min_samples_split"][n4],"n_estimators=",param_grid["n_estimators"][n5])
print("")

min RMSE in testset= 4604.345626299735
parameters:
max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200



In [15]:
reg = GradientBoostingRegressor(n_estimators=param_grid["n_estimators"][n5],max_depth=param_grid["max_depth"][n1],min_samples_leaf=param_grid["min_samples_leaf"][n3],min_samples_split=param_grid["min_samples_split"][n4], max_features=param_grid["max_features"][n2],random_state=42)
reg.fit(df,df_y)
pred = reg.predict(df)
print("MSE=",mean_squared_error(df_y, pred))

MSE= 0.13004878095281003
