In [1]:
import json
import datetime
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
# import datetime,time

In [2]:
# Directory that holds the raw tweet dumps (one JSON document per line).
path = "ECE219_tweet_data/"

# Hashtags covered by the training data; each one has its own dump file.
topics = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]
files = ["tweets_#{}.txt".format(topic) for topic in topics]

# Test windows: three samples, each split into three periods.
test_files = [
    "sample{}_period{}.txt".format(sample, period)
    for sample in range(3)
    for period in range(1, 4)
]

# Show which list position maps to which training file.
for i, fl in enumerate(files):
    print("files[{}] => {}".format(i, fl))

files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [3]:
def parse_dataset(file):
    """
    Parse one tweet dump into hourly features and next-hour targets.

    Every line of ``path + file`` is decoded as a JSON tweet record. Tweets
    are bucketed into one-hour windows (America/Los_Angeles time) and each
    window becomes one row.

    Parameters
    ----------
    file : str
        File name, relative to the module-level ``path``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The frame has one row per hour with the columns ``tweets`` (count),
        ``retweets`` (sum), ``followers sum``, ``followers max`` and
        ``hour of day``. The series holds the number of tweets in the hour
        that follows each row. Hours without tweets are filled with 0 for the
        count/sum/max columns.
    """
    pst_tz = pytz.timezone('America/Los_Angeles')

    # extract raw features; the context manager guarantees the file is closed
    data_raw = []
    with open(path + file, 'r', encoding="utf-8") as tweet_file:
        for line in tweet_file:
            a = json.loads(line)
            data_raw.append([a['citation_date'],
                             a['metrics']['citations']['total'],
                             a['author']['followers']])

    # sort according to time
    pddata_raw = pd.DataFrame(data_raw, columns=['time', 'retweets', 'followers'])
    pddata_raw = pddata_raw.sort_values(by='time').reset_index(drop=True)
    pddata_raw['tweets'] = 1

    # reset time to hour index: whole hours counted from day 14 of month 1,
    # treating every earlier month as 31 days long
    hour_accu = []
    for stamp in pddata_raw['time']:
        p = datetime.datetime.fromtimestamp(stamp, pst_tz)
        hour_accu.append(((p.month-1)*31+p.day-14)*24+p.hour)
    pddata_raw["time"] = hour_accu

    # create a new dataframe with one row per hour index, including hours
    # that received no tweets (the assignments below align on the index)
    last_hour = pddata_raw['time'].iloc[-1]
    hourly = pddata_raw.groupby("time")
    df = pd.DataFrame({'hour index': range(last_hour + 1)})
    df['tweets'] = hourly['tweets'].sum()
    df['retweets'] = hourly['retweets'].sum()
    df['followers sum'] = hourly['followers'].sum()
    df['followers max'] = hourly['followers'].max()
    # The hour index is a multiple of 24 plus the clock hour, so the clock
    # hour can be recovered for every row. Deriving it from the tweets would
    # leave empty hours as NaN, which fillna(0) would then turn into hour 0.
    df['hour of day'] = (df['hour index'] % 24).astype(float)
    df = df.drop([0]).fillna(0).reset_index(drop=True)

    # assign number of tweets of the next hour to be the target value
    df_y = df.iloc[1:, 1].reset_index(drop=True)
    df = df[:len(df_y)]

    # drop the 'hour index' helper column from the features
    return df.iloc[:, 1:], df_y

In [4]:
def parse_dataset_ten_features(file):
    """
    Parse one tweet dump into ten hourly features and next-hour targets.

    Every line of ``path + file`` is decoded as a JSON tweet record. Tweets
    are bucketed into one-hour windows (America/Los_Angeles time) and each
    window becomes one row with the columns:

        tweets:           number of tweets in the hour
        retweets:         sum of the per-tweet citation totals
        followers sum:    sum of the authors' follower counts
        followers max:    largest follower count among the authors
        mentioned:        total number of user mentions
        media:            total number of media entries
        active:           mean of statuses_count / (2015 - account year + 1)
        author:           number of distinct author names
        favourites_count: sum of the users' favourites counts
        title:            mean length of the tweet titles

    Parameters
    ----------
    file : str
        File name, relative to the module-level ``path``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame described above and, for each row, the number of
        tweets in the following hour. Hours without tweets are filled with 0.
    """
    pst_tz = pytz.timezone('America/Los_Angeles')

    # extract raw features; the context manager guarantees the file is closed
    data_raw = []
    with open(path + file, 'r', encoding="utf-8") as tweet_file:
        for line in tweet_file:
            a = json.loads(line)
            tweet = a['tweet']
            user = tweet['user']
            # tweets without attached media have no 'extended_entities' key
            medi = len(tweet['extended_entities']['media']) if 'extended_entities' in tweet else 0
            # the account creation year is the last four characters of 'created_at'
            hist_yr = user['created_at'][-4:]
            acti = user["statuses_count"]/(2015-float(hist_yr)+1)

            data_raw.append([
                a['citation_date'],
                a['metrics']['citations']['total'],
                a['author']['followers'],
                len(tweet['entities']['user_mentions']),
                medi,
                acti,
                a['author']['name'],
                user['favourites_count'],
                len(a['title']),
            ])

    # sort according to time
    pddata_raw = pd.DataFrame(data_raw, columns=['time', 'retweets', 'followers', 'mentioned', 'media',
                                                 'active', 'author', 'favourites_count', 'title'])
    pddata_raw = pddata_raw.sort_values(by='time').reset_index(drop=True)
    pddata_raw['tweets'] = 1

    # reset time to hour index: whole hours counted from day 14 of month 1,
    # treating every earlier month as 31 days long
    hour_accu = []
    for stamp in pddata_raw['time']:
        p = datetime.datetime.fromtimestamp(stamp, pst_tz)
        hour_accu.append(((p.month-1)*31+p.day-14)*24+p.hour)
    pddata_raw["time"] = hour_accu

    # create a new dataframe with one row per hour index, including hours
    # that received no tweets (the assignments below align on the index)
    last_hour = pddata_raw['time'].iloc[-1]
    hourly = pddata_raw.groupby("time")
    df = pd.DataFrame({'hour index': range(last_hour + 1)})
    df['tweets'] = hourly['tweets'].sum()
    df['retweets'] = hourly['retweets'].sum()
    df['followers sum'] = hourly['followers'].sum()
    df['followers max'] = hourly['followers'].max()
    df['mentioned'] = hourly['mentioned'].sum()
    df['media'] = hourly['media'].sum()
    df['active'] = hourly['active'].mean()
    df['author'] = hourly['author'].nunique()  # count number of not-repeating authors
    df['favourites_count'] = hourly['favourites_count'].sum()
    df['title'] = hourly['title'].mean()

    # drop hour 0, zero-fill the empty hours and renumber the rows
    df = df.drop([0]).fillna(0).reset_index(drop=True)

    # assign number of tweets of the next hour to be the target value
    df_y = df.iloc[1:, 1].reset_index(drop=True)
    df = df[:len(df_y)]

    # drop the 'hour index' helper column from the features
    return df.iloc[:, 1:], df_y

In [5]:
def plot_recipe(df_y, pred_y):
    """
    Draw a scatter plot of fitted values against true values.

    A dashed black diagonal spanning the range of ``df_y`` marks where
    perfect predictions would lie.
    """
    marker_area = np.pi * (4)**2/4
    lowest, highest = df_y.min(), df_y.max()

    fig, ax = plt.subplots()
    ax.scatter(df_y, pred_y, s=marker_area)
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=1)
    ax.set_xlabel('true values')
    ax.set_ylabel('fitted values')
    plt.show()

In [6]:

def ols_regression(df, df_y):
    """
    Fit an ordinary-least-squares model with an intercept and print it.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; a constant column is added before fitting.
    df_y : pandas.Series or array-like
        Target values, one per row of ``df``.

    Prints the statsmodels summary followed by the list of feature names.
    """
    X2 = sm.add_constant(df)
    # Series.as_matrix() was removed in pandas 1.0; np.asarray yields the
    # same ndarray on every pandas version.
    y = np.asarray(df_y)
    lm = sm.OLS(y, X2).fit()
    print(lm.summary())
    print(list(df))

In [7]:
# aggregate all data: parse every hashtag file and stack the hourly rows
df, df_y = parse_dataset_ten_features(files[0])
for i in range(1, len(files)):
    df_temp, df_temp_y = parse_dataset_ten_features(files[i])
    # DataFrame.append / Series.append were removed in pandas 2.0;
    # pd.concat produces the same stacked result.
    df = pd.concat([df, df_temp], ignore_index=True)
    df_y = pd.concat([df_y, df_temp_y], ignore_index=True)
    print(df_temp.shape, df.shape)

(574, 10) (1151, 10)
(585, 10) (1736, 10)
(585, 10) (2321, 10)
(585, 10) (2906, 10)
(585, 10) (3491, 10)


In [8]:
# Hyper-parameter grid explored by the cross-validated search below.
param_grid = {
    'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
    # None means "use every feature at each split". For scikit-learn's
    # regressors this is what the string 'auto' selected before it was
    # deprecated (1.1) and removed (1.3), so None works on old and new
    # versions alike.
    'max_features': [None, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000,
                     1200, 1400, 1600, 1800, 2000],
}
# Shuffled 5-fold split with a fixed seed so every model sees the same folds.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# GradientBoostingRegressor

In [9]:

# Exhaustive 5-fold cross-validated grid search for GradientBoostingRegressor.
# The result arrays are indexed
# [max_depth, max_features, min_samples_leaf, min_samples_split, n_estimators]
# by position in param_grid; their shape is derived from the grid so the two
# cannot drift apart.
grid_shape = tuple(len(param_grid[name]) for name in
                   ('max_depth', 'max_features', 'min_samples_leaf',
                    'min_samples_split', 'n_estimators'))
avg_RMSE1_train=np.zeros(grid_shape)
avg_RMSE1_test=np.zeros(grid_shape)

# kf is created with a fixed integer random_state, so split() returns the same
# folds on every call; compute them once instead of once per combination.
cv_folds = list(kf.split(df))

for n1,max_depth in enumerate(param_grid['max_depth']):
    for n2,max_features in enumerate(param_grid['max_features']):
        for n3,min_samples_leaf in enumerate(param_grid['min_samples_leaf']):
            for n4,min_samples_split in enumerate(param_grid['min_samples_split']):
                for n5,n_estimators in enumerate(param_grid['n_estimators']):
                    # per-fold sums of squared errors (MSE * fold size)
                    MSE_train=[]
                    MSE_test=[]
                    total_train=0
                    total_test=0
                    for train_index, test_index in cv_folds:
                        X_train= df.iloc[train_index]
                        y_train= df_y.iloc[train_index]
                        X_test= df.iloc[test_index]
                        y_test= df_y.iloc[test_index]
                        reg = GradientBoostingRegressor(
                            n_estimators=n_estimators,
                            max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf,
                            min_samples_split=min_samples_split,
                            max_features=max_features,
                            random_state=42)
                        reg.fit(X_train,y_train)
                        pred_train = reg.predict(X_train)
                        pred_test = reg.predict(X_test)
                        MSE_train.append(mean_squared_error(y_train, pred_train)*len(train_index))
                        MSE_test.append(mean_squared_error(y_test, pred_test)*len(test_index))
                        total_train=total_train+len(train_index)
                        total_test=total_test+len(test_index)

                    # pooled RMSE: total squared error over all folds / total sample count
                    avg_RMSE1_test[n1,n2,n3,n4,n5]=np.sqrt(sum(MSE_test)/total_test)
                    avg_RMSE1_train[n1,n2,n3,n4,n5]=np.sqrt(sum(MSE_train)/total_train)
                    print("max_depth=",max_depth,"max_features=",max_features,"min_samples_leaf=",min_samples_leaf,"min_samples_split=",min_samples_split,"n_estimators=",n_estimators)
                    print("RMSE_train=",avg_RMSE1_train[n1,n2,n3,n4,n5],"RMSE_test=",avg_RMSE1_test[n1,n2,n3,n4,n5])
                    print("")

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 5.141715445523478 RMSE_test= 5396.208295466027

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.6528196706645077 RMSE_test= 5396.232648341095

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3669581095824265 RMSE_test= 5396.234736516447

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3591412450614904 RMSE_test= 5396.235281865453

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3589631520633751 RMSE_test= 5396.235409844648

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.35895857474646353 RMSE_test= 5396.23542276963

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estim

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.38201372518958376 RMSE_test= 5701.144432398737

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.3611069924793405 RMSE_test= 5701.146373669467

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.3591592857734845 RMSE_test= 5701.146943909656

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.3589777199113105 RMSE_test= 5701.147164669193

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3589600310276557 RMSE_test= 5701.147217816784

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.3589585640010906 RMSE_test= 5701.147239859548

max_depth= 10 max_features= auto min_samples_leaf= 4 min_samples_spli

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35895837931082897 RMSE_test= 4992.24139746165

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.3589583673904833 RMSE_test= 4992.241397482902

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 24.829818083992304 RMSE_test= 4766.506900508761

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 3.1064986960760788 RMSE_test= 4766.777183836446

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.6378267505031401 RMSE_test= 4766.773539212992

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.37445587859008816 RMSE_test= 4766.772489416971

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 12.92218757871205 RMSE_test= 5616.831281929089

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 4.359475128160015 RMSE_test= 5618.553336035521

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 1.480271068198181 RMSE_test= 5618.871272497125

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.6112936134326629 RMSE_test= 5618.947086559148

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.39673980858772717 RMSE_test= 5618.958784184185

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.36308578506561023 RMSE_test= 5618.961363184896

max_depth= 10 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_est

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.3589583382738043 RMSE_test= 5959.5171617689275

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.3589583347554164 RMSE_test= 5959.517162531906

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.3589583328688571 RMSE_test= 5959.5171628049575

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.3589583318221252 RMSE_test= 5959.51716296954

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 1.858842067163892 RMSE_test= 5958.930083964266

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 0.3596818223368732 RMSE_test= 5959.247410925592

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_es

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.35895842554651197 RMSE_test= 5185.641589188667

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.3589583719434108 RMSE_test= 5185.641593246024

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3589583509499759 RMSE_test= 5185.641593315943

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3589583413104773 RMSE_test= 5185.641593406586

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.35895833623090867 RMSE_test= 5185.641593607185

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.35895833364284874 RMSE_test= 5185.641593674574

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_e

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.35895840115248184 RMSE_test= 5298.563202451252

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.3589583684666465 RMSE_test= 5298.563203334029

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.3589583520168476 RMSE_test= 5298.5632036361085

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.3589583428637365 RMSE_test= 5298.563203953681

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.35895833765090185 RMSE_test= 5298.563204290168

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.35895833477623035 RMSE_test= 5298.563204669913

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_s

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35895833054416515 RMSE_test= 5374.510078955571

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.3589583305414899 RMSE_test= 5374.510078955715

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 0.36348448033157726 RMSE_test= 4949.941962278152

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.3589584060666911 RMSE_test= 4949.939333335173

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.3589583479985833 RMSE_test= 4949.939333361674

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.35895833520413 RMSE_test= 4949.939332941679

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 0.7677896341343324 RMSE_test= 6181.319640891996

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 0.365131336444848 RMSE_test= 6181.68756633108

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 0.3590093265887025 RMSE_test= 6181.700700367599

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.35895883559432373 RMSE_test= 6181.702172665328

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.3589583521151717 RMSE_test= 6181.702287289464

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.35895833675651334 RMSE_test= 6181.702295521924

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_es

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.3589583310062414 RMSE_test= 5414.950148992554

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.3589583307438693 RMSE_test= 5414.950149035953

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.3589583306305983 RMSE_test= 5414.950149065373

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.35895833058130955 RMSE_test= 5414.950149091179

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 2.4197068079971875 RMSE_test= 5229.392481103664

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 0.3590200839593329 RMSE_test= 5230.105741012578

max_depth= 40 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_e

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.3589583733745789 RMSE_test= 5485.719695593375

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.35895833483791073 RMSE_test= 5485.719700641221

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3589583311736223 RMSE_test= 5485.7197014444355

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.35895833067176985 RMSE_test= 5485.719701723137

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.358958330571141 RMSE_test= 5485.719701748394

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.3589583305482931 RMSE_test= 5485.719701782649

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_es

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.3589583314878712 RMSE_test= 5748.082720172368

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.35895833077476835 RMSE_test= 5748.082720256425

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.35895833061362603 RMSE_test= 5748.082720283104

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.35895833056952764 RMSE_test= 5748.082720294673

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3589583305526066 RMSE_test= 5748.082720286658

max_depth= 60 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.35895833054588844 RMSE_test= 5748.082720281076

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_s

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.3589583305435857 RMSE_test= 4991.363500027194

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.3589583305410568 RMSE_test= 4991.363500032888

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 0.6611871724889417 RMSE_test= 4681.4247332963405

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.35895837203044895 RMSE_test= 4681.492344740432

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.35895833764862234 RMSE_test= 4681.492344198563

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.35895833215538436 RMSE_test= 4681.492344390273

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 5.12942466813199 RMSE_test= 5627.95011559654

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 1.121642626520362 RMSE_test= 5630.918868519852

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 0.4279980484592641 RMSE_test= 5631.513448746582

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.3620864879181088 RMSE_test= 5631.6668986967115

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.35913417105299644 RMSE_test= 5631.690168132544

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.35897271748614 RMSE_test= 5631.694249767926

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estima

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.35895833054977455 RMSE_test= 5954.631892054435

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.3589583305431299 RMSE_test= 5954.631892054731

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.3589583305408615 RMSE_test= 5954.631892051154

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.35895833054003484 RMSE_test= 5954.631892053009

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 1.1652989642483427 RMSE_test= 5897.219473668475

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 0.3589589795184325 RMSE_test= 5897.580006834361

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.3589583810302091 RMSE_test= 4632.326415816641

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.3589583363773194 RMSE_test= 4632.326418819545

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.35895833134841776 RMSE_test= 4632.326419141145

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3589583306833873 RMSE_test= 4632.326419047175

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3589583305731815 RMSE_test= 4632.326419087394

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.3589583305509119 RMSE_test= 4632.326419097749

max_depth= 80 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_est

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.3589583314039828 RMSE_test= 5377.201428146703

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.3589583307773335 RMSE_test= 5377.201428605201

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.35895833061529825 RMSE_test= 5377.201428784623

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.35895833056642473 RMSE_test= 5377.201428840658

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3589583305509625 RMSE_test= 5377.201428871226

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.3589583305439359 RMSE_test= 5377.201428879671

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_spl

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.3589583305395889 RMSE_test= 5374.95346563616

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.3589583305395459 RMSE_test= 5374.953465636133

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 0.3606632860761821 RMSE_test= 4937.453746075501

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.35895833973863667 RMSE_test= 4937.449563606668

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.35895833131955207 RMSE_test= 4937.4495633806255

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.3589583306505277 RMSE_test= 4937.449563352306

max_depth= 100 max_features= auto min_samples_leaf= 1 min_samples_sp

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 400
RMSE_train= 8.72657591800741 RMSE_test= 6154.089464516477

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 0.791572891267089 RMSE_test= 6161.596285460446

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 0.3648996376188061 RMSE_test= 6161.985710331183

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 0.3590150493578686 RMSE_test= 6162.001454262884

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.35895901536219044 RMSE_test= 6162.002787203355

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.35895834806379706 RMSE_test= 6162.002912957195

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 5

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.3589583307152215 RMSE_test= 5361.99802962767

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1200
RMSE_train= 0.35895833060218213 RMSE_test= 5361.998029761314

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.3589583305632691 RMSE_test= 5361.998029790932

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.3589583305503002 RMSE_test= 5361.998029789406

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.35895833054417997 RMSE_test= 5361.998029785726

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.3589583305416984 RMSE_test= 5361.998029785484

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_spl

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.3590220272289799 RMSE_test= 5816.085350997881

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1800
RMSE_train= 0.3589617627624718 RMSE_test= 5816.088882048711

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.35895851716595784 RMSE_test= 5816.0896966351365

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.3589583635157279 RMSE_test= 5485.719710557522

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.35895833167138497 RMSE_test= 5485.719714374307

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3589583306068866 RMSE_test= 5485.719714606453

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_s

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 16.426394472132834 RMSE_test= 5747.645620603264

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 400
RMSE_train= 0.37133777400211476 RMSE_test= 5754.942076003136

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 600
RMSE_train= 0.3589594248752433 RMSE_test= 5755.007609297184

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.3589583317995601 RMSE_test= 5755.008368958478

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.3589583308751058 RMSE_test= 5755.008376750721

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.35895833066547006 RMSE_test= 5755.008376758293

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 800
RMSE_train= 0.3589583307552271 RMSE_test= 4991.406925519868

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1000
RMSE_train= 0.35895833060339416 RMSE_test= 4991.406925573945

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.3589583305607992 RMSE_test= 4991.406925607283

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.3589583305471238 RMSE_test= 4991.406925618687

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.35895833054221793 RMSE_test= 4991.406925619801

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.3589583305405135 RMSE_test= 4991.406925621103

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_spl

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1400
RMSE_train= 0.3590428565169489 RMSE_test= 5645.396458010625

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1600
RMSE_train= 0.35896235613180655 RMSE_test= 5645.401153130438

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1800
RMSE_train= 0.35895849048016526 RMSE_test= 5645.402049544676

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 2000
RMSE_train= 0.35895833785087 RMSE_test= 5645.402186445017

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 200
RMSE_train= 186.92868551873397 RMSE_test= 5532.672335833384

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 400
RMSE_train= 25.245097947954022 RMSE_test= 5629.934672208544

max_depth= 200 max_features= sqrt min_samples_leaf= 4 min_samples_split=

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 2000
RMSE_train= 0.35895833053954396 RMSE_test= 4937.628726842825

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 200
RMSE_train= 0.838902983647403 RMSE_test= 5953.826097834474

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 400
RMSE_train= 0.3589589592849917 RMSE_test= 5954.159314527775

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 600
RMSE_train= 0.3589583314017385 RMSE_test= 5954.1596800066645

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 800
RMSE_train= 0.3589583307114075 RMSE_test= 5954.159680862087

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1000
RMSE_train= 0.35895833059688975 RMSE_test= 5954.159680939691

max_depth= None max_features= auto min_samples_leaf= 2 min_sample

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 600
RMSE_train= 3.69417522791253 RMSE_test= 5894.163908269233

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 800
RMSE_train= 0.6496053799655905 RMSE_test= 5896.279753596769

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1000
RMSE_train= 0.3685863617703027 RMSE_test= 5896.5610394776395

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1200
RMSE_train= 0.35927392376005185 RMSE_test= 5896.610058809183

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1400
RMSE_train= 0.358969260409192 RMSE_test= 5896.619156377566

max_depth= None max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 0.3589587333476491 RMSE_test= 5896.620863211788

max_depth= None max_features= auto min_samples_leaf= 4 min_sa

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1200
RMSE_train= 0.3589583305819096 RMSE_test= 5205.477035818478

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1400
RMSE_train= 0.35895833055501336 RMSE_test= 5205.477035877351

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1600
RMSE_train= 0.35895833054531845 RMSE_test= 5205.477035886759

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1800
RMSE_train= 0.35895833054173754 RMSE_test= 5205.4770358904325

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 2000
RMSE_train= 0.358958330540485 RMSE_test= 5205.477035888883

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 23.898201104093157 RMSE_test= 5370.843193358953

max_depth= None max_features= sqrt min_samples_leaf= 2 min_sa

In [10]:
# Summarise the grid search: report the smallest averaged test RMSE and the
# hyper-parameter combination it was obtained with.
print("min RMSE in testset=", np.min(avg_RMSE1_test))
print("parameters:")

# argmin yields a flat position; unravel_index turns it into one index per
# axis of avg_RMSE1_test. n1..n5 stay as separate notebook-level names because
# they are used to look up entries of param_grid here and when refitting.
# NOTE(review): assumes the axes of avg_RMSE1_test are ordered max_depth,
# max_features, min_samples_leaf, min_samples_split, n_estimators -- confirm
# against the cell that fills the array.
best_position = np.unravel_index(np.argmin(avg_RMSE1_test), avg_RMSE1_test.shape)
n1, n2, n3, n4, n5 = best_position
print(
    "max_depth=", param_grid["max_depth"][n1],
    "max_features=", param_grid["max_features"][n2],
    "min_samples_leaf=", param_grid["min_samples_leaf"][n3],
    "min_samples_split=", param_grid["min_samples_split"][n4],
    "n_estimators=", param_grid["n_estimators"][n5],
)
print("")

min RMSE in testset= 4549.305838591513
parameters:
max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200



In [11]:
# Refit a gradient-boosting model using the hyper-parameters selected by the
# indices n1..n5 into param_grid, then score it on the data it was fitted on.
gbr_best_kwargs = {
    "n_estimators": param_grid["n_estimators"][n5],
    "max_depth": param_grid["max_depth"][n1],
    "min_samples_leaf": param_grid["min_samples_leaf"][n3],
    "min_samples_split": param_grid["min_samples_split"][n4],
    "max_features": param_grid["max_features"][n2],
}
reg = GradientBoostingRegressor(random_state=42, **gbr_best_kwargs)
reg.fit(df, df_y)

# Predictions are made on the same inputs used for fitting, so the value
# printed below is an in-sample (training) MSE, not a held-out estimate.
pred = reg.predict(df)
print("MSE=", mean_squared_error(df_y, pred))

MSE= 4.020178554388553
