In [13]:
import json
import datetime
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean
import statsmodels.api as sm
import datetime,time

In [14]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor

In [15]:
# Directory prefix prepended to every data file name when it is opened.
path = "tweet_data/"

# One training file per hashtag, named "tweets_#<hashtag>.txt".
topics = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]
files = ["tweets_#{}.txt".format(topic) for topic in topics]

# Test files are named "sample<k>_period<p>.txt" for k in 0..2 and p in 1..3.
test_files = ["sample{}_period{}.txt".format(sample, period)
              for sample in range(3)
              for period in range(1, 4)]

# Show which position in `files` belongs to which hashtag file.
for i, fl in enumerate(files):
    print("files[{}] => {}".format(i, fl))

files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [16]:
def parse_dataset(file):
    """
    Parse one tweet file into hourly features and next-hour targets.

    Every line of the file is decoded as one JSON tweet record. Tweets are
    bucketed by hour in America/Los_Angeles time; hour index 0 is the hour
    starting at midnight on January 14th.

    Parameters
    ----------
    file : str
        File name; it is appended to the module-level ``path``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        x: one row per hour with the columns ``tweets``, ``retweets``,
        ``followers sum``, ``followers max`` and ``hour of day``. Hours
        without any tweet have their counts set to 0.
        y: the number of tweets in the hour that follows each row of x.
    """
    pst_tz = pytz.timezone('America/Los_Angeles')

    # extract raw features; the context manager closes the file even if a
    # line fails to parse
    data_raw = []
    with open(path + file, 'r', encoding="utf-8") as tweet_file:
        for line in tweet_file:
            a = json.loads(line)
            data_raw.append([a['citation_date'],
                             a['metrics']['citations']['total'],
                             a['author']['followers']])

    # sort according to time
    pddata_raw = pd.DataFrame(data_raw, columns=['time', 'retweets', 'followers'])
    pddata_raw = pddata_raw.sort_values(by='time').reset_index(drop=True)
    pddata_raw['tweets'] = 1

    # reset time to hour index
    # NOTE: the formula counts every month as 31 days, so the index is only
    # contiguous for data that runs from January into February.
    hour_accu = []
    for stamp in pddata_raw['time']:
        p = datetime.datetime.fromtimestamp(stamp, pst_tz)
        hour_accu.append(((p.month - 1) * 31 + p.day - 14) * 24 + p.hour)
    pddata_raw['time'] = hour_accu

    # rows are sorted by time, so the last row carries the latest hour index
    n_hours = int(pddata_raw['time'].iloc[-1]) + 1

    # create a new dataframe with one row per hour; the group-by results are
    # aligned on the hour index, which leaves NaN for hours without tweets
    by_hour = pddata_raw.groupby('time')
    df = pd.DataFrame({'hour index': list(range(n_hours))})
    df['tweets'] = by_hour['tweets'].sum()
    df['retweets'] = by_hour['retweets'].sum()
    df['followers sum'] = by_hour['followers'].sum()
    df['followers max'] = by_hour['followers'].max()
    # Derived from the hour index (hour index % 24 equals the local hour by
    # construction) so that hours without tweets keep their real hour of day
    # instead of being zero-filled below.
    df['hour of day'] = (df['hour index'] % 24).astype(float)
    # hour 0 is discarded, then the empty hours are filled with zeros
    df = df.drop([0]).fillna(0).reset_index(drop=True)

    # assign number of tweets of the next hour to be the target value
    df_y = df['tweets'].iloc[1:].reset_index(drop=True)
    df = df.iloc[:len(df_y)]

    # drop the leading 'hour index' helper column from the features
    return df.iloc[:, 1:], df_y

In [17]:
def parse_dataset_ten_features(file):
    """
    Parse one tweet file into ten hourly features and next-hour targets.

    Every line of the file is decoded as one JSON tweet record. Tweets are
    bucketed by hour in America/Los_Angeles time; hour index 0 is the hour
    starting at midnight on January 14th.

    Parameters
    ----------
    file : str
        File name; it is appended to the module-level ``path``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        x: one row per hour with the columns
            tweets:           number of tweets
            retweets:         sum of the tweets' total citation counts
            followers sum:    sum of the authors' follower counts
            followers max:    largest follower count among the authors
            mentioned:        total number of user mentions
            media:            total number of extended-entity media items
            active:           mean of statuses_count / (2015 - account
                              creation year + 1) over the tweets
            author:           number of distinct author names
            favourites_count: sum of the users' favourites counts
            title:            mean length of the tweets' titles
        Hours without any tweet have every column set to 0.
        y: the number of tweets in the hour that follows each row of x.
    """
    pst_tz = pytz.timezone('America/Los_Angeles')

    # extract raw features; the context manager closes the file even if a
    # line fails to parse
    data_raw = []
    with open(path + file, 'r', encoding="utf-8") as tweet_file:
        for line in tweet_file:
            a = json.loads(line)
            tweet = a['tweet']
            user = tweet['user']
            medi = len(tweet['extended_entities']['media']) if 'extended_entities' in tweet else 0
            # the last four characters of created_at are read as the year
            hist_yr = user['created_at'][-4:]
            acti = user["statuses_count"] / (2015 - float(hist_yr) + 1)
            data_raw.append([a['citation_date'],
                             a['metrics']['citations']['total'],
                             a['author']['followers'],
                             len(tweet['entities']['user_mentions']),
                             medi,
                             acti,
                             a['author']['name'],
                             user['favourites_count'],
                             len(a['title'])])

    # sort according to time
    pddata_raw = pd.DataFrame(data_raw, columns=['time', 'retweets', 'followers', 'mentioned', 'media',
                                                 'active', 'author', 'favourites_count', 'title'])
    pddata_raw = pddata_raw.sort_values(by='time').reset_index(drop=True)
    pddata_raw['tweets'] = 1

    # reset time to hour index
    # NOTE: the formula counts every month as 31 days, so the index is only
    # contiguous for data that runs from January into February.
    hour_accu = []
    for stamp in pddata_raw['time']:
        p = datetime.datetime.fromtimestamp(stamp, pst_tz)
        hour_accu.append(((p.month - 1) * 31 + p.day - 14) * 24 + p.hour)
    pddata_raw['time'] = hour_accu

    # rows are sorted by time, so the last row carries the latest hour index
    n_hours = int(pddata_raw['time'].iloc[-1]) + 1

    # create a new dataframe with one row per hour; the group-by results are
    # aligned on the hour index, which leaves NaN for hours without tweets
    by_hour = pddata_raw.groupby('time')
    df = pd.DataFrame({'hour index': list(range(n_hours))})
    df['tweets'] = by_hour['tweets'].sum()
    df['retweets'] = by_hour['retweets'].sum()
    df['followers sum'] = by_hour['followers'].sum()
    df['followers max'] = by_hour['followers'].max()
    df['mentioned'] = by_hour['mentioned'].sum()
    df['media'] = by_hour['media'].sum()
    df['active'] = by_hour['active'].mean()
    df['author'] = by_hour['author'].nunique()  # count number of not-repeating authors
    df['favourites_count'] = by_hour['favourites_count'].sum()
    df['title'] = by_hour['title'].mean()

    # hour 0 is discarded, then the empty hours are filled with zeros
    df = df.drop([0]).fillna(0).reset_index(drop=True)

    # assign number of tweets of the next hour to be the target value
    df_y = df['tweets'].iloc[1:].reset_index(drop=True)
    df = df.iloc[:len(df_y)]

    # drop the leading 'hour index' helper column from the features
    return df.iloc[:, 1:], df_y

In [18]:
# df,df_y=parse_dataset_ten_features(files[1])
# print(df.shape,df_y.shape)
# df[0:50]

In [19]:
def plot_recipe(df_y, pred_y):
    """
    Scatter the fitted values (y axis) against the true values (x axis).

    A thin dashed black diagonal is drawn across the range of the true
    values; points on it are predictions equal to the true value.
    """
    marker_area = np.pi * (4)**2/4
    lowest, highest = df_y.min(), df_y.max()
    diagonal = [lowest, highest]

    plt.figure()
    plt.scatter(df_y, pred_y, s=marker_area)
    plt.plot(diagonal, diagonal, 'k--', lw=1)
    plt.xlabel('true values')
    plt.ylabel('fitted values')
    plt.show()

In [20]:

def ols_regression(df, df_y):
    """
    Fit an ordinary least squares model of df_y on df and print the result.

    A constant (intercept) column is added to the features before fitting.
    The statsmodels summary is printed, followed by the list of feature
    column names of ``df``. Nothing is returned.
    """
    X2 = sm.add_constant(df)
    # Series.as_matrix() was removed in pandas 1.0; np.asarray gives the same
    # plain array on every pandas version.
    y = np.asarray(df_y)
    lm = sm.OLS(y, X2).fit()
    print(lm.summary())
    print(list(df))

In [21]:
#aggregate all data
# Collect the per-file pieces and concatenate them once at the end:
# DataFrame.append / Series.append were removed in pandas 2.0 and copied the
# whole accumulated frame on every iteration.
feature_parts = []
target_parts = []
rows_so_far = 0
for i, fl in enumerate(files):
    df_temp, df_temp_y = parse_dataset_ten_features(fl)
    feature_parts.append(df_temp)
    target_parts.append(df_temp_y)
    rows_so_far += df_temp.shape[0]
    if i > 0:
        # shape of the piece just added, then shape of everything so far
        print(df_temp.shape, (rows_so_far, df_temp.shape[1]))

df = pd.concat(feature_parts, ignore_index=True)
df_y = pd.concat(target_parts, ignore_index=True)

(574, 10) (1151, 10)
(585, 10) (1736, 10)
(585, 10) (2321, 10)
(585, 10) (2906, 10)
(585, 10) (3491, 10)


In [22]:
# Hyper-parameter values explored for the random forest.
param_grid={
'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
# None means "consider every feature at each split". For regressors this is
# what 'auto' meant, but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, while None is accepted by every version.
'max_features': [None, 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [200, 400, 600, 800, 1000,
1200, 1400, 1600, 1800, 2000]
}
# Shuffled 5-fold split with a fixed seed, so the folds are reproducible.
kf = KFold(n_splits=5,random_state=42,shuffle=True)

In [23]:
#RandomForestRegressor

In [24]:
# Exhaustive grid search: for every hyper-parameter combination, fit a random
# forest on each cross-validation fold and record the pooled train/test RMSE
# and the mean out-of-bag error (1 - OOB R^2).

# The result arrays take their shape from the grid itself, so editing
# param_grid can never leave them too small or partly unfilled.
grid_keys = ('max_depth', 'max_features', 'min_samples_leaf',
             'min_samples_split', 'n_estimators')
grid_shape = tuple(len(param_grid[key]) for key in grid_keys)
avg_RMSE_train = np.zeros(grid_shape)
avg_RMSE_test = np.zeros(grid_shape)
avg_oob = np.zeros(grid_shape)

# np.ndindex varies the last axis fastest, i.e. the same visiting order as
# five nested loops over grid_keys.
for n1, n2, n3, n4, n5 in np.ndindex(*grid_shape):
    max_depth = param_grid['max_depth'][n1]
    max_features = param_grid['max_features'][n2]
    min_samples_leaf = param_grid['min_samples_leaf'][n3]
    min_samples_split = param_grid['min_samples_split'][n4]
    n_estimators = param_grid['n_estimators'][n5]

    # Per-fold squared-error sums (MSE * fold size); they are pooled below so
    # that every sample carries the same weight in the final RMSE.
    MSE_train = []
    MSE_test = []
    oob_error = []
    total_train = 0
    total_test = 0
    for train_index, test_index in kf.split(df):
        X_train = df.iloc[train_index]
        y_train = df_y.iloc[train_index]
        X_test = df.iloc[test_index]
        y_test = df_y.iloc[test_index]
        reg = RandomForestRegressor(oob_score=True,
                                    n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_samples_leaf=min_samples_leaf,
                                    min_samples_split=min_samples_split,
                                    bootstrap=True,
                                    max_features=max_features,
                                    random_state=42)
        reg.fit(X_train, y_train)
        pred_train = reg.predict(X_train)
        pred_test = reg.predict(X_test)
        oob_error.append(1 - reg.oob_score_)
        MSE_train.append(mean_squared_error(y_train, pred_train) * len(train_index))
        MSE_test.append(mean_squared_error(y_test, pred_test) * len(test_index))
        total_train = total_train + len(train_index)
        total_test = total_test + len(test_index)

    avg_RMSE_test[n1, n2, n3, n4, n5] = np.sqrt(sum(MSE_test) / total_test)
    avg_RMSE_train[n1, n2, n3, n4, n5] = np.sqrt(sum(MSE_train) / total_train)
    avg_oob[n1, n2, n3, n4, n5] = mean(oob_error)
    print("max_depth=",max_depth,"max_features=",max_features,"min_samples_leaf=",min_samples_leaf,"min_samples_split=",min_samples_split,"n_estimators=",n_estimators)
    print("RMSE_train=",avg_RMSE_train[n1,n2,n3,n4,n5],"RMSE_test=",avg_RMSE_test[n1,n2,n3,n4,n5],"oob=",avg_oob[n1,n2,n3,n4,n5])
    print("")

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 1814.1988961675663 RMSE_test= 4707.9354801064 oob= 0.3569720887783383

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 1789.9145249017465 RMSE_test= 4720.4453203852245 oob= 0.3559795880799912

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 1755.7456255342938 RMSE_test= 4703.208313109233 oob= 0.3554031037925789

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 1744.8342036850843 RMSE_test= 4713.44074060564 oob= 0.3524505310070561

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 1748.1237163085507 RMSE_test= 4721.389148273278 oob= 0.354556510677296

max_depth= 10 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 1751.8696656

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1600
RMSE_train= 2620.4626406610955 RMSE_test= 4955.698533977563 oob= 0.3789231686966502

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1800
RMSE_train= 2620.9062389763594 RMSE_test= 4964.02271017503 oob= 0.3790870444270925

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 2000
RMSE_train= 2618.517865785405 RMSE_test= 4960.862165637417 oob= 0.3785725527647451

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 3456.263428524737 RMSE_test= 4935.3731293243145 oob= 0.4066546645535882

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 400
RMSE_train= 3452.521576666194 RMSE_test= 4944.450776308151 oob= 0.4011647619448613

max_depth= 10 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 600
RMSE_train= 3437.584

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 1695.0224228145992 RMSE_test= 4453.634835820852 oob= 0.3368151083929068

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 1692.8916279070982 RMSE_test= 4458.226493845716 oob= 0.33458532956778264

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1400
RMSE_train= 1691.1327048776407 RMSE_test= 4454.648106476469 oob= 0.3343293253243356

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1600
RMSE_train= 1697.6731201226678 RMSE_test= 4460.589134595095 oob= 0.33577776314396673

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1800
RMSE_train= 1697.9408190414672 RMSE_test= 4448.778163974289 oob= 0.3365168059475801

max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 2000
RMSE_train= 169

max_depth= 10 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 400
RMSE_train= 3522.172926310324 RMSE_test= 4819.799217526375 oob= 0.393332789246675

max_depth= 10 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 600
RMSE_train= 3495.637307403448 RMSE_test= 4815.597919942167 oob= 0.39034970937815894

max_depth= 10 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 800
RMSE_train= 3486.746764920033 RMSE_test= 4820.101049742313 oob= 0.38907331605527895

max_depth= 10 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1000
RMSE_train= 3490.4611279184596 RMSE_test= 4810.316515969999 oob= 0.3892580461602012

max_depth= 10 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 3494.0095641124562 RMSE_test= 4807.480275263875 oob= 0.3893303092866917

max_depth= 10 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 3486

max_depth= 20 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1800
RMSE_train= 1757.9791719215655 RMSE_test= 4694.330176717846 oob= 0.3568441903151527

max_depth= 20 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 2000
RMSE_train= 1755.1791103457233 RMSE_test= 4696.756012433943 oob= 0.35706073984656445

max_depth= 20 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 200
RMSE_train= 2313.7425903403264 RMSE_test= 4864.085732019369 oob= 0.36718258302305495

max_depth= 20 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 400
RMSE_train= 2300.862658529271 RMSE_test= 4861.516641038722 oob= 0.3686258015361964

max_depth= 20 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 600
RMSE_train= 2270.630546995961 RMSE_test= 4847.184979120423 oob= 0.3678461374306049

max_depth= 20 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 800
RMSE_train= 2264.3819

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1200
RMSE_train= 3430.551580925473 RMSE_test= 4953.589589072776 oob= 0.4012125004775555

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1400
RMSE_train= 3427.4473080394796 RMSE_test= 4947.301516475533 oob= 0.40059558468809686

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1600
RMSE_train= 3429.431204307096 RMSE_test= 4949.610136662299 oob= 0.40032802575155735

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 1800
RMSE_train= 3429.701837356202 RMSE_test= 4953.0532007496995 oob= 0.39994142061078447

max_depth= 20 max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 3433.2098729282043 RMSE_test= 4956.993673094479 oob= 0.3995830127043869

max_depth= 20 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 200
RMSE_train= 

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 600
RMSE_train= 2367.539084296261 RMSE_test= 4613.405621723887 oob= 0.3520441651046333

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 800
RMSE_train= 2369.894292395383 RMSE_test= 4617.144909208058 oob= 0.35144023855018736

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1000
RMSE_train= 2368.620641852763 RMSE_test= 4610.1987566962125 oob= 0.35147499850380715

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1200
RMSE_train= 2368.9494004935273 RMSE_test= 4615.311253491364 oob= 0.3515874627964027

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1400
RMSE_train= 2370.197387717699 RMSE_test= 4619.610709082937 oob= 0.35205906443411805

max_depth= 20 max_features= sqrt min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1600
RMSE_train= 2371.96

max_depth= 20 max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 2000
RMSE_train= 3481.7278438634494 RMSE_test= 4817.002548764905 oob= 0.38714764757738096

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 200
RMSE_train= 3857.468990525663 RMSE_test= 4939.343003955784 oob= 0.4096300029324713

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 400
RMSE_train= 3865.611219251735 RMSE_test= 4960.024789724428 oob= 0.40895491971074704

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 600
RMSE_train= 3860.8309247874618 RMSE_test= 4966.182976643276 oob= 0.40793496042103616

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 800
RMSE_train= 3854.4687982098158 RMSE_test= 4966.61480679922 oob= 0.40660875538255115

max_depth= 20 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1000
RMSE_train= 3852.84

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1400
RMSE_train= 2277.523985417786 RMSE_test= 4855.551514930924 oob= 0.3689914258085489

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1600
RMSE_train= 2278.0760208382535 RMSE_test= 4855.4737368320475 oob= 0.36824483682560855

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 1800
RMSE_train= 2275.1224825700883 RMSE_test= 4862.09871943811 oob= 0.36769373532699595

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 5 n_estimators= 2000
RMSE_train= 2271.8957837269363 RMSE_test= 4863.556229832697 oob= 0.36738907195360565

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 3323.912433341378 RMSE_test= 4899.499593908618 oob= 0.3973357396547052

max_depth= 40 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 3328

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 800
RMSE_train= 3620.546123358542 RMSE_test= 5048.847449195128 oob= 0.4094982687333781

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1000
RMSE_train= 3616.671144440298 RMSE_test= 5045.723049048908 oob= 0.41032191143368774

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1200
RMSE_train= 3614.745425223916 RMSE_test= 5043.832463299298 oob= 0.41008805786905617

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1400
RMSE_train= 3613.050843041995 RMSE_test= 5033.295994081369 oob= 0.40939553405622875

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1600
RMSE_train= 3613.5194404883337 RMSE_test= 5039.902513260997 oob= 0.4086010313288595

max_depth= 40 max_features= auto min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1800
RMSE_train= 3613.31

max_depth= 40 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 200
RMSE_train= 3305.4899832167584 RMSE_test= 4752.10094382693 oob= 0.377841749257151

max_depth= 40 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 400
RMSE_train= 3327.268313241713 RMSE_test= 4775.322381946441 oob= 0.38169072132907844

max_depth= 40 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 600
RMSE_train= 3296.9592493580312 RMSE_test= 4773.141704272041 oob= 0.37781793237698025

max_depth= 40 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 800
RMSE_train= 3288.2087949949455 RMSE_test= 4774.2971662333375 oob= 0.376335494700661

max_depth= 40 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1000
RMSE_train= 3289.6711260099805 RMSE_test= 4763.785328183034 oob= 0.37698835711725187

max_depth= 40 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1200
RMSE_train= 329

max_depth= 40 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1600
RMSE_train= 3857.630481363914 RMSE_test= 4960.239020363685 oob= 0.4091254728232322

max_depth= 40 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 1800
RMSE_train= 3855.1349987680164 RMSE_test= 4967.832030761785 oob= 0.40834481399254513

max_depth= 40 max_features= sqrt min_samples_leaf= 4 min_samples_split= 2 n_estimators= 2000
RMSE_train= 3854.7417240702457 RMSE_test= 4965.041570788765 oob= 0.4080488793783999

max_depth= 40 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 200
RMSE_train= 3859.384983044582 RMSE_test= 4965.531202834159 oob= 0.40967189852676766

max_depth= 40 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 400
RMSE_train= 3870.5927373839386 RMSE_test= 4975.464812258375 oob= 0.411238351212323

max_depth= 40 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 3865.5345

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1000
RMSE_train= 3289.079861965704 RMSE_test= 4909.114187486905 oob= 0.3932363856520932

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1200
RMSE_train= 3295.330847703174 RMSE_test= 4916.294945613617 oob= 0.39343618689510407

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1400
RMSE_train= 3291.427904800479 RMSE_test= 4914.557994741648 oob= 0.39301666324665735

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1600
RMSE_train= 3295.50182609969 RMSE_test= 4915.342363108791 oob= 0.3930137452476656

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1800
RMSE_train= 3294.0908049667246 RMSE_test= 4917.674287275542 oob= 0.3925841138864968

max_depth= 60 max_features= auto min_samples_leaf= 1 min_samples_split= 10 n_estimators= 2000
RMSE_train= 32

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 400
RMSE_train= 3635.2901349366653 RMSE_test= 5021.502275837392 oob= 0.41055114083096567

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 600
RMSE_train= 3625.2449929356885 RMSE_test= 5035.2994534776435 oob= 0.4094820476840183

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 800
RMSE_train= 3620.546123358542 RMSE_test= 5048.847449195128 oob= 0.4094982687333781

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1000
RMSE_train= 3616.671144440298 RMSE_test= 5045.723049048908 oob= 0.41032191143368774

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 3614.745425223916 RMSE_test= 5043.832463299298 oob= 0.41008805786905617

max_depth= 60 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 3613.05

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 1800
RMSE_train= 3286.440767762836 RMSE_test= 4762.264992484091 oob= 0.376418806193379

max_depth= 60 max_features= sqrt min_samples_leaf= 1 min_samples_split= 10 n_estimators= 2000
RMSE_train= 3288.2276420463477 RMSE_test= 4765.614614988705 oob= 0.37601995329366966

max_depth= 60 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 200
RMSE_train= 2694.241445842179 RMSE_test= 4752.348837041025 oob= 0.36574786708229823

max_depth= 60 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 400
RMSE_train= 2680.566769675362 RMSE_test= 4741.092897852718 oob= 0.36456554168746985

max_depth= 60 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 600
RMSE_train= 2659.051767730534 RMSE_test= 4717.246589602021 oob= 0.36270590148843673

max_depth= 60 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 800
RMSE_train= 2658.365

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1200
RMSE_train= 3852.753141219758 RMSE_test= 4958.410127397157 oob= 0.40868042313680736

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1400
RMSE_train= 3854.520457970732 RMSE_test= 4953.391045039127 oob= 0.4089791216450007

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1600
RMSE_train= 3857.630481363914 RMSE_test= 4960.239020363685 oob= 0.4091254728232322

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 1800
RMSE_train= 3855.1349987680164 RMSE_test= 4967.832030761785 oob= 0.40834481399254513

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 5 n_estimators= 2000
RMSE_train= 3854.7417240702457 RMSE_test= 4965.041570788765 oob= 0.4080488793783999

max_depth= 60 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 200
RMSE_train= 3994.8

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 600
RMSE_train= 2489.9877811081524 RMSE_test= 5001.936128178622 oob= 0.38222286510871883

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 800
RMSE_train= 2482.820439660079 RMSE_test= 5010.641131729755 oob= 0.3781789759284029

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1000
RMSE_train= 2488.3551210467303 RMSE_test= 5005.1796268921635 oob= 0.3804450566142895

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1200
RMSE_train= 2495.0239760915006 RMSE_test= 5012.764357936072 oob= 0.3804516449971698

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 2490.378528850284 RMSE_test= 5009.437827873974 oob= 0.37915605399166175

max_depth= 80 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 2489.4

max_depth= 80 max_features= auto min_samples_leaf= 4 min_samples_split= 5 n_estimators= 2000
RMSE_train= 3613.1992357916793 RMSE_test= 5040.647668981232 oob= 0.4076992467979301

max_depth= 80 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 200
RMSE_train= 3805.0879130069084 RMSE_test= 5054.837061539337 oob= 0.42174372243194713

max_depth= 80 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 400
RMSE_train= 3808.565977673211 RMSE_test= 5051.143642936738 oob= 0.41870898859427297

max_depth= 80 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 600
RMSE_train= 3797.952024638974 RMSE_test= 5064.597581913008 oob= 0.4173559939476971

max_depth= 80 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 800
RMSE_train= 3789.6030154918376 RMSE_test= 5077.807687921276 oob= 0.4167355181380116

max_depth= 80 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1000
RMSE_train= 3785

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1200
RMSE_train= 2666.2558299056077 RMSE_test= 4706.045917981876 oob= 0.36375690020696305

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1400
RMSE_train= 2665.6727983741234 RMSE_test= 4692.269261216477 oob= 0.3637676674617414

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1600
RMSE_train= 2667.7562870168144 RMSE_test= 4695.580574501849 oob= 0.3630880685755158

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 2665.0282198577984 RMSE_test= 4700.114131447022 oob= 0.3624539784600138

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 2664.1476236806034 RMSE_test= 4701.676158964286 oob= 0.36279776515356826

max_depth= 80 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 2798

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 600
RMSE_train= 3999.3497033401654 RMSE_test= 5011.609970138227 oob= 0.4138859946455376

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 800
RMSE_train= 3990.2386841829225 RMSE_test= 5010.36308820144 oob= 0.4127968053626902

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1000
RMSE_train= 3992.592683183413 RMSE_test= 4997.515112174333 oob= 0.41288936538907084

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1200
RMSE_train= 3993.572688615406 RMSE_test= 4996.442505516442 oob= 0.4135752690354801

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1400
RMSE_train= 3991.450770088581 RMSE_test= 4989.068080971598 oob= 0.4134427135296623

max_depth= 80 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 3991

max_depth= 100 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 1800
RMSE_train= 2489.3505697445808 RMSE_test= 5012.654428473683 oob= 0.37701987418318283

max_depth= 100 max_features= auto min_samples_leaf= 2 min_samples_split= 2 n_estimators= 2000
RMSE_train= 2487.8907202310716 RMSE_test= 5007.198398021586 oob= 0.3768314833263904

max_depth= 100 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 2649.5042287025353 RMSE_test= 4964.999226307397 oob= 0.38298443571289703

max_depth= 100 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 2646.0902625985036 RMSE_test= 4984.615672970371 oob= 0.3814260894646336

max_depth= 100 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 600
RMSE_train= 2617.688530180844 RMSE_test= 4963.021865933453 oob= 0.3814952381155876

max_depth= 100 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 800
RMSE_train= 26

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1000
RMSE_train= 3785.2650395324017 RMSE_test= 5075.287880410405 oob= 0.4173242413580197

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1200
RMSE_train= 3785.4530505742055 RMSE_test= 5075.999654505371 oob= 0.41727890288422786

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1400
RMSE_train= 3784.2892563392793 RMSE_test= 5066.678707767814 oob= 0.41716894810068694

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 3784.5101826653267 RMSE_test= 5073.16233686778 oob= 0.41657716585595206

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1800
RMSE_train= 3783.6753773868018 RMSE_test= 5077.306154753291 oob= 0.41632854264342645

max_depth= 100 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 2000
RM

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 200
RMSE_train= 2798.8160258497815 RMSE_test= 4665.521954097283 oob= 0.3684158862423333

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 400
RMSE_train= 2793.4824822293863 RMSE_test= 4724.489595910295 oob= 0.3679617022661592

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 600
RMSE_train= 2775.8448064674353 RMSE_test= 4718.140221912688 oob= 0.36904422883287685

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 800
RMSE_train= 2773.645195181735 RMSE_test= 4713.361694474005 oob= 0.36564982065962903

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1000
RMSE_train= 2778.525536690295 RMSE_test= 4705.418966012818 oob= 0.36693574532797224

max_depth= 100 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1200
RMSE_train= 27

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1400
RMSE_train= 3991.450770088581 RMSE_test= 4989.068080971598 oob= 0.4134427135296623

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1600
RMSE_train= 3991.4165368286126 RMSE_test= 4993.467708533714 oob= 0.4130665450460859

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1800
RMSE_train= 3990.4687178499316 RMSE_test= 4997.473540211605 oob= 0.4126554281955857

max_depth= 100 max_features= sqrt min_samples_leaf= 4 min_samples_split= 10 n_estimators= 2000
RMSE_train= 3990.0330572775038 RMSE_test= 4996.079528924701 oob= 0.41212521161188

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 1817.8077758692918 RMSE_test= 4700.474368276735 oob= 0.35925213022595914

max_depth= 200 max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train=

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 600
RMSE_train= 2617.688530180844 RMSE_test= 4963.021865933453 oob= 0.3814952381155876

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 800
RMSE_train= 2610.8833337596166 RMSE_test= 4974.536692960553 oob= 0.37725396262187316

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1000
RMSE_train= 2617.352806643474 RMSE_test= 4968.3726603436835 oob= 0.38001747276218084

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1200
RMSE_train= 2623.5140138175475 RMSE_test= 4972.810162575653 oob= 0.37996929075921676

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1400
RMSE_train= 2618.926235132617 RMSE_test= 4963.849373625133 oob= 0.37947973118833417

max_depth= 200 max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1600
RMSE_train=

max_depth= 200 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 1800
RMSE_train= 3783.6753773868018 RMSE_test= 5077.306154753291 oob= 0.41632854264342645

max_depth= 200 max_features= auto min_samples_leaf= 4 min_samples_split= 10 n_estimators= 2000
RMSE_train= 3786.053173557274 RMSE_test= 5078.536366249116 oob= 0.41608337165554754

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 1765.7018379195194 RMSE_test= 4557.9443720968375 oob= 0.343479256412939

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 1748.6826869642796 RMSE_test= 4538.707038353448 oob= 0.34563475611483463

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 1706.458986043336 RMSE_test= 4516.512721254019 oob= 0.34121861555141614

max_depth= 200 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train=

max_depth= 200 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1000
RMSE_train= 2778.525536690295 RMSE_test= 4705.418966012818 oob= 0.36693574532797224

max_depth= 200 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1200
RMSE_train= 2779.3758498497905 RMSE_test= 4697.877614336967 oob= 0.3659913864726772

max_depth= 200 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1400
RMSE_train= 2779.101543626184 RMSE_test= 4687.790720884992 oob= 0.36651262658462874

max_depth= 200 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1600
RMSE_train= 2777.9275990453657 RMSE_test= 4689.4210294358845 oob= 0.3653557363973491

max_depth= 200 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1800
RMSE_train= 2779.1826560428367 RMSE_test= 4693.890576151818 oob= 0.3657847364910074

max_depth= 200 max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 2000
RMSE_train

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200
RMSE_train= 1817.8077758692918 RMSE_test= 4700.474368276735 oob= 0.35925213022595914

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 400
RMSE_train= 1791.6605396409363 RMSE_test= 4693.529857219777 oob= 0.3579487362207656

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 1752.0361083351331 RMSE_test= 4694.276177210476 oob= 0.35549376949301054

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 1742.1014580043106 RMSE_test= 4713.893156726255 oob= 0.3530014627661395

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 1745.1761761756857 RMSE_test= 4722.684618864264 oob= 0.35499784036621196

max_depth= None max_features= auto min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_t

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1400
RMSE_train= 2618.926235132617 RMSE_test= 4963.849373625133 oob= 0.37947973118833417

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1600
RMSE_train= 2618.399100880682 RMSE_test= 4962.943296922036 oob= 0.37837956352260044

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1800
RMSE_train= 2617.2238733806535 RMSE_test= 4972.16348923838 oob= 0.37794879726457514

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 5 n_estimators= 2000
RMSE_train= 2615.0491917704976 RMSE_test= 4968.947289382783 oob= 0.37759894699684615

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 3453.982113464079 RMSE_test= 4938.420043351321 oob= 0.4061430883851096

max_depth= None max_features= auto min_samples_leaf= 2 min_samples_split= 10 n_estimators= 400
RMSE_

max_depth= None max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 600
RMSE_train= 1706.458986043336 RMSE_test= 4516.512721254019 oob= 0.34121861555141614

max_depth= None max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 800
RMSE_train= 1695.6959002482818 RMSE_test= 4488.664891708227 oob= 0.3383277502542739

max_depth= None max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1000
RMSE_train= 1695.9028807475277 RMSE_test= 4469.667619969189 oob= 0.3391733952229056

max_depth= None max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1200
RMSE_train= 1700.9427700016481 RMSE_test= 4466.740545596612 oob= 0.3393798885805643

max_depth= None max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1400
RMSE_train= 1698.7493723523103 RMSE_test= 4473.422116890827 oob= 0.3389979777389022

max_depth= None max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1600
RMSE_tr

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 1800
RMSE_train= 2779.1826560428367 RMSE_test= 4693.890576151818 oob= 0.3657847364910074

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 5 n_estimators= 2000
RMSE_train= 2779.334971975721 RMSE_test= 4696.744416973492 oob= 0.36534511634026673

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 200
RMSE_train= 3501.3152753472577 RMSE_test= 4810.747153188922 oob= 0.3930568750966577

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 400
RMSE_train= 3511.873875217969 RMSE_test= 4828.629843253242 oob= 0.3925889213903284

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 600
RMSE_train= 3485.0617048564154 RMSE_test= 4816.664186151502 oob= 0.3894017551675869

max_depth= None max_features= sqrt min_samples_leaf= 2 min_samples_split= 10 n_estimators= 800
RMSE_t

In [25]:

# Report the grid cell with the smallest averaged value for each metric.
# The flat argmin is unravelled into five indices that are looked up in
# param_grid in this axis order: max_depth, max_features, min_samples_leaf,
# min_samples_split, n_estimators.
# NOTE: n1..n5 are reassigned on every pass, so after this cell they hold the
# indices of the test-RMSE minimum (the last metric processed).
grid_axes = ("max_depth", "max_features", "min_samples_leaf", "min_samples_split", "n_estimators")

for metric_label, metric_grid in (("min oob =", avg_oob), ("min RMSE in testset=", avg_RMSE_test)):
    print(metric_label, np.min(metric_grid))
    print("parameters:")
    n1, n2, n3, n4, n5 = np.unravel_index(np.argmin(metric_grid), metric_grid.shape)
    # Interleave "name=" labels with the selected values; print() joins them with spaces.
    chosen = []
    for axis_name, axis_pos in zip(grid_axes, (n1, n2, n3, n4, n5)):
        chosen += [axis_name + "=", param_grid[axis_name][axis_pos]]
    print(*chosen)
    print("")

min oob = 0.3325525295376627
parameters:
max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 200

min RMSE in testset= 4448.778163974289
parameters:
max_depth= 10 max_features= sqrt min_samples_leaf= 1 min_samples_split= 2 n_estimators= 1800



In [26]:
# Refit a random forest on the full df / df_y using the grid indices n1..n5
# currently in scope (presumably set by the preceding selection cell — verify
# after out-of-order execution), then print the MSE of its predictions on the
# same data it was fitted on (i.e. an in-sample error).
reg = RandomForestRegressor(
    bootstrap=True,
    oob_score=True,
    random_state=42,
    **{
        axis_name: param_grid[axis_name][axis_pos]
        for axis_name, axis_pos in (
            ("n_estimators", n5),
            ("max_depth", n1),
            ("min_samples_leaf", n3),
            ("min_samples_split", n4),
            ("max_features", n2),
        )
    }
).fit(df, df_y)  # fit() returns the estimator itself
pred = reg.predict(df)
print("MSE=", mean_squared_error(df_y, pred))

MSE= 2634245.6964638233
