In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases (the estimator reprs recorded
# in this notebook come from such a release).
boston = load_boston()
# Feature matrix and target passed to rf.fit / GridSearchCV.fit in later cells.
X = boston.data
y = boston.target

In [7]:
rf = RandomForestRegressor(random_state=2020)

In [8]:
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=2020, verbose=0, warm_start=False)

In [10]:
rf.estimators_

[DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=2088543072, splitter='best'),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=639299976, splitter='best'),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impuri

In [12]:
# Average the first-row prediction of every fitted tree in the ensemble; the
# value matches rf.predict(X[:1]) in the next cell (both display 25.315).
np.mean([tree.predict(X[:1]) for tree in rf.estimators_])

25.315

In [13]:
rf.predict(X[:1])

array([25.315])

In [14]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2020,
 'verbose': 0,
 'warm_start': False}

In [15]:
rf.score(X, y)

0.9820284033401907

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
?GridSearchCV

In [21]:
from sklearn.metrics import mean_squared_error, make_scorer

# greater_is_better=False makes the scorer report negated MSE, which is why
# the *_test_score columns of grid.cv_results_ shown later are negative.
loss_function = make_scorer(mean_squared_error, greater_is_better=False)

In [22]:
# Hyper-parameter grid searched for the bare RandomForestRegressor:
# 4 ensemble sizes x 4 max_features values x 3 leaf sizes.
params = dict(
    n_estimators=[5, 10, 25, 50],
    max_features=[0.3, 0.4, 0.5, 0.6],
    min_samples_leaf=[5, 10, 15],
)

In [23]:
grid = GridSearchCV(estimator=rf, param_grid=params, scoring=loss_function, cv=5)

In [24]:
grid.fit(X, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=2020,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [25]:
grid.best_params_

{'max_features': 0.6, 'min_samples_leaf': 5, 'n_estimators': 25}

In [27]:
grid_results = pd.DataFrame(grid.cv_results_)

In [31]:
grid_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
38,0.05575,0.002735,0.003008,0.001254871,0.6,5,25,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",-9.072322,-11.581454,-16.86085,-47.769405,-14.281004,-19.913007,14.170034,1
39,0.108707,0.008525,0.004783,0.0007357699,0.6,5,50,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",-8.65356,-12.248097,-18.719754,-46.411869,-14.692544,-20.145165,13.53605,2
37,0.022537,0.00216,0.001191,0.0004018703,0.6,5,10,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",-9.659311,-12.333912,-14.684548,-50.923818,-15.689954,-20.658309,15.27511,3
27,0.099989,0.004068,0.004793,0.0003849758,0.5,5,50,"{'max_features': 0.5, 'min_samples_leaf': 5, '...",-8.402161,-12.489176,-21.989901,-46.178528,-15.101285,-20.83221,13.421862,4
15,0.09389,0.004101,0.004185,0.001002004,0.4,5,50,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",-8.120806,-13.693727,-26.005641,-42.40621,-15.286086,-21.102494,12.125459,5
14,0.04728,0.001833,0.002394,0.0004769763,0.4,5,25,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",-7.995264,-16.243807,-23.782032,-43.362928,-15.243349,-21.325476,12.101191,6
26,0.049867,0.00208,0.002394,0.0004891648,0.5,5,25,"{'max_features': 0.5, 'min_samples_leaf': 5, '...",-7.863692,-13.138678,-22.799273,-47.603821,-15.685508,-21.418195,13.947997,7
43,0.095547,0.007742,0.004585,0.001216826,0.6,10,50,"{'max_features': 0.6, 'min_samples_leaf': 10, ...",-9.455479,-12.400328,-25.79771,-45.819755,-15.717116,-21.838078,13.19595,8
42,0.049667,0.002476,0.002393,0.0004885612,0.6,10,25,"{'max_features': 0.6, 'min_samples_leaf': 10, ...",-10.02517,-11.565475,-24.593605,-48.041933,-16.003523,-22.045941,13.950858,9
13,0.019346,0.002647,0.001596,0.0004884803,0.4,5,10,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",-7.468959,-16.110047,-26.149814,-45.855373,-15.185688,-22.153976,13.254877,10


In [32]:
grid_results.groupby('param_min_samples_leaf')['mean_test_score'].mean()

param_min_samples_leaf
5    -22.883309
10   -25.253401
15   -26.797558
Name: mean_test_score, dtype: float64

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), rf)

In [35]:
pipe.steps

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('randomforestregressor',
  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        max_samples=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        n_estimators=100, n_jobs=None, oob_score=False,
                        random_state=2020, verbose=0, warm_start=False))]

In [36]:
# Grid for the pipeline: each key is "<step name>__<parameter>", where the
# step name is the one shown by pipe.steps ('randomforestregressor').
params = dict(
    randomforestregressor__min_samples_leaf=[5, 10, 15],
    randomforestregressor__max_features=[0.4, 0.5, 0.6],
)

In [37]:
grid = GridSearchCV(estimator=pipe, param_grid=params, scoring=loss_function, cv=10)

In [38]:
grid.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [39]:
from category_encoders import OrdinalEncoder, OneHotEncoder

In [40]:
ore = OrdinalEncoder()
ohe = OneHotEncoder()

In [41]:
sc = StandardScaler()

''

In [43]:
cd ../data

C:\Users\Jonat\ga\DAT-01-21\Lectures\Unit4\data


In [44]:
ls

 Volume in drive C is OS
 Volume Serial Number is 0499-EE93

 Directory of C:\Users\Jonat\ga\DAT-01-21\Lectures\Unit4\data

03/17/2020  06:16 PM    <DIR>          .
03/17/2020  06:16 PM    <DIR>          ..
03/17/2020  06:16 PM    <DIR>          iowa_housing
02/23/2020  09:36 PM            67,684 MNCAATourneyCompactResults.csv
02/23/2020  09:36 PM            34,402 MNCAATourneySeeds.csv
               2 File(s)        102,086 bytes
               3 Dir(s)  813,484,126,208 bytes free


In [45]:
seeds   = pd.read_csv('MNCAATourneySeeds.csv')
results = pd.read_csv('MNCAATourneyCompactResults.csv')

In [46]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [47]:
results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [55]:
# Drop the leading character of each seed string (e.g. 'W01' -> '01').
# The previous version ended with a stray `.str`, which assigned the string
# accessor object itself to the column instead of the sliced strings.
seeds['Seed'] = seeds['Seed'].str[1:]

In [58]:
def is_int(val):
    """Convert a seed string to an integer.

    Despite the name, this returns the parsed integer rather than a boolean.

    Parameters
    ----------
    val : str
        Seed text such as ``'01'``. If it cannot be parsed directly, the
        final character is dropped and parsing is retried (presumably to
        handle a one-letter suffix such as ``'16a'`` -- confirm against the
        data).

    Returns
    -------
    int
        The numeric seed.

    Raises
    ------
    ValueError
        If the value is still not numeric after dropping its last character.
    """
    # The builtin int replaces np.int, which was only an alias for it and has
    # been removed from NumPy. Catching ValueError alone keeps unrelated
    # failures (e.g. a non-string value) visible instead of masking them.
    try:
        return int(val)
    except ValueError:
        return int(val[:-1])

In [60]:
seeds['Seed'] = seeds['Seed'].apply(is_int)

In [61]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


In [62]:
results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [63]:
results = results[['Season', 'WTeamID', 'WScore', 'LTeamID', 'LScore']]

In [64]:
results.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore
0,1985,1116,63,1234,54
1,1985,1120,59,1345,58
2,1985,1207,68,1250,43
3,1985,1229,58,1425,55
4,1985,1242,49,1325,38


In [65]:
results.columns = ['Season', 'T1TeamID', 'T1Score', 'T2TeamID', 'T2Score']

In [67]:
results_swap = results[['Season', 'T2TeamID', 'T2Score', 'T1TeamID', 'T1Score']]

In [68]:
results.head()

Unnamed: 0,Season,T1TeamID,T1Score,T2TeamID,T2Score
0,1985,1116,63,1234,54
1,1985,1120,59,1345,58
2,1985,1207,68,1250,43
3,1985,1229,58,1425,55
4,1985,1242,49,1325,38


In [70]:
results_swap.columns = results.columns
results_swap.head()

Unnamed: 0,Season,T1TeamID,T1Score,T2TeamID,T2Score
0,1985,1234,54,1116,63
1,1985,1345,58,1120,59
2,1985,1250,43,1207,68
3,1985,1425,55,1229,58
4,1985,1325,38,1242,49


In [73]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


In [75]:
results = results.merge(seeds, how='left', left_on=['Season', 'T1TeamID'], right_on=['Season', 'TeamID'])

In [77]:
# Remove the redundant join key and name the merged seed column 'T1Seed'.
# rename() replaces the previous write into results.columns.values, which
# edits the Index's backing array directly and can leave label lookups stale
# (or fail outright when that array is read-only).
results = results.drop(columns='TeamID').rename(columns={'Seed': 'T1Seed'})

In [78]:
results.head()

Unnamed: 0,Season,T1TeamID,T1Score,T2TeamID,T2Score,T1Seed
0,1985,1116,63,1234,54,9
1,1985,1120,59,1345,58,11
2,1985,1207,68,1250,43,1
3,1985,1229,58,1425,55,9
4,1985,1242,49,1325,38,3


In [79]:
# Attach the seed of the second team: left-join on (Season, T2TeamID), drop
# the redundant join key, and name the new column 'T2Seed'. rename() is used
# instead of writing into results.columns.values, which bypasses pandas'
# Index bookkeeping.
results = (
    results
    .merge(seeds, how='left', left_on=['Season', 'T2TeamID'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'T2Seed'})
)

In [81]:
results['Result'] = 1

In [82]:
results.head()

Unnamed: 0,Season,T1TeamID,T1Score,T2TeamID,T2Score,T1Seed,T2Seed,Result
0,1985,1116,63,1234,54,9,8,1
1,1985,1120,59,1345,58,11,6,1
2,1985,1207,68,1250,43,1,16,1
3,1985,1229,58,1425,55,9,8,1
4,1985,1242,49,1325,38,3,14,1


In [83]:
results_swap.head()

Unnamed: 0,Season,T1TeamID,T1Score,T2TeamID,T2Score
0,1985,1234,54,1116,63
1,1985,1345,58,1120,59
2,1985,1250,43,1207,68
3,1985,1425,55,1229,58
4,1985,1325,38,1242,49


In [1]:
# NOTE(review): game_data is not defined in any cell visible here, so this
# raises NameError on a fresh kernel (see the recorded output). Presumably it
# should be built from results and results_swap -- TODO confirm and add the
# defining cell before this one.
game_data.head()

NameError: name 'game_data' is not defined