# Bagging

In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [3]:
X = np.load('./tatanic_X_train.npy')
y = np.load('./tatanic_y_train.npy')

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,
                                                   random_state=101)

In [5]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(622, 27)
(622,)
(267, 27)
(267,)


In [6]:
X_train[0]

array([0.22334962, 0.01517579, 0.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [7]:
y_train[:10]

array([0., 1., 1., 1., 0., 0., 1., 1., 0., 1.])

In [8]:
X_test[0]

array([0.30873565, 0.01571255, 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [9]:
y_test[:10]

array([0., 0., 1., 1., 0., 0., 0., 0., 0., 1.])

In [10]:
clf2 = DecisionTreeClassifier(random_state=1)
eclf = BaggingClassifier(clf2, oob_score=True)

In [11]:
clf2.fit(X_train, y_train)
preds = clf2.predict(X_test)
clf2.score(X_test, y_test)

0.7865168539325843

In [12]:
eclf.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [13]:
predictions = eclf.predict(X_test)

In [14]:
eclf.score(X_test, y_test)

0.8202247191011236

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
confusion_matrix(y_test, predictions)

array([[145,  18],
       [ 30,  74]], dtype=int64)

In [17]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.83      0.89      0.86       163
         1.0       0.80      0.71      0.76       104

   micro avg       0.82      0.82      0.82       267
   macro avg       0.82      0.80      0.81       267
weighted avg       0.82      0.82      0.82       267



In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
cross_val_score(eclf, X, y, cv=5).mean()

  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


0.8256713007046276

In [20]:
params = {'n_estimators' : [10,20,30,40,50,55], 'max_samples' : [0.5,0.6,
        0.7, 0.8, 0.9, 1]}

In [48]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator = eclf, param_grid = params, n_jobs = -1,
                   cv=5)
grid = grid.fit(X_train, y_train)



In [49]:
grid.best_score_

0.8295819935691319

In [50]:
grid.best_params_

{'max_features': 15, 'n_estimators': 100}

In [51]:
grid.best_estimator_.oob_score_

0.8102893890675241

In [52]:
grid.best_estimator_

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=15,
         max_samples=1.0, n_estimators=100, n_jobs=None, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [53]:
grid.score(X_test, y_test)

0.8689138576779026

# Random Forest

In [32]:
import numpy as np

In [33]:
X = np.load('./tatanic_X_train.npy')
y = np.load('./tatanic_y_train.npy')

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                random_state=101)

In [63]:
X_train[0]

array([0.22334962, 0.01517579, 0.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [64]:
y_train[:10]

array([0., 1., 1., 1., 0., 0., 1., 1., 0., 1.])

In [67]:
from sklearn.ensemble import RandomForestClassifier
eclf = RandomForestClassifier(n_estimators = 100, max_features=2, n_jobs=7,
                             oob_score=True)

In [68]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X_train, y_train, cv=5).mean()

0.8038914490527395

In [69]:
params={'n_estimators':[10,20,30,50,100], 'max_features':
        [1,2,3,4,5,6,7,10,15,20,25, len(X[0])]}

In [70]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator = eclf, param_grid = params, n_jobs = -1,
                   cv=5)
grid = grid.fit(X_train, y_train)



In [71]:
grid.best_score_

0.8215434083601286

In [72]:
grid.best_params_

{'max_features': 10, 'n_estimators': 30}

In [73]:
grid.best_estimator_.oob_score_

0.7942122186495176

In [74]:
grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=7,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [75]:
grid.score(X_test, y_test)

0.846441947565543

In [76]:
from sklearn.ensemble import RandomForestClassifier

btree = RandomForestClassifier(max_features = 20, n_estimators = 20)

In [77]:
btree.fit(X_train, y_train)
btree.score(X_test, y_test)

0.8314606741573034

In [78]:
btree.feature_importances_

array([0.2213669 , 0.22465444, 0.01870283, 0.08293071, 0.14035583,
       0.03951072, 0.0143025 , 0.01060701, 0.00785385, 0.01269783,
       0.00071332, 0.00137944, 0.00146879, 0.01183057, 0.00525502,
       0.15056849, 0.01227889, 0.00381434, 0.01254937, 0.00146553,
       0.0021914 , 0.00622618, 0.01118374, 0.00586177, 0.0002305 ,
       0.        , 0.        ])

# Adaboost

In [79]:
import numpy as np

In [81]:
elements = ['one', 'two', 'three']
weights = [0.2, 0.3, 0.5]

from numpy.random import choice
print(choice(elements))
print(choice(elements, size=10, replace=True, p=weights))

one
['one' 'two' 'two' 'three' 'three' 'three' 'two' 'three' 'one' 'one']


In [82]:
X = np.load('./tatanic_X_train.npy')
y = np.load('./tatanic_y_train.npy')

In [83]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,
                                                   random_state=101)

In [84]:
X_train[:2]

array([[0.22334962, 0.01517579, 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.22334962, 0.04489301, 0.11111111, 0.5       , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

In [85]:
y_train[:10]

array([0., 1., 1., 1., 0., 0., 1., 1., 0., 1.])

In [86]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [87]:
eclf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier
        (max_depth=2), n_estimators=500, learning_rate=0.1)

In [88]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X_train, y_train, cv=5).mean()

0.8006912442396313

In [89]:
from sklearn.tree import DecisionTreeClassifier
DecisionTreeClassifier()

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [90]:
AdaBoostClassifier()

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [91]:
params = {"base_estimator__criterion" : ["gini", "entropy"],
"base_estimator__max_features" : [7,8,],
"base_estimator__max_depth" : [1,2,3,4,5],
"n_estimators": [23,24, 25, 26, 27],
"learning_rate": [0.4, 0.45, 0.5, 0.55, 0.6]
}

In [92]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator = eclf, param_grid = params, n_jobs = -1,
                   cv=5)
grid = grid.fit(X_train, y_train)



In [93]:
grid.best_estimator_.feature_importances_

array([0.24665954, 0.20325456, 0.02627932, 0.06691766, 0.04793137,
       0.06236273, 0.04424134, 0.01847835, 0.01845338, 0.01650334,
       0.00176866, 0.00584678, 0.00289474, 0.01697956, 0.01471794,
       0.05563815, 0.03905126, 0.00996011, 0.0277398 , 0.        ,
       0.00715002, 0.01495474, 0.0047324 , 0.02485926, 0.01276289,
       0.00986212, 0.        ])

In [94]:
grid.score(X_train, y_train)

0.9003215434083601

In [95]:
grid.score(X_test, y_test)

0.7940074906367042

# Gradient Boosting

In [7]:
import pandas as pd
import numpy as np

In [9]:
train = pd.read_csv('./house_price/train.csv')
test = pd.read_csv('./house_price/test.csv')

In [10]:
print(train.columns)
print(test.columns)
train.info()

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [11]:
pd.DataFrame(train.isnull().sum())

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,259
LotArea,0
Street,0
Alley,1369
LotShape,0
LandContour,0
Utilities,0


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train,test,)

ValueError: too many values to unpack (expected 4)