In [204]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn import linear_model
from sklearn import ensemble

# Decision Tree Classifier with no Hyperparameter Tuning

In [33]:
titanic = pd.read_csv('titanic.csv')
titanic

for i in range(len(titanic.PClass)):
    if titanic.PClass[i]=='1st':
        titanic.PClass[i] = 1
    elif titanic.PClass[i]=='2nd':
        titanic.PClass[i] = 2
    elif titanic.PClass[i]=='3rd':
        titanic.PClass[i] = 3
    else:
        titanic = titanic.drop(i, axis=0)

titanic = titanic.dropna()

X = titanic[['Age', 'SexCode', 'PClass']]
Y = titanic.Survived

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [39]:
#Specify model type
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(X_train, y_train) #fit the model



print(tree_model.score(X_train, y_train)) #how it performs on the data it was trained on
print(tree_model.score(X_test, y_test)) #how it performs on the data it was not trained on

0.8758278145695364
0.7763157894736842


# Decision Tree Classifier with Hyperparameter Tuning

In [40]:
X_train

Unnamed: 0,Age,SexCode,PClass
494,19.0,1,2
350,20.0,1,2
124,45.0,1,1
694,19.0,0,3
630,9.0,1,3
...,...,...,...
948,26.0,0,3
252,67.0,0,1
833,32.0,0,3
737,3.0,0,3


In [42]:
grid_para_tree = {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 31), 'max_features': range(1,4)}

grid_search_tree = ms.GridSearchCV(tree_model, grid_para_tree, cv=4, scoring='accuracy').fit(X_train, y_train)

print('Best parameters: '+ str(grid_search_tree.best_params_))

print('The best score is %.4f' %grid_search_tree.best_score_)

print('The training error is %.4f' %(grid_search_tree.best_estimator_.score(X_train, y_train)))

print('The testing error is %.4f' %(grid_search_tree.best_estimator_.score(X_test, y_test)))

In [48]:
print('Best parameters: '+ str(grid_search_tree.best_params_))

print('The best score is %.4f' %grid_search_tree.best_score_)

print('The training error is %.4f' %(grid_search_tree.best_estimator_.score(X_train, y_train)))

print('The testing error is %.4f' %(grid_search_tree.best_estimator_.score(X_test, y_test)))

Best parameters: {'criterion': 'gini', 'max_depth': 6, 'max_features': 2}
The best score is 0.8079
The training error is 0.8493
The testing error is 0.7961


# Decision Tree Regressor with no Hyperparameter Tuning

In [49]:
iris = load_iris()
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

X = df[df.columns[0:3]]
Y = df[df.columns[3]]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [57]:
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm)
137,6.4,3.1,5.5
84,5.4,3.0,4.5
27,5.2,3.5,1.5
127,6.1,3.0,4.9
132,6.4,2.8,5.6
...,...,...,...
9,4.9,3.1,1.5
103,6.3,2.9,5.6
67,5.8,2.7,4.1
117,7.7,3.8,6.7


In [56]:
tree_model = tree.DecisionTreeRegressor() #model type

tree_model.fit(X_train, y_train) #fit the model
print(tree_model.score(X_train, y_train)) #how it performs on the data it was trained on
print(tree_model.score(X_test, y_test)) #how it performs on data we held out

0.9986720653809774
0.8965493300006943


# Decision Tree Regressor with Hyperparameter Tuning

In [64]:
grid_para_tree = {'max_depth': range(1, 4), 'max_features': range(1,len(df.columns))}

grid_search_tree = ms.GridSearchCV(tree_model, grid_para_tree, cv=3, scoring='r2').fit(X_train, y_train)

print('Best parameters: '+ str(grid_search_tree.best_params_))

print('The best score is %.4f' %grid_search_tree.best_score_)

print('The training error is %.4f' %(grid_search_tree.best_estimator_.score(X_train, y_train)))

print('The testing error is %.4f' %(grid_search_tree.best_estimator_.score(X_test, y_test)))

Best parameters: {'max_depth': 3, 'max_features': 3}
The best score is 0.9224
The training error is 0.9542
The testing error is 0.8861


# Random Forest Classifier with No Hyperparameter Tuning

In [75]:
titanic = pd.read_csv('titanic.csv')

for i in range(len(titanic.PClass)):
    if titanic.PClass[i]=='1st':
        titanic.PClass[i] = 1
    elif titanic.PClass[i]=='2nd':
        titanic.PClass[i] = 2
    elif titanic.PClass[i]=='3rd':
        titanic.PClass[i] = 3
    else:
        titanic = titanic.drop(i, axis=0)

titanic = titanic.dropna()

X = titanic[['Age', 'SexCode', 'PClass']] #inputs
Y = titanic.Survived #outputs

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [205]:

randomForest = ensemble.RandomForestClassifier(random_state=1, n_estimators=200, n_jobs=-1) #store the model type we will use

# fit RF model on training set

randomForest.fit(X_train, y_train) #actual fitting

print("The training accuracy is: %.5f" % (randomForest.score(X_train, y_train))) #training error
print("The test     accuracy is: %.5f" % (randomForest.score(X_test, y_test))) #test error

ValueError: Unknown label type: 'continuous'

# Random Forest Classifier with Hyperparameter Tuning

In [88]:
lst = []

for i in [3,4,5,6,7]:
    for j in ['a','b','c','d','e']:
        for k in [14,15,16,17,18]:
            for l in ['f','g','h','i','j']:
                lst.append([i,j,k,l])
                
lst

[[3, 'a', 14, 'f'],
 [3, 'a', 14, 'g'],
 [3, 'a', 14, 'h'],
 [3, 'a', 14, 'i'],
 [3, 'a', 14, 'j'],
 [3, 'a', 15, 'f'],
 [3, 'a', 15, 'g'],
 [3, 'a', 15, 'h'],
 [3, 'a', 15, 'i'],
 [3, 'a', 15, 'j'],
 [3, 'a', 16, 'f'],
 [3, 'a', 16, 'g'],
 [3, 'a', 16, 'h'],
 [3, 'a', 16, 'i'],
 [3, 'a', 16, 'j'],
 [3, 'a', 17, 'f'],
 [3, 'a', 17, 'g'],
 [3, 'a', 17, 'h'],
 [3, 'a', 17, 'i'],
 [3, 'a', 17, 'j'],
 [3, 'a', 18, 'f'],
 [3, 'a', 18, 'g'],
 [3, 'a', 18, 'h'],
 [3, 'a', 18, 'i'],
 [3, 'a', 18, 'j'],
 [3, 'b', 14, 'f'],
 [3, 'b', 14, 'g'],
 [3, 'b', 14, 'h'],
 [3, 'b', 14, 'i'],
 [3, 'b', 14, 'j'],
 [3, 'b', 15, 'f'],
 [3, 'b', 15, 'g'],
 [3, 'b', 15, 'h'],
 [3, 'b', 15, 'i'],
 [3, 'b', 15, 'j'],
 [3, 'b', 16, 'f'],
 [3, 'b', 16, 'g'],
 [3, 'b', 16, 'h'],
 [3, 'b', 16, 'i'],
 [3, 'b', 16, 'j'],
 [3, 'b', 17, 'f'],
 [3, 'b', 17, 'g'],
 [3, 'b', 17, 'h'],
 [3, 'b', 17, 'i'],
 [3, 'b', 17, 'j'],
 [3, 'b', 18, 'f'],
 [3, 'b', 18, 'g'],
 [3, 'b', 18, 'h'],
 [3, 'b', 18, 'i'],
 [3, 'b', 18, 'j'],


In [91]:
grid_para_forest = {
    'max_depth': range(1, 5), # depth = varying max tree_depths
    'n_estimators': range(50, 300, 50), # n_estimators = number of trees in the forest
    'max_features': range(1, len(df.columns))}
                    
grid_search_forest = ms.GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', cv=3, n_jobs=-1, verbose=True)
grid_search_forest.fit(X_train, y_train)
                          
print(grid_search_forest.best_params_)
print(grid_search_forest.best_score_)
print("The training accuracy is: %.5f" % (grid_search_forest.best_estimator_.score(X_train, y_train)))
print("The test     accuracy is: %.5f" % (grid_search_forest.best_estimator_.score(X_test, y_test)))

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   39.6s finished


{'max_depth': 4, 'max_features': 3, 'n_estimators': 50}
0.8145575751605012
The training accuracy is: 0.83940
The test     accuracy is: 0.80263


# Random Forest Regressor with no Hyperparameter Tuning

In [97]:
iris = load_iris()
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

X = df[df.columns[0:3]]
Y = df[df.columns[3]]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [103]:
from sklearn import ensemble
randomForest = ensemble.RandomForestRegressor() #store the model type we will use

# fit RF model on training set
randomForest.set_params(random_state=0, n_estimators=200) #setting random state
randomForest.fit(X_train, y_train) #actual fitting

print("The training r^2 is: %.5f" % (randomForest.score(X_train, y_train))) #training error
print("The test     r^2 is: %.5f" % (randomForest.score(X_test, y_test))) #test error

The training r^2 is: 0.98995
The test     r^2 is: 0.92250


# Random Forest Regressor with Hyperparameter Tuning

In [109]:
grid_para_forest = {
    'max_depth': range(1, 6), # depth = varying max tree_depths
    'n_estimators': range(50, 400, 50), # n_estimators = number of trees in the forest
    'max_features': range(1, len(X.columns)+1)}
                    
                          
grid_search_forest = ms.GridSearchCV(randomForest, grid_para_forest, scoring='r2', cv=2, n_jobs=-1, verbose=True)
grid_search_forest.fit(X_train, y_train)
                          
print(grid_search_forest.best_params_)
print(grid_search_forest.best_score_)
print("The training r^2 is: %.5f" % (grid_search_forest.best_estimator_.score(X_train, y_train)))
print("The test     r^2 is: %.5f" % (grid_search_forest.best_estimator_.score(X_test, y_test)))

Fitting 2 folds for each of 105 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   30.6s finished


{'max_depth': 3, 'max_features': 3, 'n_estimators': 300}
0.940793827275316
The training r^2 is: 0.96210
The test     r^2 is: 0.90651


# Ensembling

In [151]:
#Data Cleaning

iris = load_iris()
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

X = df[df.columns[0:3]]
Y = df[df.columns[3]]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [152]:
#TREE

tree_model = tree.DecisionTreeRegressor()

grid_para_tree = {'max_depth': range(1, 31), 'max_features': range(1,4)}

grid_search_tree = ms.GridSearchCV(tree_model, grid_para_tree, cv=4, scoring='r2').fit(X_train, y_train)

In [153]:
#FOREST

randomForest = ensemble.RandomForestRegressor(n_jobs=-1) #store the model type we will use

grid_para_forest = {
    'max_depth': range(1, 5), # depth = varying max tree_depths
    'n_estimators': range(50, 300, 50), # n_estimators = number of trees in the forest
    'max_features': range(1, len(df.columns))}
                    
grid_search_forest = ms.GridSearchCV(randomForest, grid_para_forest, scoring='r2', cv=3, n_jobs=-1, verbose=True)
grid_search_forest.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   29.7s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [148]:



# grid_search_forest.predict([[1,2,3]])

In [154]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm)
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4


In [157]:
print(grid_search_tree.predict([[4,3.2,1.6]]))
print(grid_search_forest.predict([[4,3.2,1.6]]))

[0.17857143]
[0.21928679]


In [208]:
def model_ensemble(SL, SW, PL, n_estimator_max, tree_weight=0.5):
    
    iris = load_iris()
    df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

    X = df[df.columns[0:3]]
    Y = df[df.columns[3]]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    #TREE

    print('Beginning Tree Training...')
    
    tree_model = tree.DecisionTreeRegressor()

    grid_para_tree = {'max_depth': range(1, 31), 'max_features': range(1,4)}

    grid_search_tree = ms.GridSearchCV(tree_model, grid_para_tree, cv=4, scoring='r2').fit(X_train, y_train)
    
    tree_prediction = grid_search_tree.predict([[SL, SW, PL]])
    
    print(tree_prediction)
    
    #FOREST

    randomForest = ensemble.RandomForestRegressor(n_jobs=-1) #store the model type we will use

    grid_para_forest = {
        'max_depth': range(1, 5), # depth = varying max tree_depths
        'n_estimators': range(50, n_estimator_max, 50), # n_estimators = number of trees in the forest
        'max_features': range(1, len(df.columns))}

    grid_search_forest = ms.GridSearchCV(randomForest, grid_para_forest, scoring='r2', cv=3, n_jobs=-1, verbose=True)
    
    print('Beginning Forest Training...')
    
    grid_search_forest.fit(X_train, y_train)
    
    forest_prediction = grid_search_forest.predict([[SL, SW, PL]])
    print(forest_prediction)
    
    return 'FINAL ESTIMATE: ' + str((((tree_weight*tree_prediction) + ((1-tree_weight)*forest_prediction)))[0]) #average


In [209]:
model_ensemble(6, 4, 2, 200, .224)

Beginning Tree Training...
[0.34285714]
Beginning Forest Training...
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   15.2s finished


[0.33536744]


'FINAL ESTIMATE: 0.3370451335187026'