In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

# Read the Haberman CSV from the working directory and preview the first rows.
DATA_FILE = 'haberman.csv'
df = pd.read_csv(DATA_FILE)
df.head(5)

Unnamed: 0,patientAge,operationYear,nodesDetected,survivalStatus
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


# Factorize and split data into train/test

In [2]:
x_cols = ['patientAge', 'operationYear', 'nodesDetected']
y_col = 'survivalStatus'

# Build the factorized frame in one pass: every selected column is replaced by
# the integer codes produced by a shared LabelEncoder. The encoder is refit for
# each column, so after this statement `le` is fitted on the target column
# (the last one encoded).
# NOTE(review): the feature columns already look numeric in df.head(), and the
# encoder is fitted on the full data before the split -- confirm that encoding
# the features (rather than only the target) is intended.
le = preprocessing.LabelEncoder()
fdf = pd.DataFrame({col: le.fit_transform(df[col]) for col in [*x_cols, y_col]})

# Separate predictors from the target, then hold out 30% of the rows for
# testing (seeded so the split is repeatable).
X, y = fdf[x_cols], fdf[y_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Random Forest Classifier accuracy

In [3]:
# Baseline random forest: default hyper-parameters, fixed seed.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on versus the held-out split.
train_predictions = clf.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, train_predictions)
test_predictions = clf.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, test_predictions)

print(f'Acc Train: {train_accuracy}')
print(f'Acc Test: {test_accuracy}')

Acc Train: 0.985981308411215
Acc Test: 0.7282608695652174


#### Experiment with various parameter settings to find the best test set accuracy

In [4]:
# Sweep each hyper-parameter on its own over n = 1..20 (every other setting is
# left at its default, seed fixed) and record the accuracy on the held-out
# test split (X_test / y_test).
rfc_param_names = ['n_estimators', 'max_leaf_nodes', 'min_samples_leaf', 'max_depth']
rfc_sweep = {param_name: [] for param_name in rfc_param_names}

for param_name, accuracies in rfc_sweep.items():
    for n in range(1, 21):
        if param_name == 'max_leaf_nodes' and n == 1:
            # No model is fitted for max_leaf_nodes=1 (presumably because
            # scikit-learn rejects values below 2 -- confirm); a None
            # placeholder keeps this column aligned with the others.
            accuracies.append(None)
            continue
        clf = RandomForestClassifier(random_state=42, **{param_name: n})
        clf.fit(X_train, y_train)
        accuracies.append(metrics.accuracy_score(y_test, clf.predict(X_test)))

# Keep the per-parameter score lists available under their individual names.
(scores_n_estimators, scores_max_leaf_nodes,
 scores_min_samples_leaf, scores_max_depth) = rfc_sweep.values()

# One row per n (index 1..20, named 'n'), one column per swept parameter.
rfc_scores = pd.DataFrame(rfc_sweep, index=pd.RangeIndex(1, 21, name='n'))
rfc_scores

Unnamed: 0_level_0,n_estimators,max_leaf_nodes,min_samples_leaf,max_depth
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.652174,,0.728261,0.717391
2,0.673913,0.717391,0.728261,0.717391
3,0.619565,0.717391,0.73913,0.76087
4,0.630435,0.706522,0.73913,0.76087
5,0.630435,0.73913,0.73913,0.73913
6,0.652174,0.76087,0.73913,0.728261
7,0.673913,0.73913,0.75,0.728261
8,0.673913,0.76087,0.75,0.73913
9,0.663043,0.73913,0.76087,0.728261
10,0.684783,0.73913,0.75,0.73913


#### The best test accuracy appears to be 0.760870 when using ONE of the following:
* max_leaf_nodes = 6 or 8
* min_samples_leaf = 9 or 15
* max_depth = 3 or 4

# Gradient Boosting Classifier Accuracy

In [5]:
# Baseline gradient boosting model: default hyper-parameters, fixed seed.
gbrt = GradientBoostingClassifier(random_state=42)
gbrt.fit(X_train, y_train)

# Mean accuracy on the fitted data versus the held-out split.
train_accuracy = gbrt.score(X_train, y_train)
test_accuracy = gbrt.score(X_test, y_test)

print(f'Acc Train: {train_accuracy}')
print(f'Acc Test: {test_accuracy}')

Acc Train: 0.897196261682243
Acc Test: 0.7065217391304348


#### Experiment with various parameter settings to find the best test set accuracy

In [6]:
# Sweep each hyper-parameter on its own over n = 1..20 (every other setting is
# left at its default, seed fixed) and record the accuracy on the held-out
# test split (X_test / y_test).
# Each entry: (result column, estimator keyword, value passed at step n).
gbrt_sweep_plan = [
    ('n_estimators', 'n_estimators', lambda n: n),
    ('max_leaf_nodes', 'max_leaf_nodes', lambda n: n),
    ('min_samples_leaf', 'min_samples_leaf', lambda n: n),
    ('max_depth', 'max_depth', lambda n: n),
    # learning_rate is stepped in increments of 0.05, i.e. 0.05 .. 1.0.
    ('learning_rate (n/20)', 'learning_rate', lambda n: n / 20.0),
]
gbrt_sweep = {column: [] for column, keyword, value_at in gbrt_sweep_plan}

for column, keyword, value_at in gbrt_sweep_plan:
    accuracies = gbrt_sweep[column]
    for n in range(1, 21):
        if keyword == 'max_leaf_nodes' and n == 1:
            # No model is fitted for max_leaf_nodes=1 (presumably because
            # scikit-learn rejects values below 2 -- confirm); a None
            # placeholder keeps this column aligned with the others.
            accuracies.append(None)
            continue
        gbrt = GradientBoostingClassifier(random_state=42, **{keyword: value_at(n)})
        gbrt.fit(X_train, y_train)
        accuracies.append(gbrt.score(X_test, y_test))

# Keep the per-parameter score lists available under their individual names.
(scores_n_estimators, scores_max_leaf_nodes, scores_min_samples_leaf,
 scores_max_depth, scores_learning_rate) = gbrt_sweep.values()

# One row per n (index 1..20, named 'n'), one column per swept parameter.
gbrt_scores = pd.DataFrame(gbrt_sweep, index=pd.RangeIndex(1, 21, name='n'))
gbrt_scores

Unnamed: 0_level_0,n_estimators,max_leaf_nodes,min_samples_leaf,max_depth,learning_rate (n/20)
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.717391,,0.706522,0.73913,0.695652
2,0.717391,0.73913,0.706522,0.717391,0.706522
3,0.717391,0.728261,0.717391,0.706522,0.684783
4,0.717391,0.717391,0.684783,0.673913,0.673913
5,0.717391,0.717391,0.673913,0.663043,0.652174
6,0.75,0.684783,0.706522,0.663043,0.641304
7,0.728261,0.717391,0.684783,0.673913,0.663043
8,0.76087,0.706522,0.673913,0.663043,0.695652
9,0.76087,0.706522,0.663043,0.663043,0.684783
10,0.73913,0.706522,0.684783,0.673913,0.641304


#### The best test accuracy appears to be 0.760870 when n_estimators = 8 or 9
Overall, the test accuracy appears to be lower than with the Random Forest Classifier.