In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split
from AdaBoostClassifier import AdaBoostClassifier
from DecisionTree import decision_tree_classifier, Criterion
from RandomForest import RandomForest


In [116]:
# Reading data
d = pd.read_csv(
    "data/train.csv")[['Age', 'Sex', 'Fare', 'Pclass', 'Survived','PassengerId']].dropna()
d = d.assign(Sex=d.Sex.eq('male').astype(int))


# Constructing the X and Y matrices
X = d[['Age', 'Sex', 'Fare', 'Pclass','PassengerId']]
Y = d['Survived'].values.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [117]:
X_train

Unnamed: 0,Age,Sex,Fare,Pclass,PassengerId
328,31.0,0,20.5250,3,329
73,26.0,1,14.4542,3,74
253,30.0,1,16.1000,3,254
719,33.0,1,7.7750,3,720
666,25.0,1,13.0000,2,667
...,...,...,...,...,...
92,46.0,1,61.1750,1,93
134,25.0,1,13.0000,2,135
337,41.0,0,134.5000,1,338
548,33.0,1,20.5250,3,549


In [118]:
accuracy_scores = []
def get_accuracy(yhat,y_test):
    yhat = yhat.tolist()

    same = 0
    for i in range(len(yhat)):
        if yhat[i] == y_test[i]:
            same += 1

    print(f"ACCURACY: {same/len(yhat):.5f}")
    return same/len(yhat)


In [119]:
print("Ada Boost Classifier")
ab = AdaBoostClassifier(5, 0.001)
ab.fit(X_train, y_train)
# Predicting

yhat = ab.predict(X_test)
accuracy_scores.append(("Ada Boost Classifier",get_accuracy(yhat,y_test)))

Ada Boost Classifier
ACCURACY: 0.73427


In [120]:
print("DecisionTree Classifier - ENTROPY")
dt = decision_tree_classifier(
    Criterion.ENTROPY, min_samples_split=10, max_depth=5, min_samples_leaf=1)
root_node = dt.fit(X_train, y_train)
dt.print_tree(root_node)

# Predicting

yhat = dt.predict(X_test)
accuracy_scores.append(("DecisionTree Classifier - ENTROPY",get_accuracy(yhat,y_test)))

DecisionTree Classifier - ENTROPY
Root
-------- Split rule: Sex <= 0.5
---------------- Split rule: Pclass <= 2.5
------------------------ Split rule: PassengerId <= 362.5
-------------------------------- Split rule: Age <= 2.5
-------------------------------- Split rule: Age > 2.5
---------------------------------------- Split rule: Fare <= 28.856
---------------------------------------- Split rule: Fare > 28.856
------------------------ Split rule: PassengerId > 362.5
-------------------------------- Split rule: PassengerId <= 854.5
-------------------------------- Split rule: PassengerId > 854.5
---------------------------------------- Split rule: PassengerId <= 856.0
---------------------------------------- Split rule: PassengerId > 856.0
---------------- Split rule: Pclass > 2.5
------------------------ Split rule: Age <= 27.5
-------------------------------- Split rule: Fare <= 23.087
---------------------------------------- Split rule: Age <= 6.5
--------------------------------

In [121]:
print("DecisionTree Classifier - MISCLASSIFICATION")
dt = decision_tree_classifier(
    Criterion.MISCLASSIFICATION_RATE, min_samples_split=10, max_depth=5, min_samples_leaf=1)
root_node = dt.fit(X_train, y_train)
dt.print_tree(root_node)

# Predicting

yhat = dt.predict(X_test)
accuracy_scores.append(("DecisionTree Classifier - MISCLASSIFICATION",get_accuracy(yhat,y_test)))


DecisionTree Classifier - MISCLASSIFICATION
Root
-------- Split rule: Sex <= 0.5
---------------- Split rule: Fare <= 6.988
---------------- Split rule: Fare > 6.988
-------- Split rule: Sex > 0.5
---------------- Split rule: Age <= 3.5
------------------------ Split rule: Fare <= 39.344
-------------------------------- Split rule: PassengerId <= 48.0
-------------------------------- Split rule: PassengerId > 48.0
------------------------ Split rule: Fare > 39.344
---------------- Split rule: Age > 3.5
------------------------ Split rule: Age <= 77.0
-------------------------------- Split rule: Fare <= 387.665
-------------------------------- Split rule: Fare > 387.665
------------------------ Split rule: Age > 77.0
ACCURACY: 0.75524


In [122]:
print("DecisionTree Classifier - GINI IMPURITY")
dt = decision_tree_classifier(
    Criterion.ENTROPY, min_samples_split=10, max_depth=5, min_samples_leaf=1)
root_node = dt.fit(X_train, y_train)
dt.print_tree(root_node)

# Predicting

yhat = dt.predict(X_test)
accuracy_scores.append(("DecisionTree Classifier - GINI IMPURITY",get_accuracy(yhat,y_test)))

DecisionTree Classifier - GINI IMPURITY
Root
-------- Split rule: Sex <= 0.5
---------------- Split rule: Pclass <= 2.5
------------------------ Split rule: PassengerId <= 362.5
-------------------------------- Split rule: Age <= 2.5
-------------------------------- Split rule: Age > 2.5
---------------------------------------- Split rule: Fare <= 28.856
---------------------------------------- Split rule: Fare > 28.856
------------------------ Split rule: PassengerId > 362.5
-------------------------------- Split rule: PassengerId <= 854.5
-------------------------------- Split rule: PassengerId > 854.5
---------------------------------------- Split rule: PassengerId <= 856.0
---------------------------------------- Split rule: PassengerId > 856.0
---------------- Split rule: Pclass > 2.5
------------------------ Split rule: Age <= 27.5
-------------------------------- Split rule: Fare <= 23.087
---------------------------------------- Split rule: Age <= 6.5
--------------------------

In [123]:
print("RANDOM-FOREST - using decision tree classifier - GINI IMPURITY")
rf = RandomForest(dt,num_trees = 5,min_features = 2)
rf.fit(X,Y)
yhat = rf.predict(X_test)
accuracy_scores.append(("RANDOM-FOREST - using decision tree classifier - GINI IMPURITY",get_accuracy(yhat,y_test)))



RANDOM-FOREST - using decision tree classifier - GINI IMPURITY
['Fare', 'PassengerId']
['Pclass', 'Fare', 'Age', 'PassengerId']
['PassengerId', 'Fare', 'Age', 'Pclass', 'Sex']
['Sex', 'Age', 'Fare', 'Pclass']
['Age', 'Sex', 'Fare', 'PassengerId', 'Pclass']
ACCURACY: 0.60839


In [124]:
print("RANDOM-FOREST - using ada boost classifier")
rf = RandomForest(ab,num_trees = 5,min_features = 2)
rf.fit(X,Y)
yhat = rf.predict(X_test)
accuracy_scores.append(("RANDOM-FOREST - using ada boost classifier",get_accuracy(yhat,y_test)))

RANDOM-FOREST - using ada boost classifier
['Fare', 'Pclass', 'Age', 'PassengerId']
['Age', 'PassengerId', 'Pclass']
['Fare', 'Sex', 'Pclass', 'PassengerId', 'Age']
['PassengerId', 'Fare', 'Age', 'Sex']
['PassengerId', 'Sex', 'Pclass', 'Age', 'Fare']
ACCURACY: 0.61538


In [128]:
# print(accuracy_scores)
names = []
scores = []
for name,accuracy in accuracy_scores:
    names.append(name)
    scores.append(accuracy)

print("")
import pandas as pd
data = {"Method":names,"Scores":scores}
df = pd.DataFrame(data)
df






Unnamed: 0,Method,Scores
0,Ada Boost Classifier,0.734266
1,DecisionTree Classifier - ENTROPY,0.776224
2,DecisionTree Classifier - MISCLASSIFICATION,0.755245
3,DecisionTree Classifier - GINI IMPURITY,0.776224
4,RANDOM-FOREST - using decision tree classifier...,0.608392
5,RANDOM-FOREST - using ada boost classifier,0.615385
