# Machine Learning Methods: Titanic Data Collection

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import joblib
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import svm

In [9]:
# Load the passenger table and discard every row that is missing a value in
# any of the listed columns.
titanic_file = pd.read_csv("titanic.csv").dropna(
    subset=["Sex", "Fare", "Pclass", "SibSp", "Age"]
)

# Number of repetitions used by the experiment loop (`range(TRIALS)`).
TRIALS = 100

In [None]:
# Compare several classifiers on the Titanic data over TRIALS random
# train/test splits and report the mean and standard deviation of the
# test-set accuracy for each of them.

# Feature preparation does not depend on the split, so it is done once
# instead of being repeated on every trial.
X = titanic_file.drop(
    columns=["Survived", "Name", "Ticket", "PassengerId", "Embarked", "Cabin", "Parch"]
)
y = titanic_file["Survived"]
le_sex = LabelEncoder()
# Replace the textual "Sex" column with an integer-encoded "Sex_n" column.
X["Sex_n"] = le_sex.fit_transform(X["Sex"])
X = X.drop(columns=["Sex"])

# Classifiers fitted with their default settings, in evaluation order.
baseline_models = (
    ("gauss", GaussianNB),
    ("bernoulli", BernoulliNB),
    ("multinomial", MultinomialNB),
    ("svm", svm.SVC),
    ("dt_base", DecisionTreeClassifier),
)

# Depth grid searched for the tuned decision tree.
parameters = {"max_depth": range(3, 20)}

# Per-classifier accuracy history. Plain lists are appended to in the loop
# and converted to arrays once at the end, which avoids re-copying the whole
# array on every trial as repeated np.vstack calls would.
scores = {name: [] for name, _ in baseline_models}
scores["dt_cross_validated"] = []

for _ in range(TRIALS):
    # No random_state is passed, so each trial draws a fresh 75/25 split.
    X_training, X_testing, y_training, y_testing = train_test_split(
        X, y, test_size=0.25
    )

    # Every baseline is trained and scored on the same split.
    for name, make_model in baseline_models:
        model = make_model()
        model.fit(X_training, y_training)
        prediction = model.predict(X_testing)
        score = accuracy_score(y_testing, prediction)
        scores[name].append(score)

    # Decision tree whose max_depth is chosen by grid search on the training
    # portion only; the held-out test portion is used solely for scoring.
    model = DecisionTreeClassifier()
    clf = GridSearchCV(model, parameters, n_jobs=4)
    clf.fit(X=X_training, y=y_training)
    cross_validated_model = clf.best_estimator_
    prediction = cross_validated_model.predict(X_testing)
    score = accuracy_score(y_testing, prediction)
    scores["dt_cross_validated"].append(score)

# Expose the results under the original names as (n_trials, 1) float arrays.
data_gauss = np.array(scores["gauss"], dtype=float).reshape(-1, 1)
data_bernoulli = np.array(scores["bernoulli"], dtype=float).reshape(-1, 1)
data_multinomial = np.array(scores["multinomial"], dtype=float).reshape(-1, 1)
data_svm = np.array(scores["svm"], dtype=float).reshape(-1, 1)
data_dt_base = np.array(scores["dt_base"], dtype=float).reshape(-1, 1)
data_dt_cross_validated = np.array(
    scores["dt_cross_validated"], dtype=float
).reshape(-1, 1)

# One "mean std" line per classifier, in this order: Gaussian NB,
# Bernoulli NB, Multinomial NB, SVM, default decision tree, tuned decision tree.
for results in (
    data_gauss,
    data_bernoulli,
    data_multinomial,
    data_svm,
    data_dt_base,
    data_dt_cross_validated,
):
    print(np.mean(results), np.std(results))