In [141]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [142]:
training = pd.read_csv("clean_train.csv")
testing = pd.read_csv("clean_test.csv")
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X_train = training[features]
y_train = training["Survived"]
X_test = testing[features]
training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamSize,IsAlone,Adult,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,2,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,2,0,1,2
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,1,1,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,2,0,1,2
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,1,1,1,0


In [143]:
X_training, X_testing, y_training, y_testing = train_test_split(X_train, y_train, random_state=0)

### SVM Model

In [144]:
svc = SVC()
svc.fit(X_training, y_training)
y_pred = svc.predict(X_testing)
acc_svc = accuracy_score(y_testing, y_pred)

print(acc_svc)

0.7174887892376681


### LinearSVC Model

In [145]:
linsvc = LinearSVC()
linsvc.fit(X_training, y_training)
y_pred = linsvc.predict(X_testing)
acc_linsvc = accuracy_score(y_testing, y_pred)

print(acc_linsvc)

0.7713004484304933


### RandomForest Model

In [146]:
rfc = RandomForestClassifier()
rfc.fit(X_training, y_training)
y_pred = rfc.predict(X_testing)
acc_rfc = accuracy_score(y_testing, y_pred)

print(acc_rfc)

0.7982062780269058


### LogisticRegression Model

In [147]:
logreg = LogisticRegression()
logreg.fit(X_training, y_training)
y_pred = logreg.predict(X_testing)
acc_logreg = accuracy_score(y_testing, y_pred)

print(acc_logreg)

0.7892376681614349


### KNeighbors Model

In [148]:
knn = KNeighborsClassifier()
knn.fit(X_training, y_training)
y_pred = knn.predict(X_testing)
acc_knn = accuracy_score(y_testing, y_pred)

print(acc_knn)

0.726457399103139


### GaussianNB Model

In [149]:
gnb = GaussianNB()
gnb.fit(X_training, y_training)
y_pred = gnb.predict(X_testing)
acc_gnb = accuracy_score(y_testing, y_pred)

print(acc_gnb)

0.7802690582959642


### DecisionTree Model

In [150]:
dt = DecisionTreeClassifier()
dt.fit(X_training, y_training)
y_pred = dt.predict(X_testing)
acc_dt = accuracy_score(y_testing, y_pred)

print(acc_dt)

0.7488789237668162


### XGBoost Model

In [151]:
xgb = XGBClassifier(objective="binary:logistic", n_estimators=10, seed=123)
xgb.fit(X_training, y_training)
y_pred = xgb.predict(X_testing)
acc_xgb = accuracy_score(y_testing, y_pred)

print(acc_xgb)

0.820627802690583


### GradientBoosting Model

In [152]:
gbc = GradientBoostingClassifier(n_estimators=10)
gbc.fit(X_training, y_training)
y_pred = gbc.predict(X_testing)
acc_gbc = accuracy_score(y_testing, y_pred)

print(acc_gbc)

0.8251121076233184


### NeuralNetwork Model

In [153]:
nnc = MLPClassifier(hidden_layer_sizes=[10, 10])
nnc.fit(X_training, y_training)
y_pred = nnc.predict(X_testing)
acc_nnc = accuracy_score(y_testing, y_pred)

print(acc_nnc)

0.7533632286995515


In [154]:
model_performance = pd.DataFrame({"Model":["SVC", "Linear SVC", "Random Forest", 
              "Logistic Regression", "K Nearest Neighbors", "Gaussian Naive Bayes",  
              "Decision Tree", "XGBClassifier", "Gradient Boosting", "Neural Networks"],
                                 "Accuracy":[acc_svc, acc_linsvc, acc_rfc, 
              acc_logreg, acc_knn, acc_gnb, acc_dt, acc_xgb, acc_gbc, acc_nnc]})

model_performance.sort_values(by="Accuracy", ascending=False)

Unnamed: 0,Model,Accuracy
8,Gradient Boosting,0.825112
7,XGBClassifier,0.820628
2,Random Forest,0.798206
3,Logistic Regression,0.789238
5,Gaussian Naive Bayes,0.780269
1,Linear SVC,0.7713
9,Neural Networks,0.753363
6,Decision Tree,0.748879
4,K Nearest Neighbors,0.726457
0,SVC,0.717489


In [136]:
gbc = GradientBoostingClassifier()

parameters = {"learning_rate": [0.01, 0.03, 0.1, 0.3],
            "n_estimators": [5, 10, 15, 20, 30],
            "max_depth": [1, 2, 3, 4],
            "min_samples_split": [2, 3, 5, 10],
            "max_features": [1, 2, 3, 5, None],
            "max_leaf_nodes": [2, 3, 5, None],
            }

grid_cv = GridSearchCV(gbc, parameters, scoring=make_scorer(accuracy_score))
grid_cv.fit(X_train, y_train)

grid_cv.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.3, loss='deviance', max_depth=4,
              max_features=3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=5, presort='auto',
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [139]:
gbc = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.3, loss='deviance', max_depth=4,
              max_features=3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=5, presort='auto',
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": testing["PassengerId"],
    "Survived": y_pred
})

submission.to_csv("submission.csv", index=False)
print(submission.shape)
print(submission.head())

(418, 2)
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0


In [None]:
rf_clf = RandomForestClassifier()

parameters = {"n_estimators": [4, 5, 6, 7, 8, 9, 10, 15], 
              "criterion": ["gini", "entropy"],
              "max_features": ["auto", "sqrt", "log2"], 
              "max_depth": [2, 3, 5, 10], 
              "min_samples_split": [2, 3, 5, 10],
              "min_samples_leaf": [1, 5, 8, 10]
             }

grid_cv = GridSearchCV(rf_clf, parameters, scoring = make_scorer(accuracy_score))
grid_cv = grid_cv.fit(X_train, y_train)

grid_cv.best_estimator_

In [140]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": testing["PassengerId"],
    "Survived": y_pred
})

submission.to_csv("submission.csv", index=False)
print(submission.shape)
print(submission.head())

(418, 2)
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         0
