In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
from sklearn.externals import joblib



In [2]:
df = pd.read_csv('csvs/train_after_eda.csv')

In [3]:
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,CabinAlpha,HaveCabin
827,1,2,male,1.0,0,2,37.0042,C,Master,,0
514,0,3,male,24.0,0,0,7.4958,S,Mr,,0
64,0,1,male,32.0,0,0,27.7208,C,Mr,,0
534,0,3,female,30.0,0,0,8.6625,S,Miss,,0
661,0,3,male,40.0,0,0,7.225,C,Mr,,0


In [4]:
dummy = pd.get_dummies(df['Sex'])
df = pd.concat([df, dummy], axis = 1)
dummy = pd.get_dummies(df['Embarked'])
df = pd.concat([df, dummy], axis = 1)
dummy = pd.get_dummies(df['Title'])
df = pd.concat([df, dummy], axis = 1)
df = df.drop(['Sex', 'Embarked', 'Title', 'CabinAlpha'], axis = 1)
df.head()





Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,HaveCabin,female,male,C,...,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
0,0,3,22.0,1,0,7.25,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,38.0,1,0,71.2833,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1,3,26.0,0,0,7.925,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,1,35.0,1,0,53.1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,3,35.0,0,0,8.05,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
train, test = train_test_split(df, test_size = 0.3)

In [7]:
x_train = train.drop('Survived', axis = 1)
x_test = test.drop('Survived', axis = 1)
y_train = train['Survived']
y_test = test['Survived']

In [8]:
logregmodel = LogisticRegression()

In [9]:
logregmodel.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
y_predict_lgm = logregmodel.predict(x_test)

In [42]:
x_test.columns
x_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,HaveCabin,female,male,C,Q,...,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
264,3,22.0,0,0,7.75,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
669,1,36.0,1,0,52.0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
188,3,40.0,1,1,15.5,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
473,2,23.0,0,0,13.7917,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
651,2,18.0,0,1,23.0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [11]:
metrics.accuracy_score(y_predict_lgm, y_test)


0.8432835820895522

In [12]:
randomforestmodel = RandomForestClassifier()

In [13]:
randomforestmodel.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
y_predict_rfm = randomforestmodel.predict(x_test)

In [15]:
metrics.accuracy_score(y_test, y_predict_rfm)

0.7985074626865671

In [16]:
gnbmodel = GaussianNB()

In [17]:
gnbmodel.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [18]:
y_predict_gnb = gnbmodel.predict(x_test)

In [19]:
metrics.accuracy_score(y_test, y_predict_gnb)

0.6791044776119403

In [20]:
svcmodel = SVC() 

In [21]:
svcmodel.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [22]:
y_predict_svc = svcmodel.predict(x_test)

In [23]:
metrics.accuracy_score(y_test, y_predict_svc)

0.7574626865671642

In [24]:
knnmodel = KNeighborsClassifier()

In [25]:
knnmodel.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [26]:
y_predict_knn = knnmodel.predict(x_test)

In [27]:
metrics.accuracy_score(y_test, y_predict_knn)

0.7649253731343284

In [28]:
decisiontreemodel = DecisionTreeClassifier()

In [29]:
decisiontreemodel.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [30]:
y_predict_dt = decisiontreemodel.predict(x_test)

In [31]:
metrics.accuracy_score(y_test, y_predict_dt)

0.7873134328358209

In [32]:
type(y_predict_dt)

numpy.ndarray

In [33]:
y_pred = np.stack((y_predict_dt, y_predict_gnb, y_predict_knn, y_predict_lgm, y_predict_rfm, y_predict_svc), axis = 1)

In [34]:
y_pred.shape

(268, 6)

In [35]:
df1 = pd.DataFrame(y_pred, columns = ['dt', 'gnb', 'knn', 'lgm', 'rfm', 'svc'])

In [36]:
df1

Unnamed: 0,dt,gnb,knn,lgm,rfm,svc
0,1,0,1,1,1,1
1,1,1,1,1,1,1
2,0,0,0,0,0,0
3,1,1,0,1,1,1
4,1,0,1,1,1,1
5,1,0,1,1,1,1
6,0,0,0,0,0,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [37]:
model = LogisticRegression()

In [38]:
model.fit(df1, y_test)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [39]:
y_ensemble = model.predict(y_pred)

In [40]:
metrics.accuracy_score(y_ensemble, y_test)

0.8208955223880597