In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn import tree
from keras import Model
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from keras.layers import Dense , Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        print(os.path.join(folder, entry))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preprocessing

In [1]:
# Load the training CSV into `train` and preview its first rows.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
train.head()

In [1]:
train.shape

In [1]:
train.info()

In [1]:
train.isnull().sum()

In [1]:
train.describe()

In [1]:
train.head()

In [1]:
# Impute missing ages with the column mean (rounded to 2 decimals).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute accessor: that chained in-place form
# warns on recent pandas and leaves `train` unchanged under copy-on-write.
train['Age'] = train['Age'].fillna(round(train['Age'].mean(), 2))

In [1]:
# Encode the 'Sex' column as integer labels in a new 'SexLabel' column.
labelencoder = LabelEncoder()
labelencoder.fit(train['Sex'])
train['SexLabel'] = labelencoder.transform(train['Sex'])
train.head()

In [1]:
# Record the fare range before scaling; scale_fare() reads these two globals.
max_value, min_value = train['Fare'].max(), train['Fare'].min()
print(max_value, min_value, sep='\n')
def scale_fare(x):
    """Return the reversed min-max scaling of ``x``, rounded to 3 decimals.

    Reads the module-level ``max_value`` and ``min_value``. The result is
    ``(max_value - x) / (max_value - min_value)``, so ``max_value`` maps to
    0 and ``min_value`` maps to 1.
    """
    fare_span = max_value - min_value
    return round((max_value - x) / fare_span, 3)

# Overwrite Fare with its scaled value. Not idempotent: max_value/min_value were
# read from the unscaled column, so re-running this line rescales scaled data.
train.Fare = train.Fare.apply(scale_fare)

In [1]:
train.head()

In [1]:
# One-hot encode 'Survived' and join the result onto `train`. The new frame is
# built without column names, so the joined columns get the integer labels
# pandas assigns by position.
enc = OneHotEncoder(handle_unknown='ignore')
survived_onehot = enc.fit_transform(train[['Survived']]).toarray()
enc_df = pd.DataFrame(survived_onehot)
train = train.join(enc_df)
train.head()

In [1]:
# train.rename(columns={0:'Survived0',1:'Survived1'},inplace = True)
# train.head()

In [1]:
# Lower-triangle correlation heatmap of the numeric columns.
# Numeric columns are selected explicitly: `train` also holds non-numeric
# columns (e.g. 'Sex', which had to be label-encoded), and recent pandas raises
# on them in corr() instead of silently dropping them.
corr = train.select_dtypes(include='number').corr()
# Builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .5},
            ax=ax)

In [1]:
# Target column passed to the scikit-learn estimators below.
trainy = train['Survived']
trainy.head()

In [1]:
# Two-column target built from the columns labelled 0 and 1 (the joined
# one-hot encoding of 'Survived'); passed to the Keras models below.
trainy2 = train.loc[:, [0, 1]]

In [1]:
# Everything except 'Survived'. The one-hot target columns 0 and 1 are still
# present here, so later cells select feature columns explicitly.
trainx = train.drop(columns='Survived')
trainx.head()

In [1]:
# Hold out 15% of the labelled rows, with a fixed seed, for local evaluation.
model_features = trainx[['Pclass','SexLabel','Fare','Age','SibSp','Parch']]
X_train, X_test, y_train, y_test = train_test_split(
    model_features, trainy, test_size=0.15, random_state=0)

In [1]:
# Load the test CSV into `test` and preview its first rows.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test.head()

In [1]:
test.isnull().sum()

In [1]:
# Impute missing Age and Fare values in the test split with their column means.
# Results are assigned back to the columns instead of calling
# fillna(inplace=True) on the attribute accessor: that chained in-place form
# warns on recent pandas and leaves `test` unchanged under copy-on-write.
test['Age'] = test['Age'].fillna(round(test['Age'].mean(), 2))
test['Fare'] = test['Fare'].fillna(round(test['Fare'].mean(), 4))

In [1]:
# Scale the test fares with the SAME range used for the training fares.
# `max_value`, `min_value` and `scale_fare` are the ones defined in the training
# preprocessing cell. Recomputing the range from the test split (and redefining
# the function) would put this feature on a different scale from the one the
# models are fitted on.
print(test.Fare.max())
print(test.Fare.min())
test.Fare = test.Fare.apply(scale_fare)

In [1]:
# Reuse the encoder already fitted on the training 'Sex' column so both splits
# share one label mapping. A fresh fit on the test split derives its mapping
# from the test values only, and transform() fails loudly on unseen values.
test['SexLabel'] = labelencoder.transform(test['Sex'])

In [1]:
test.head()

# Logistic Regression

In [1]:
# Logistic-regression search space: each solver, with and without class
# re-weighting.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
weight = ['balanced', None]
parameters = dict(solver=solver, class_weight=weight)
LR = LogisticRegression()

In [1]:
# Cross-validated grid search over all six features.
clf = GridSearchCV(estimator=LR, param_grid=parameters)
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age','SibSp','Parch']], trainy)

In [1]:
# Tabulate the cross-validation results, one row per parameter combination.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
best_parameters.head()

In [1]:
# Row label of the first combination ranked 1, then its parameter dict.
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fresh estimator and grid search for the next feature subset.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
weight = ['balanced', None]
parameters = dict(solver=solver, class_weight=weight)
LR = LogisticRegression()

clf = GridSearchCV(estimator=LR, param_grid=parameters)

In [1]:
# Search without 'SibSp' and show the top-ranked parameter set.
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age','Parch']], trainy)
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fresh estimator and grid search for the next feature subset.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
weight = ['balanced', None]
parameters = dict(solver=solver, class_weight=weight)
LR = LogisticRegression()

clf = GridSearchCV(estimator=LR, param_grid=parameters)

In [1]:
# Search without 'Parch' and show the top-ranked parameter set.
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age','SibSp']], trainy)
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fresh estimator and grid search for the next feature subset.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
weight = ['balanced', None]
parameters = dict(solver=solver, class_weight=weight)
LR = LogisticRegression()

clf = GridSearchCV(estimator=LR, param_grid=parameters)

In [1]:
# Search without 'SibSp' and 'Parch' and show the top-ranked parameter set.
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age']], trainy)
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fit logistic regression on the hold-out training split (all six features).
# NOTE(review): solver/class_weight are hard-coded, presumably copied from the
# grid-search output above rather than read from clf.best_params_ - confirm.
LR = LogisticRegression(solver='sag',class_weight=None)
LR.fit(X_train, y_train)

In [1]:
# Hold-out predictions as a single-column frame with a fresh 0..n-1 index.
y_pred = pd.DataFrame({'y_pred': LR.predict(X_test)})
y_pred.head(5)

In [1]:
# Store each prediction's original row label (taken from X_test) in an 'index'
# column. One vectorised assignment replaces the element-wise
# `y_pred['index'][counter] = ...` loop, which is chained indexing: it raises
# SettingWithCopyWarning and writes nothing under copy-on-write.
y_pred['index'] = X_test.index.values

In [1]:
# Promote the stored row labels to the frame's index.
y_pred = y_pred.set_index(keys='index')
y_pred.head(5)

In [1]:
# Wrap the hold-out target returned by train_test_split in a one-column frame
# so it can be merged with the predictions on the row index.
y_test = pd.DataFrame(y_test,columns=['Survived'])
y_test.head(5)

In [1]:
# Join truth and prediction side by side on the shared row index.
common = pd.merge(y_test, y_pred, left_index=True, right_index=True)
common.head(5)

In [1]:
# Confusion matrix, transposed so columns are true labels and rows are
# predicted labels.
mat = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
cm_ax.set_xlabel('true label')
cm_ax.set_ylabel('predicted label')

In [1]:
# Print the scikit-learn classification report for the hold-out predictions.
print(classification_report(y_test, y_pred))

In [1]:
# Count rows where prediction and truth differ, then report accuracy.
count_misclassified = common['Survived'].ne(common['y_pred']).sum()
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

In [1]:
# Refit the same configuration on every labelled row before predicting `test`.
LR = LogisticRegression(solver='sag',class_weight=None)
LR.fit(trainx[['Pclass','SexLabel','Fare','Age','SibSp','Parch']],trainy)

In [1]:
# Predict the test split and hold the labels in a 'Survived' column.
test_features = test[['Pclass','SexLabel','Fare','Age','SibSp','Parch']]
y_pred = pd.DataFrame({'Survived': LR.predict(test_features)})
y_pred.head(5)

In [1]:
# Attach the passenger ids; both frames share the default 0..n-1 index.
y_pred = y_pred.assign(PassengerId=test['PassengerId'])
y_pred.head(5)

In [1]:
len(y_pred)

In [1]:
# Put the id column first, then write the submission file without the index.
y_pred = y_pred.loc[:, ['PassengerId', 'Survived']]
y_pred.to_csv('Submission_Logestic.csv', index=False)

# SVM

In [1]:
# SVM search space: each kernel, with and without class re-weighting.
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
weight = ['balanced', None]
parameters = dict(kernel=kernel, class_weight=weight)
svm = SVC()

In [1]:
# Cross-validated grid search over all six features.
clf = GridSearchCV(estimator=svm, param_grid=parameters)
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age','SibSp','Parch']], trainy)

In [1]:
# Tabulate the cross-validation results, one row per parameter combination.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
best_parameters.head()

In [1]:
# Row label of the first combination ranked 1, then its parameter dict.
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# svm = SVC()
# kernel = ['linear','poly','rbf','sigmoid']
# weight = ['balanced', None]
# parameters = {'kernel':kernel,
#               'class_weight':weight}

# clf = GridSearchCV(svm, parameters)

In [1]:
# clf.fit(trainx[['Pclass','SexLabel','Fare','Age','Parch']],trainy)
# best_parameters = pd.DataFrame(clf.cv_results_)
# indexs = best_parameters[best_parameters['rank_test_score'] == 1].index[0]
# best_parameters.loc[indexs,'params']

In [1]:
# best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Re-run the existing SVM grid search (`clf` from the cell above) on four
# features and show the top-ranked parameter set.
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age']], trainy)
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fit the SVM on the hold-out training split (all six features).
# NOTE(review): kernel/class_weight are hard-coded, presumably copied from the
# grid-search output above rather than read from clf.best_params_ - confirm.
svm = SVC(kernel = 'linear' ,class_weight ='balanced')
svm.fit(X_train, y_train)

In [1]:
# Hold-out predictions as a single-column frame with a fresh 0..n-1 index.
y_pred = pd.DataFrame({'y_pred': svm.predict(X_test)})
y_pred.head(5)

In [1]:
# Store each prediction's original row label (taken from X_test) in an 'index'
# column. One vectorised assignment replaces the element-wise
# `y_pred['index'][counter] = ...` loop, which is chained indexing: it raises
# SettingWithCopyWarning and writes nothing under copy-on-write.
y_pred['index'] = X_test.index.values

In [1]:
# Promote the stored row labels to the frame's index.
y_pred = y_pred.set_index(keys='index')
y_pred.head(5)

In [1]:
# Rewrap y_test as a one-column 'Survived' frame. It was already converted to a
# DataFrame in the logistic-regression section, so on a top-to-bottom run this
# only re-selects that column.
y_test = pd.DataFrame(y_test,columns=['Survived'])
y_test.head(5)

In [1]:
# Join truth and prediction side by side on the shared row index.
common = pd.merge(y_test, y_pred, left_index=True, right_index=True)
common.head(5)

In [1]:
# Confusion matrix, transposed so columns are true labels and rows are
# predicted labels.
mat = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
cm_ax.set_xlabel('true label')
cm_ax.set_ylabel('predicted label')

In [1]:
# Print the scikit-learn classification report for the hold-out predictions.
print(classification_report(y_test, y_pred))

In [1]:
# Count rows where prediction and truth differ, then report accuracy.
count_misclassified = common['Survived'].ne(common['y_pred']).sum()
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

In [1]:
# Refit the same configuration on every labelled row before predicting `test`.
svm = SVC(kernel = 'linear' ,class_weight ='balanced')
svm.fit(trainx[['Pclass','SexLabel','Fare','Age','SibSp','Parch']],trainy)

In [1]:
# Predict the test split and hold the labels in a 'Survived' column.
test_features = test[['Pclass','SexLabel','Fare','Age','SibSp','Parch']]
y_pred = pd.DataFrame({'Survived': svm.predict(test_features)})
y_pred.head(5)

In [1]:
# Attach the passenger ids; both frames share the default 0..n-1 index.
y_pred = y_pred.assign(PassengerId=test['PassengerId'])
y_pred.head(5)

In [1]:
# Put the id column first, then write the submission file without the index.
y_pred = y_pred.loc[:, ['PassengerId', 'Survived']]
y_pred.to_csv('Submission_bestSVM_gridsearch.csv', index=False)

# Decision Tree

In [1]:
# Decision-tree search space: split criterion, depth limit, minimum samples to
# split, and class re-weighting.
criterion = ['gini', 'entropy']
max_depth = [3, 4, 5, 6, 7, 8, 9]
min_sample = [10, 15, 20, 25, 40, 50, 60, 70, 100]
weight = ['balanced', None]
parameters = dict(criterion=criterion,
                  max_depth=max_depth,
                  min_samples_split=min_sample,
                  class_weight=weight)
DT = tree.DecisionTreeClassifier()

In [1]:
# Cross-validated grid search over all six features.
clf = GridSearchCV(estimator=DT, param_grid=parameters)
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age','SibSp','Parch']], trainy)

In [1]:
# Tabulate the cross-validation results, one row per parameter combination.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
best_parameters.head()

In [1]:
# Row label of the first combination ranked 1, then its parameter dict.
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fresh tree and grid search for the next feature subset.
criterion = ['gini', 'entropy']
max_depth = [3, 4, 5, 6, 7, 8, 9]
min_sample = [10, 15, 20, 25, 40, 50, 60, 70, 100]
weight = ['balanced', None]
parameters = dict(criterion=criterion,
                  max_depth=max_depth,
                  min_samples_split=min_sample,
                  class_weight=weight)
DT = tree.DecisionTreeClassifier()

clf = GridSearchCV(estimator=DT, param_grid=parameters)

In [1]:
# Search without 'SibSp' and show the top-ranked parameter set.
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age','Parch']], trainy)
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fresh tree and grid search for the next feature subset.
criterion = ['gini', 'entropy']
max_depth = [3, 4, 5, 6, 7, 8, 9]
min_sample = [10, 15, 20, 25, 40, 50, 60, 70, 100]
weight = ['balanced', None]
parameters = dict(criterion=criterion,
                  max_depth=max_depth,
                  min_samples_split=min_sample,
                  class_weight=weight)
DT = tree.DecisionTreeClassifier()

clf = GridSearchCV(estimator=DT, param_grid=parameters)

In [1]:
# Search without 'SibSp' and 'Parch' and show the top-ranked parameter set.
clf.fit(trainx.loc[:, ['Pclass','SexLabel','Fare','Age']], trainy)
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Fit the decision tree on the hold-out training split (all six features).
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from the
# grid-search output above rather than read from clf.best_params_ - confirm.
DT = tree.DecisionTreeClassifier(criterion= 'entropy',max_depth = 8 ,min_samples_split = 15,class_weight =None )
DT.fit(X_train, y_train)

In [1]:
tree.plot_tree(DT) 

In [1]:
# Hold-out predictions as a single-column frame with a fresh 0..n-1 index.
y_pred = pd.DataFrame({'y_pred': DT.predict(X_test)})
y_pred.head(5)

In [1]:
# Store each prediction's original row label (taken from X_test) in an 'index'
# column. One vectorised assignment replaces the element-wise
# `y_pred['index'][counter] = ...` loop, which is chained indexing: it raises
# SettingWithCopyWarning and writes nothing under copy-on-write.
y_pred['index'] = X_test.index.values

In [1]:
# Promote the stored row labels to the frame's index.
y_pred = y_pred.set_index(keys='index')
y_pred.head(5)

In [1]:
# Rewrap y_test as a one-column 'Survived' frame. It was already converted to a
# DataFrame in the earlier model sections, so on a top-to-bottom run this only
# re-selects that column.
y_test = pd.DataFrame(y_test,columns=['Survived'])
y_test.head(5)

In [1]:
# Join truth and prediction side by side on the shared row index.
common = pd.merge(y_test, y_pred, left_index=True, right_index=True)
common.head(5)

In [1]:
# Confusion matrix, transposed so columns are true labels and rows are
# predicted labels.
mat = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
cm_ax.set_xlabel('true label')
cm_ax.set_ylabel('predicted label')

In [1]:
# Print the scikit-learn classification report for the hold-out predictions.
print(classification_report(y_test, y_pred))

In [1]:
# Count rows where prediction and truth differ, then report accuracy.
count_misclassified = common['Survived'].ne(common['y_pred']).sum()
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

In [1]:
# Refit the same configuration on every labelled row before predicting `test`.
DT = tree.DecisionTreeClassifier(criterion= 'entropy',max_depth = 8 ,min_samples_split = 15,class_weight =None )
DT.fit(trainx[['Pclass','SexLabel','Fare','Age','SibSp','Parch']],trainy)

In [1]:
# Predict the test split and hold the labels in a 'Survived' column.
test_features = test[['Pclass','SexLabel','Fare','Age','SibSp','Parch']]
y_pred = pd.DataFrame({'Survived': DT.predict(test_features)})
y_pred.head(5)

In [1]:
# Attach the passenger ids; both frames share the default 0..n-1 index.
y_pred = y_pred.assign(PassengerId=test['PassengerId'])
y_pred.head(5)

In [1]:
# Put the id column first, then write the submission file without the index.
y_pred = y_pred.loc[:, ['PassengerId', 'Survived']]
y_pred.to_csv('Submission_bestDT_gridsearch.csv', index=False)

# Deep Learning Models

In [1]:
def create_model(nlayer1 = 20 , active1 = 'relu' ,
                 nlayer2 = 10 ,active2 = 'relu',
                 nlayer3 = 4 ,active3 = 'relu',
                 input_dim = 6):
    """Build and compile a three-hidden-layer dense classifier.

    Parameters
    ----------
    nlayer1, nlayer2, nlayer3 : int
        Number of units in the first, second and third hidden Dense layers.
    active1, active2, active3 : str
        Activation name for the corresponding hidden layer.
    input_dim : int, default 6
        Number of input features. It was a hard-coded constant before; the
        default keeps the previous behaviour, and other feature subsets can
        now reuse this builder instead of redefining it.

    Returns
    -------
    A compiled ``Sequential`` model with a 2-unit softmax output,
    'binary_crossentropy' loss, the 'adam' optimizer and an accuracy metric.
    """
    model = Sequential()
    model.add(Dense(nlayer1, input_dim=input_dim, activation=active1))
    # Dropout is applied after the first hidden layer only.
    model.add(Dropout(0.2))
    model.add(Dense(nlayer2, activation=active2))
    model.add(Dense(nlayer3, activation=active3))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



# scikit-learn wrapper around create_model; a fifth of the rows passed to each
# fit is held back as validation data.
model = KerasClassifier(build_fn=create_model, verbose=0, validation_split=0.2)

# Candidate values for the search.
neurons1 = [20, 10]
neurons2 = [15, 10]
neurons3 = [4]
nepochs = [30]
nbatch_size = [5, 15, 50]
active_func = ['relu', 'linear', 'tanh']

param_grid = dict(
    epochs=nepochs, batch_size=nbatch_size,
    nlayer1=neurons1, nlayer2=neurons2, nlayer3=neurons3,
    active1=active_func, active2=active_func, active3=active_func,
)

# Early stopping on validation accuracy, forwarded to every fit call.
stopper = EarlyStopping(monitor='val_accuracy', patience=8, verbose=1)
fit_params = {'callbacks': [stopper]}

In [1]:
# Grid search over the Keras wrapper with the two-column target `trainy2`.
# The grid defined above expands to 3 * 2 * 2 * 1 * 1 * 3**3 = 324 parameter
# combinations, each cross-validated, so this cell is expensive to re-run.
clf = GridSearchCV(estimator=model, param_grid=param_grid)
clf.fit(trainx[['Pclass','SexLabel','Fare','Age','SibSp','Parch']],trainy2,**fit_params)

In [1]:
# Tabulate the cross-validation results, one row per parameter combination.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
best_parameters.head()

In [1]:
# Row label of the first combination ranked 1, then its parameter dict.
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
# Hyper-parameters used to build and train the final network below.
# NOTE(review): these are hard-coded, presumably copied from the grid-search
# output above rather than read from clf.best_params_ - confirm.
active1= 'relu'
active2= 'linear'
active3= 'linear'
batch_size = 5
epochs= 30
nlayer1= 20
nlayer2= 10
nlayer3= 4

In [1]:
# Final network: same layer stack as create_model, built from the
# hyper-parameters set in the previous cell and trained on all labelled rows.
model = Sequential([
    Dense(nlayer1, input_dim=6, activation=active1),
    Dropout(0.2),
    Dense(nlayer2, activation=active2),
    Dense(nlayer3, activation=active3),
    Dense(2, activation='softmax'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop early once validation accuracy has not improved for 8 epochs.
stopper = EarlyStopping(monitor='val_accuracy', patience=8, verbose=1)
nn_inputs = trainx[['Pclass','SexLabel','Fare','Age','SibSp','Parch']]
model.fit(nn_inputs, trainy2,
          batch_size=batch_size,
          epochs=epochs,
          verbose=0,
          validation_split=0.2,
          callbacks=[stopper])

In [1]:
# Predict the two softmax outputs for the test split. The predicted label is
# the column (0 or 1) holding the larger value in each row.
test_features = test[['Pclass','SexLabel','Fare','Age','SibSp','Parch']]
pred = pd.DataFrame(model.predict(test_features))
pred['Survived'] = pred.idxmax(axis=1)
pred['PassengerId'] = test['PassengerId']
result = pred.loc[:, ['PassengerId', 'Survived']]
result.head()

In [1]:
# Write the neural-network submission file without the index column.
result.to_csv('submission_best_nn3.csv',index = False)

In [1]:
def create_model(nlayer1 = 20 , active1 = 'relu' ,
                 nlayer2 = 10 ,active2 = 'relu',
                 nlayer3 = 4 ,active3 = 'relu',
                 input_dim = 5):
    """Build and compile a three-hidden-layer dense classifier.

    Parameters
    ----------
    nlayer1, nlayer2, nlayer3 : int
        Number of units in the first, second and third hidden Dense layers.
    active1, active2, active3 : str
        Activation name for the corresponding hidden layer.
    input_dim : int, default 5
        Number of input features. It was a hard-coded constant before; the
        default keeps the previous behaviour, and other feature subsets can
        now reuse this builder instead of redefining it.

    Returns
    -------
    A compiled ``Sequential`` model with a 2-unit softmax output,
    'binary_crossentropy' loss, the 'adam' optimizer and an accuracy metric.
    """
    model = Sequential()
    model.add(Dense(nlayer1, input_dim=input_dim, activation=active1))
    # Dropout is applied after the first hidden layer only.
    model.add(Dropout(0.2))
    model.add(Dense(nlayer2, activation=active2))
    model.add(Dense(nlayer3, activation=active3))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



# scikit-learn wrapper around the create_model defined in this cell.
model = KerasClassifier(build_fn=create_model, verbose=0, validation_split=0.2)

# Candidate values for the search.
neurons1 = [20, 10]
neurons2 = [15, 10]
neurons3 = [4]
nepochs = [30]
nbatch_size = [5, 15, 50]
active_func = ['relu', 'linear', 'tanh']

param_grid = dict(
    epochs=nepochs, batch_size=nbatch_size,
    nlayer1=neurons1, nlayer2=neurons2, nlayer3=neurons3,
    active1=active_func, active2=active_func, active3=active_func,
)

# Early stopping on validation accuracy, forwarded to every fit call.
stopper = EarlyStopping(monitor='val_accuracy', patience=8, verbose=1)
fit_params = {'callbacks': [stopper]}

In [1]:
# Grid search on five features (no 'SibSp'). This matches the input_dim=5 that
# the create_model defined just above is built with.
clf = GridSearchCV(estimator=model, param_grid=param_grid)
clf.fit(trainx[['Pclass','SexLabel','Fare','Age','Parch']],trainy2,**fit_params)

In [1]:
# Tabulate the results and show the first top-ranked parameter set.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
def create_model(nlayer1 = 20 , active1 = 'relu' ,
                 nlayer2 = 10 ,active2 = 'relu',
                 nlayer3 = 4 ,active3 = 'relu',
                 input_dim = 4):
    """Build and compile a three-hidden-layer dense classifier.

    Parameters
    ----------
    nlayer1, nlayer2, nlayer3 : int
        Number of units in the first, second and third hidden Dense layers.
    active1, active2, active3 : str
        Activation name for the corresponding hidden layer.
    input_dim : int, default 4
        Number of input features. It was a hard-coded constant before; the
        default keeps the previous behaviour, and other feature subsets can
        now reuse this builder instead of redefining it.

    Returns
    -------
    A compiled ``Sequential`` model with a 2-unit softmax output,
    'binary_crossentropy' loss, the 'adam' optimizer and an accuracy metric.
    """
    model = Sequential()
    model.add(Dense(nlayer1, input_dim=input_dim, activation=active1))
    # Dropout is applied after the first hidden layer only.
    model.add(Dropout(0.2))
    model.add(Dense(nlayer2, activation=active2))
    model.add(Dense(nlayer3, activation=active3))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



# scikit-learn wrapper around the create_model defined in this cell.
model = KerasClassifier(build_fn=create_model, verbose=0, validation_split=0.2)

# Candidate values for the search.
neurons1 = [20, 10]
neurons2 = [15, 10]
neurons3 = [4]
nepochs = [30]
nbatch_size = [5, 15, 50]
active_func = ['relu', 'linear', 'tanh']

param_grid = dict(
    epochs=nepochs, batch_size=nbatch_size,
    nlayer1=neurons1, nlayer2=neurons2, nlayer3=neurons3,
    active1=active_func, active2=active_func, active3=active_func,
)

# Early stopping on validation accuracy, forwarded to every fit call.
stopper = EarlyStopping(monitor='val_accuracy', patience=8, verbose=1)
fit_params = {'callbacks': [stopper]}

In [1]:
# Grid search on four features (no 'SibSp' or 'Parch'). This matches the
# input_dim=4 that the create_model defined just above is built with.
clf = GridSearchCV(estimator=model, param_grid=param_grid)
clf.fit(trainx[['Pclass','SexLabel','Fare','Age']],trainy2,**fit_params)

In [1]:
# Tabulate the results and show the first top-ranked parameter set.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']

In [1]:
def create_model(nlayer1 = 20 , active1 = 'relu' ,
                 nlayer2 = 10 ,active2 = 'relu',
                 nlayer3 = 4 ,active3 = 'relu',
                 input_dim = 5):
    """Build and compile a three-hidden-layer dense classifier.

    Parameters
    ----------
    nlayer1, nlayer2, nlayer3 : int
        Number of units in the first, second and third hidden Dense layers.
    active1, active2, active3 : str
        Activation name for the corresponding hidden layer.
    input_dim : int, default 5
        Number of input features. It was a hard-coded constant before; the
        default keeps the previous behaviour, and other feature subsets can
        now reuse this builder instead of redefining it.

    Returns
    -------
    A compiled ``Sequential`` model with a 2-unit softmax output,
    'binary_crossentropy' loss, the 'adam' optimizer and an accuracy metric.
    """
    model = Sequential()
    model.add(Dense(nlayer1, input_dim=input_dim, activation=active1))
    # Dropout is applied after the first hidden layer only.
    model.add(Dropout(0.2))
    model.add(Dense(nlayer2, activation=active2))
    model.add(Dense(nlayer3, activation=active3))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



# scikit-learn wrapper around the create_model defined in this cell.
model = KerasClassifier(build_fn=create_model, verbose=0, validation_split=0.2)

# Candidate values for the search. This run tries only 'relu' and 'linear'
# activations.
neurons1 = [20, 10]
neurons2 = [15, 10]
neurons3 = [4]
nepochs = [30]
nbatch_size = [5, 15, 50]
active_func = ['relu', 'linear']

param_grid = dict(
    epochs=nepochs, batch_size=nbatch_size,
    nlayer1=neurons1, nlayer2=neurons2, nlayer3=neurons3,
    active1=active_func, active2=active_func, active3=active_func,
)

# Early stopping on validation accuracy, forwarded to every fit call.
stopper = EarlyStopping(monitor='val_accuracy', patience=8, verbose=1)
fit_params = {'callbacks': [stopper]}

In [1]:
# Grid search on five features (no 'Parch'). This matches the input_dim=5 that
# the create_model defined just above is built with.
clf = GridSearchCV(estimator=model, param_grid=param_grid)
clf.fit(trainx[['Pclass','SexLabel','Fare','Age','SibSp']],trainy2,**fit_params)

In [1]:
# Tabulate the results and show the first top-ranked parameter set.
best_parameters = pd.DataFrame.from_dict(clf.cv_results_)
top_ranked = best_parameters['rank_test_score'] == 1
indexs = best_parameters.loc[top_ranked].index[0]
best_parameters.loc[indexs, 'params']

In [1]:
best_parameters.loc[indexs,'mean_test_score']