### Import dependencies, read files and show dataframes

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [6]:
# Show train and test datasets
train_df = pd.read_csv('train.csv', encoding='utf-8')
test_df = pd.read_csv('test.csv', encoding='utf-8')

train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### Preprocessing: check empty cells and correct them.

In [8]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Correct invalid data
train_df = train_df.drop(columns='Cabin', axis=1)
test_df = test_df.drop(columns='Cabin', axis=1)
test_df = test_df.dropna()

In [10]:
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)

In [11]:
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)

In [12]:
# Check empty cells again
train_df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [13]:
train_df.replace({'Sex':{'male':0,'female':1}, 'Embarked':{'S':0,'C':1,'Q':2}}, inplace=True)
test_df.replace({'Sex':{'male':0,'female':1}, 'Embarked':{'S':0,'C':1,'Q':2}}, inplace=True)
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,0
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,1


In [14]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0000,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,0
...,...,...,...,...,...,...,...,...,...,...
409,1301,3,"Peacock, Miss. Treasteall",1,3.0,1,1,SOTON/O.Q. 3101315,13.7750,0
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",1,37.0,1,0,19928,90.0000,2
412,1304,3,"Henriksson, Miss. Jenny Lovisa",1,28.0,0,0,347086,7.7750,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,1


### Split on train and test datasets

In [15]:
X = train_df.drop(columns = ['PassengerId','Name','Ticket','Survived'], axis=1)
y = train_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

### Start training Logistic regression model


In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Show metrics

In [17]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Train accuracy = {:.3%}'.format(training_data_accuracy))

test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_prediction)
print('Test accuracy = {:.3%}'.format(test_accuracy))

matrix = confusion_matrix(y_train, model.predict(X_train))
tp = matrix[0][0]
tn = matrix[0][1]
fp = matrix[1][0]
fn = matrix[1][1]
print("\nConfusion matrix for train: \n")
print("TP = {}. \tTN = {} \nFP = {}. \tFN = {}".format(tp, tn, fp, fn))

test_predict = model.predict(X_test)
print("\nCount of survived people = ", np.count_nonzero(test_predict == 1))
print("Count of died people = ", np.count_nonzero(test_predict == 0))

Train accuracy = 80.758%
Test accuracy = 78.212%

Confusion matrix for train: 

TP = 390. 	TN = 59 
FP = 78. 	FN = 185

Count of survived people =  58
Count of died people =  121


### Start training SVC model


In [18]:
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(X_train, y_train)



### Show metrics

In [19]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Train accuracy = {:.3%}'.format(training_data_accuracy))

test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_prediction)
print('Test accuracy = {:.3%}'.format(test_accuracy))

matrix = confusion_matrix(y_train, model.predict(X_train))
tp = matrix[0][0]
tn = matrix[0][1]
fp = matrix[1][0]
fn = matrix[1][1]
print("\nConfusion matrix for train: \n")
print("TP = {}. \tTN = {} \nFP = {}. \tFN = {}".format(tp, tn, fp, fn))

test_predict = model.predict(X_test)
print("\nCount of survived people = ", np.count_nonzero(test_predict == 1))
print("Count of died people = ", np.count_nonzero(test_predict == 0))

Train accuracy = 36.938%
Test accuracy = 44.134%

Confusion matrix for train: 

TP = 0. 	TN = 449 
FP = 0. 	FN = 263

Count of survived people =  179
Count of died people =  0


### Start training MLP model


In [20]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(
    solver= 'lbfgs',
    hidden_layer_sizes= (100,100),
    random_state = 0
).fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


### Show metrics

In [21]:
X_train_prediction = clf.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Train accuracy = {:.3%}'.format(training_data_accuracy))

test_prediction = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_prediction)
print('Test accuracy = {:.3%}'.format(test_accuracy))

matrix = confusion_matrix(y_train, clf.predict(X_train))
tp = matrix[0][0]
tn = matrix[0][1]
fp = matrix[1][0]
fn = matrix[1][1]
print("\nConfusion matrix for train: \n")
print("TP = {}. \tTN = {} \nFP = {}. \tFN = {}".format(tp, tn, fp, fn))

test_predict = clf.predict(X_test)
print("\nCount of survived people = ", np.count_nonzero(test_predict == 1))
print("Count of died people = ", np.count_nonzero(test_predict == 0))

Train accuracy = 82.022%
Test accuracy = 76.536%

Confusion matrix for train: 

TP = 393. 	TN = 56 
FP = 72. 	FN = 191

Count of survived people =  61
Count of died people =  118


### Start training decision tree model


In [22]:
from sklearn.tree import DecisionTreeClassifier

max_depth = 3
clf2 = DecisionTreeClassifier(
    criterion=  'entropy',
    random_state=20,        
    max_depth=max_depth,    
#     max_leaf_nodes=4,
).fit(X_train, y_train)

### Show metrics

In [27]:
X_train_prediction = clf.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Train accuracy = {:.3%}'.format(training_data_accuracy))

test_prediction = clf2.predict(X_test)
test_accuracy = accuracy_score(y_test, test_prediction)
print('Test accuracy = {:.3%}'.format(test_accuracy))

matrix = confusion_matrix(y_train, clf2.predict(X_train))
tp = matrix[0][0]
tn = matrix[0][1]
fp = matrix[1][0]
fn = matrix[1][1]
print("\nConfusion matrix for train: \n")
print("TP = {}. \tTN = {} \nFP = {}. \tFN = {}".format(tp, tn, fp, fn))

test_predict = clf.predict(X_test)
print("\nCount of survived people = ", np.count_nonzero(test_predict == 1))
print("Count of died people = ", np.count_nonzero(test_predict == 0))

Train accuracy = 82.022%
Test accuracy = 79.888%

Confusion matrix for train: 

TP = 401. 	TN = 48 
FP = 70. 	FN = 193

Count of survived people =  61
Count of died people =  118


### Start training Random Forest


In [24]:
from sklearn.ensemble import RandomForestClassifier

clfr = RandomForestClassifier().fit (X_train, y_train)

### Show metrics

In [26]:
X_train_prediction = clfr.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Train accuracy = {:.3%}'.format(training_data_accuracy))

test_prediction = clfr.predict(X_test)
test_accuracy = accuracy_score(y_test, test_prediction)
print('Test accuracy = {:.3%}'.format(test_accuracy))

matrix = confusion_matrix(y_train, clfr.predict(X_train))
tp = matrix[0][0]
tn = matrix[0][1]
fp = matrix[1][0]
fn = matrix[1][1]
print("\nConfusion matrix for train: \n")
print("TP = {}. \tTN = {} \nFP = {}. \tFN = {}".format(tp, tn, fp, fn))

test_predict = clf.predict(X_test)
print("\nCount of survived people = ", np.count_nonzero(test_predict == 1))
print("Count of died people = ", np.count_nonzero(test_predict == 0))

Train accuracy = 98.596%
Test accuracy = 81.006%

Confusion matrix for train: 

TP = 446. 	TN = 3 
FP = 7. 	FN = 256

Count of survived people =  61
Count of died people =  118
