In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Reading data from csv using pandas
data = pd.read_csv('train.csv', sep=',')
data_test = pd.read_csv('test.csv', sep=',')

In [None]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
data.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Eklund, Mr. Hans Linus",male,347082,C23 C25 C27,S
freq,1,577,7,4,644


In [None]:
data[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [None]:
train_df = data.drop(['Ticket', 'Cabin'], axis=1)
test_df = data_test.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]

In [None]:
for dataset in combine:
  dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [None]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [None]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
  dataset['Title'] = dataset['Title'].map(title_mapping)
  dataset['Title'] = dataset['Title'].fillna(0)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [None]:
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
train_df.shape, test_df.shape
combine = [train_df, test_df]

In [None]:
for dataset in combine:
  dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


In [None]:
# Fill missing values of age based on the pclass, sex
# By mean of combitions of pclass and sex
# we have 3 pclass and 2 sex  which form a combination of 6
guess_ages = np.zeros((2,3))
for dataset in combine:
  for i in range(0, 2):
    for j in range(0,3):
      guess_df = dataset[(dataset['Pclass'] == j+1) & (dataset['Sex'] == i)]['Age'].dropna()

      ages = guess_df.median()
      guess_ages[i,j] = int( ages/0.5 + 0.5 ) * 0.5
  for i in range(0, 2):
      for j in range(0, 3):
        dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                'Age'] = guess_ages[i,j]
  dataset['Age'] = dataset['Age'].astype(int)
train_df.head()

# to split age into bands
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

# Normalize age
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] < 64, 'Age']

train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,7.25,S,1
1,1,1,1,2,1,0,71.2833,C,3
2,1,3,1,1,0,0,7.925,S,2
3,1,1,1,2,1,0,53.1,S,3
4,0,3,0,2,0,0,8.05,S,1


In [None]:
freq_embarked = train_df.Embarked.dropna().mode()[0]
for dataset in combine:
  dataset['Embarked'] = dataset['Embarked'].fillna(freq_embarked)

train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index = False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [None]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,7.25,0,1
1,1,1,1,2,1,0,71.2833,1,3
2,1,3,1,1,0,0,7.925,0,2
3,1,1,1,2,1,0,53.1,0,3
4,0,3,0,2,0,0,8.05,0,1


In [None]:

test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,892,3,0,2,0,0,7.8292,2,1
1,893,3,1,2,1,0,7.0,0,3
2,894,2,0,3,0,0,9.6875,2,1
3,895,3,0,1,0,0,8.6625,0,1
4,896,3,1,1,1,1,12.2875,0,3


In [None]:
combine = [train_df, test_df]
for dataset in combine:
  dataset['isAlone'] = 0
  dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
  dataset.loc[dataset['FamilySize'] == 1, 'isAlone'] = 1

In [None]:
for dataset in combine:
  dataset['AgeClass'] = dataset['Pclass'] * dataset['Age']
train_df.loc[:,['Age', 'AgeClass']]

Unnamed: 0,Age,AgeClass
0,1,3
1,2,2
2,1,3
3,2,2
4,2,6
...,...,...
886,1,2
887,1,1
888,1,3
889,1,1


In [None]:
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    int64  
 3   Age         891 non-null    int64  
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Fare        891 non-null    float64
 7   Embarked    891 non-null    int64  
 8   Title       891 non-null    int64  
 9   isAlone     891 non-null    int64  
 10  FamilySize  891 non-null    int64  
 11  AgeClass    891 non-null    int64  
 12  custom      891 non-null    int64  
dtypes: float64(1), int64(12)
memory usage: 90.6 KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,isAlone,FamilySize,AgeClass,custom
0,0,3,0,1,1,0,7.25,0,1,1,1,3,5
1,1,1,1,2,1,0,71.2833,1,3,1,1,2,7
2,1,3,1,1,0,0,7.925,0,2,0,0,3,7
3,1,1,1,2,1,0,53.1,0,3,1,1,2,7
4,0,3,0,2,0,0,8.05,0,1,0,0,6,6


In [None]:
# Get only sepcific columns
def processdata (data, label) :
  # Data normalization and converting pandas to numpy array
  temp = data[cols]
  temp['Fare'] = temp['Fare'].apply(lambda x:x / temp['Fare'].max()).round(2)
  # temp['Age'] = temp['Age'].apply(lambda x:x / temp['Age'].max()).round(2)
  # temp.fillna(value=0, inplace=True)
  
  temp = temp.to_numpy()
  # temp = temp / np.amax(temp, axis=0)
  if (label) :
    labels = data.loc[:, ['Survived']]
    return temp, labels.to_numpy()
  else :
    return temp

In [None]:
train_df = train_df.sample(frac=1) # to shuffle
cols = ['Age', 'Sex', 'Pclass', 'Fare', 'Embarked', 'Title', 'isAlone', 'AgeClass']
input_data_n, labels_n = processdata(train_df, True) # to fill null values
test_data_n = processdata(test_df, False)
print(input_data_n.shape)
labels_n.shape
# input_data.loc[input_data.Age.notnull()] # Returns
# input_data = input_data.dropna(how='any') # To drop any rows that have missing data.

(891, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


(891, 1)

In [None]:
# hyper parameters
length = len(input_data_n)
split_size = 1

In [None]:
length = len(input_data_n)
train_input = input_data_n[:int(length*split_size)]
train_label = labels_n[:int(length*split_size)]
test_input = input_data_n[int(length*split_size):]
test_label = labels_n[int(length*split_size):]
print(train_input.shape)
print(train_label.shape)
print(test_input.shape)
print(test_label.shape)

(891, 9)
(891, 1)
(0, 9)
(0, 1)


In [None]:
classifiers = [ 
        SVC(), 
        GaussianNB(), 
        KNeighborsClassifier(), 
        Perceptron(), 
        LinearSVC(), 
        SGDClassifier(), 
        DecisionTreeClassifier(), 
        RandomForestClassifier()
      ]

In [None]:
def getScores(validation):
  train_scores = []
  test_scores = []
  for clf in classifiers:
    clf.fit(train_input, train_label)
    train_scores.append(round(clf.score(train_input, train_label) * 100, 2))
    if (validation) :
       test_scores.append(round(clf.score(test_input, test_label) * 100, 2))
  return train_scores, test_scores

In [None]:
def getSubmission (value):
  for clf in classifiers:
    if (clf.__class__.__name__ == value):
      print(clf.__class__.__name__)
      predictions = clf.predict(test_data_n)
  return predictions

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, input_shape=[train_input.shape[1]], activation='relu'),
    tf.keras.layers.Dense(5, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 10)                100       
_________________________________________________________________
dense_34 (Dense)             (None, 5)                 55        
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 6         
Total params: 161
Trainable params: 161
Non-trainable params: 0
_________________________________________________________________


In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [None]:
#history = model.fit(train_input, train_label, validation_data=(test_input, test_label), epochs=500, verbose=2)
history = model.fit(train_input, train_label, epochs=500, verbose=2)

Epoch 470/500
28/28 - 0s - loss: 0.3851 - acc: 0.8260
Epoch 471/500
28/28 - 0s - loss: 0.3905 - acc: 0.8294
Epoch 472/500
28/28 - 0s - loss: 0.3804 - acc: 0.8395
Epoch 473/500
28/28 - 0s - loss: 0.3952 - acc: 0.8339
Epoch 474/500
28/28 - 0s - loss: 0.3846 - acc: 0.8406
Epoch 475/500
28/28 - 0s - loss: 0.3869 - acc: 0.8328
Epoch 476/500
28/28 - 0s - loss: 0.3832 - acc: 0.8395
Epoch 477/500
28/28 - 0s - loss: 0.3949 - acc: 0.8260
Epoch 478/500
28/28 - 0s - loss: 0.3888 - acc: 0.8249
Epoch 479/500
28/28 - 0s - loss: 0.3918 - acc: 0.8361
Epoch 480/500
28/28 - 0s - loss: 0.3900 - acc: 0.8272
Epoch 481/500
28/28 - 0s - loss: 0.3851 - acc: 0.8350
Epoch 482/500
28/28 - 0s - loss: 0.3881 - acc: 0.8339
Epoch 483/500
28/28 - 0s - loss: 0.3845 - acc: 0.8339
Epoch 484/500
28/28 - 0s - loss: 0.3862 - acc: 0.8305
Epoch 485/500
28/28 - 0s - loss: 0.3903 - acc: 0.8339
Epoch 486/500
28/28 - 0s - loss: 0.3917 - acc: 0.8339
Epoch 487/500
28/28 - 0s - loss: 0.3846 - acc: 0.8328
Epoch 488/500
28/28 - 0s - l

In [None]:
def getNNPrediction (input, label, predictions = False):
  pred = model.predict(input)
  pred = np.around(pred).astype(int)
  if (predictions):
    return pred.flatten()
  acc_pred = np.mean(pred == label)
  return acc_pred

def getNNScores(validation):
  train_scores, test_scores = getScores(validation)
  acc_pred = getNNPrediction(train_input, train_label) 
  train_scores.append(acc_pred * 100)
  if (validation):
    acc_pred = getNNPrediction(train_input, train_label)
    test_scores.append(acc_pred * 100)
  else:
    test_scores = train_scores
  return train_scores, test_scores  

In [None]:
train_scores, test_scores = getNNScores(False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  """
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  """
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  """
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  """


In [None]:
names = list(map(lambda x: x.__class__.__name__, classifiers))
names.append("Neural network")
models = pd.DataFrame({
    'models': names,
    'train-accuracy': train_scores,
    'validation-accuracy': test_scores 
})
models.sort_values(by='validation-accuracy', ascending=False)

Unnamed: 0,models,train-accuracy,validation-accuracy
6,DecisionTreeClassifier,89.56,89.56
7,RandomForestClassifier,89.56,89.56
2,KNeighborsClassifier,86.2,86.2
8,Neural network,83.501684,83.501684
4,LinearSVC,79.12,79.12
0,SVC,78.23,78.23
5,SGDClassifier,75.76,75.76
3,Perceptron,74.97,74.97
1,GaussianNB,74.3,74.3


Unnamed: 0,models,train-accuracy,validation-accuracy
6,DecisionTreeClassifier,89.56,89.56
7,RandomForestClassifier,89.56,89.56
2,KNeighborsClassifier,85.41,85.41
8,Neural network,83.05275,83.05275
4,LinearSVC,79.01,79.01
0,SVC,78.34,78.34
3,Perceptron,74.86,74.86
1,GaussianNB,71.16,71.16
5,SGDClassifier,38.38,38.38


In [None]:
# predictions = getNNPrediction(test_data_n, [], True)
predictions = getSubmission('DecisionTreeClassifier')
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})

output.to_csv('my_submission.csv', index=False)

DecisionTreeClassifier


In [None]:
test_validation = train_df[:int(length*split_size)]
test_validation.loc[:, 'prediction'] = getNNPrediction(train_input, [], True)
cols_val = ['Survived', 'prediction']
cols_val = cols_val + list(cols)
wrongPred = test_validation.loc[test_validation['prediction'] != test_validation['Survived'], cols_val]
wrongPred.groupby(['prediction']).mean().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0_level_0,Survived,Age,Sex,Pclass,Fare,Embarked,Title,isAlone,AgeClass
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,2,0,2,23,0,1,0,3
1,0,1,0,2,33,0,2,0,2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0_level_0,Survived,Age,Sex,Pclass,Fare,Embarked,Title,isAlone,AgeClass,custom
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,2,0,2,33,0,1,0,3,6
1,0,1,0,2,38,0,2,0,2,6


In [None]:
acc = history.history['acc']
loss = history.history['loss']
val_acc = history.history['val_acc']
val_loss = history.history['val_loss']
epochs = range(len(acc))

plt.plot(acc)
plt.plot(val_acc)
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend(['Accuracy', 'Validation accuracy'])
plt.title('Accuracy')

KeyError: ignored

KeyError: ignored