In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as k
from tensorflow.keras import layers as l

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Converting Text column 'Sex' to numeric
df_train.Sex = pd.get_dummies(df_train['Sex'], columns='Sex')

In [6]:
# Making sure null values and zeros are erradicated from column 'Age'. Such places will be occupied by the mean for it 
# to be more convenient rather than having null values or zeros for passenger age
df_train.Age = df_train.Age.replace(0, np.NaN)
mean_age = int(df_train['Age'].mean())
df_train.Age = df_train.Age.replace(np.NaN, mean_age)
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.560236,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,13.00501,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,0.0,29.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [7]:
# Categorizing and converting the 'Embarked' column to corresponding numeric values
df_train.Embarked = pd.Categorical(df_train.Embarked, ('C', 'Q', 'S'), ordered=True)
df_train.Embarked = df_train.Embarked.cat.codes

In [8]:
# Wiping out unnecessary details from the train dataset
df_train = df_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df_train.head(63)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,2
1,1,1,1,38.0,1,0,71.2833,0
2,1,3,1,26.0,0,0,7.9250,2
3,1,1,1,35.0,1,0,53.1000,2
4,0,3,0,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
58,1,2,1,5.0,1,2,27.7500,2
59,0,3,0,11.0,5,2,46.9000,2
60,0,3,0,22.0,0,0,7.2292,0
61,1,1,1,38.0,0,0,80.0000,-1


In [9]:
# Making sure column 'Embarked' does not contain NULL values
mean_embarked = int(df_train['Embarked'].mean())
df_train.Embarked = df_train.Embarked.replace(-1, mean_embarked)

In [10]:
df_train['family_size'] = df_train.SibSp + df_train.Parch + 1
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,family_size
0,0,3,0,22.0,1,0,7.2500,2,2
1,1,1,1,38.0,1,0,71.2833,0,2
2,1,3,1,26.0,0,0,7.9250,2,1
3,1,1,1,35.0,1,0,53.1000,2,2
4,0,3,0,35.0,0,0,8.0500,2,1
...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,2,1
887,1,1,1,19.0,0,0,30.0000,2,1
888,0,3,1,29.0,1,2,23.4500,2,4
889,1,1,0,26.0,0,0,30.0000,0,1


In [11]:
max(df_train['family_size'])

11

In [12]:
def family_strength(fam_size):
    strength = 0
    if fam_size == 1:
#         alone
        strength = 1
    elif fam_size <=5:
#         medium
        strength = 2
    else:
#         strong
        strength = 3

In [15]:
df_train['family_strength'] = df_train['family_size'].map(int, family_strength)

In [16]:
df_train['alone'] = [1 if i == 1 else 0 for i in df_train.family_size]
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,family_size,family_strength,alone
0,0,3,0,22.0,1,0,7.2500,2,2,2,0
1,1,1,1,38.0,1,0,71.2833,0,2,2,0
2,1,3,1,26.0,0,0,7.9250,2,1,1,1
3,1,1,1,35.0,1,0,53.1000,2,2,2,0
4,0,3,0,35.0,0,0,8.0500,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,2,1,1,1
887,1,1,1,19.0,0,0,30.0000,2,1,1,1
888,0,3,1,29.0,1,2,23.4500,2,4,4,0
889,1,1,0,26.0,0,0,30.0000,0,1,1,1


In [None]:
y = df_train['Survived']
x = df_train.drop(['Survived'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Splitting data to train and test where 10% will be used to test and the rest to be trained
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

In [None]:
df_train[['Age', 'Survived']].groupby(['Age'], as_index=False).mean().sort_values(by='Survived')

In [None]:
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Cleaning the test dataset same as the train dataset

In [None]:
df_test.Sex = pd.get_dummies(df_test['Sex'])
df_test.Embarked = pd.Categorical(df_test.Embarked, ('C', 'Q', 'S'), ordered=True)
df_test.Embarked = df_test.Embarked.cat.codes
df_test.head()

In [None]:
passenger_id = df_test['PassengerId']
df_test = df_test.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
df_test.head()

In [None]:
df_test.Age = df_test.Age.replace(0, np.NaN)
test_mean_age = int(df_test.Age.mean())
df_test.Age = df_test.Age.replace(np.NaN, test_mean_age)

In [None]:
test_mean_fare = df_test.Fare.mean()
df_test.Fare = df_test.Fare.replace(np.NaN, test_mean_fare)

In [None]:
test_mean_embarked = int(df_test.Embarked.mean())
df_test.Embarked = df_test.Embarked.replace(-1, test_mean_embarked)
df_test.describe()

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier

In [None]:
# scores_ = []
# highest_score = 0
# estimator = 0
# for i in range (1, 25):
#     rnd_forest = RandomForestClassifier(n_estimators=i)
#     rnd_forest.fit(x, y)
#     scores_.append(round(rnd_forest.score(x_test, y_test), 2))
#     if scores_[i-1] >= highest_score:
#         highest_score = scores_[i-1]
#         estimator = i
# scores_

In [None]:
# import matplotlib.pyplot as plt
# %matplotlib inline

# # Plotting a graph with respect to n_estimators and scores
# plt.plot([i for i in range(1, 25)], scores_, color='blue')

# for i in range(1, 25):
#     plt.text(i, scores_[i - 1], (i, scores_[i - 1]))

In [None]:
# # Training the dataset for prediction
# model = DecisionTreeClassifier()
# model.fit(x, y)
# y_pred = model.predict(df_test)
# print(highest_score)
# print(estimator)

In [None]:
# model.score(df_test, y_pred)

In [None]:
# # Final dataset after prediction
# df_test['Survived'] = y_pred
# df_test.head()

In [None]:
# # Generating the submission file as a csv
# submit_prediction = pd.DataFrame({
#     'PassengerId': passenger_id,
#     'Survived': y_pred
# })

# submit_prediction.to_csv('submission.csv', index=False)

In [None]:
df_train.keys()

In [None]:
model = k.Sequential([
    l.Flatten(),
    l.Dense(24, activation=tf.nn.relu),
#     l.Dense(64, activation=tf.nn.softmax),
#     l.Dense(16, activation=tf.nn.sigmoid),
    l.Dense(2, activation=tf.nn.sigmoid)
])

In [None]:
model.compile(optimizer=tf.optimizers.Adam(),
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [None]:
model.fit(np.array(x), np.array(y), epochs=10)

In [None]:
y_pred = model.predict(df_test)
survived = []
for j in range(0, len(y_pred)):
    survived.append(np.argmax(y_pred[j]))


In [None]:
submit_prediction = pd.DataFrame({
    'PassengerId': passenger_id,
    'Survived': survived
})

submit_prediction.to_csv('submission.csv', index=False)