In [134]:
import warnings

warnings.simplefilter(action='ignore', category=Warning)

import pandas as pd
import numpy as np

train_df = pd.read_csv('Datasets/train.csv')
test_df = pd.read_csv('Datasets/test.csv')

## Choosing the features

I have chosen the age of the passenger, their gender and their social class as features to train the classifier.

The reasoning for selecting these are as follows:

1. I chose age as young people would be more likely to survive or be saved by older passengers and thus contribtes majorly to the survivability. 

2. People on the sinking cruise tried to prioritise saving the women and children first so the gender of the passenger should also be a feature.

3. Passengers of a higher class would be prioritised over the rest of the passengers by the crew of the ship. So the social class also must be included as a feature.


## Filling the gaps in the age column

In order to do that, I have decided to consider the title of the passenger with the missing age and then fill the gap with the average age for that particular title.

In [135]:
#extracting the different titles used by the passengers and adding to a separate 'titles' column
name_df = train_df['Name']
titles = name_df.apply(lambda x: x.split(',')[1].split('.')[0].strip())
title_list = np.unique(titles) #stores list of unique titles

train_df['titles'] = titles


title_grp = train_df.groupby('titles')

title_age_dict = {}

#creating a dict where key is the title and value is the average age for that title
for title in title_list:
    title_age_dict[title] = title_grp.get_group(title)['Age'].mean()

title_age_dict = {k:round(title_age_dict[k], 1) for k in title_age_dict}
#title_age_dict

In [None]:
'''
now replacing all NaN values in age column
'''
''' for title in title_list:
    ages = title_grp.get_group(title)['Age']
    if ages.hasnans == True:
        ages = ages.fillna(title_age_dict[title])
        train_df['Age'].loc[ages.index.values] = ages.values '''


Creating a function df_cleaner to perform the above tasks on test and train dataset as well as converting 'male' and 'female' strings in Sex column to numerical 1 and 2 so that classifier can use these values

In [137]:
def df_cleaner(df):
    df['Sex'] = df['Sex'].replace({'male': 1, 'female': 2})

    name_df = df['Name']
    titles = name_df.apply(lambda x: x.split(',')[1].split('.')[0].strip())
    title_list = np.unique(titles)

    df['titles'] = titles

    title_grp = df.groupby('titles')
    for title in title_list:
        ages = title_grp.get_group(title)['Age']
        if ages.hasnans == True:
            ages = ages.fillna(title_age_dict[title])
            df['Age'].loc[ages.index.values] = ages.values

    return df

In [138]:
train_df = df_cleaner(train_df)
test_df = df_cleaner(test_df)

In [139]:
#creating test and train splits
x_train_df = train_df.iloc[:, [2, 4, 5]].values

y_train_df = train_df.iloc[:, [1]].values

x_test_df = test_df.iloc[:, [1, 3, 4]].values

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_train_df, y_train_df, test_size = 0.25, random_state = 0)


In [140]:
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train) 

from sklearn.linear_model import LogisticRegression

#creating Logisting Regression Classifier
classifier = LogisticRegression(random_state = 0)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

from sklearn.metrics import accuracy_score

print ("Accuracy : ", accuracy_score(y_test, y_pred))

Accuracy :  0.6233183856502242


In [141]:
x_test = sc_x.transform(x_test_df)

y_pred = classifier.predict(x_test)

In [142]:
import csv

submission = pd.DataFrame(test_df['PassengerId'])
submission['Survived'] = y_pred

#creating csv for submission to kaggle
submission.to_csv('submission.csv', index=False)