## Import dependencies

In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import matplotlib

## Load data

In [None]:
# Read the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='dataset/train.csv')
# Preview the first five rows (the default for head()) as the cell output.
dataset.head(5)

## Feature selection and data cleaning

In [None]:
# Columns used as model inputs and the column to predict.
new_features = ['Age', 'Pclass', 'Sex', 'Fare']
target_var = ['Survived']

# Take an explicit copy of the feature columns so the cleaning steps below
# modify df_x itself. Writing into a bare column selection of `dataset`
# raises SettingWithCopyWarning, and with pandas Copy-on-Write enabled a
# chained assignment such as df_x['Age'][mask] = value does not update df_x.
df_x = dataset[new_features].copy()
# Selecting with a list keeps df_y as a one-column DataFrame.
df_y = dataset[target_var]

# Missing-value count per feature before cleaning.
print(df_x.isnull().sum())

# Replace missing ages with the median age via a single whole-column assignment.
df_x['Age'] = df_x['Age'].fillna(df_x['Age'].median())

# Encode Sex as an integer. The dict lookup raises KeyError on any value
# other than 'male' or 'female', so unexpected categories fail loudly.
map_to_int = {'male': 0, 'female': 1}
df_x['Sex'] = df_x['Sex'].apply(lambda x: map_to_int[x])

# Missing-value count per feature after cleaning.
print(df_x.isnull().sum())

## Train the model

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
model = linear_model.LogisticRegression()

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.25, random_state = 55)

# y_train is a one-column DataFrame (df_y was selected with a list of column
# names). fit() expects a 1-D target and emits a DataConversionWarning for a
# column vector, so flatten it here. y_train itself is left untouched for the
# cells that use it later.
model.fit(x_train, y_train.to_numpy().ravel())
y_predict = model.predict(x_test)

## Data visualization

In [None]:
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 5]
plt.style.use('dark_background')
matplotlib.rcParams.update({'font.size': 10})

plt.subplot(2,2,1)
plt.title('male vs female survival')
fraction_of_male_survived = np.sum(df_x['Sex'][df_y['Survived']==1]==0)/np.sum(df_x['Sex']==0)
fraction_of_female_survived = np.sum(df_x['Sex'][df_y['Survived']==1]==1)/np.sum(df_x['Sex']==1)
plt.bar(["fraction_of_male_survived", "fraction_of_female_survived"], [fraction_of_male_survived, fraction_of_female_survived])
plt.tight_layout()

plt.subplot(2,2,2)
plt.title('survival of passenger classes')
p1class_survival = np.sum(df_x['Pclass'][df_y['Survived']==1]==1)/np.sum(df_x['Pclass']==1)
p2class_survival = np.sum(df_x['Pclass'][df_y['Survived']==1]==2)/np.sum(df_x['Pclass']==2)
p3class_survival = np.sum(df_x['Pclass'][df_y['Survived']==1]==3)/np.sum(df_x['Pclass']==3)
plt.bar(["p1 class", "p2 class", "p3 class"], [p1class_survival, p2class_survival, p3class_survival])
plt.tight_layout()

plt.subplot(2,2,3)
plt.title('survival vs fare')
under_10 = np.sum(df_x['Fare'][df_y['Survived']==1] <= 10)/np.sum(df_x['Fare']<= 10)
_10_to_30 = ((df_x['Fare'][df_y['Survived']==1] <= 30) & (df_x['Fare'][df_y['Survived']==1] > 10)).sum()/((df_x['Fare'] > 10) & (df_x['Fare'] <= 30)).sum()
above_30 = np.sum(df_x['Fare'][df_y['Survived']==1] > 30)/np.sum(df_x['Fare']> 30) 
plt.bar(["under 10", "10 to 30", "above 30"], [under_10, _10_to_30, above_30])
plt.tight_layout()

plt.subplot(2,2,4)
plt.title('survival vs age')
under_10 = np.sum(df_x['Age'][df_y['Survived']==1] <= 10)/np.sum(df_x['Age']<= 10)
_10_to_30 = ((df_x['Age'][df_y['Survived']==1] <= 30) & (df_x['Age'][df_y['Survived']==1] > 10)).sum()/((df_x['Age'] > 10) & (df_x['Age'] <= 30)).sum()
above_30 = np.sum(df_x['Age'][df_y['Survived']==1] > 30)/np.sum(df_x['Age']> 30) 
plt.bar(["under 10", "10 to 30", "above 30"], [under_10, _10_to_30, above_30])
plt.tight_layout()

## Evaluation

In [None]:
# Score of the fitted model on the held-out test split.
test_score = model.score(x_test, y_test)
print("model score: ", test_score)

# Per-class precision/recall/F1 on the test split, then on the training
# split; comparing the two gives a quick view of over- or under-fitting.
test_report = classification_report(y_test, y_predict)
print(test_report)

train_predictions = model.predict(x_train)
train_report = classification_report(y_train, train_predictions)
print(train_report)