In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load data

In [1]:
# Read the three competition files: training set, test set and the sample submission.
train_data, test_data, sample_submission_data = map(
    pd.read_csv,
    (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    ),
)

In [1]:
# Preview the first rows of the training set.
train_data.head()

In [1]:
# An initial description before cleaning and exploratory data analysis.
train_data.describe()

It is strange that the minimum value for Fare is zero. Let's look for NaNs first, then take care of it.

In [1]:
# Null data at a glance: info() prints a per-column summary of the frame.
train_data.info()
# Double-check with an explicit count of missing values per column.
train_data.isna().sum()

We have null data for the age, cabin, and embarked columns.

In [1]:
# Maybe the zero fares belong to babies? Inspect the ages of the passengers
# whose fare is below 5.
fare_mask = train_data['Fare'].lt(5)
free_loaders = train_data.loc[fare_mask]
print(free_loaders.loc[:, ['Age', 'Fare']])

So some people are just not paying. Let's not worry about them.

In [1]:
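# TODO: ask the TA how zero fares should be imputed; ignored for now.
# The draft below is disabled. NOTE(review): as written, `apply(..., axis=0)`
# iterates over columns, not rows - a row-wise imputation would need axis=1.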
# train_data.Fare = train_data.Fare.map(lambda x: np.nan if x==0 else x)
# classmeans = train_data.pivot_table('Fare', index='Pclass', aggfunc='mean')
# print(classmeans)
# train_data.Fare = train_data[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isna(x['Fare']) else x['Fare'], axis=0 )
# print(free_loaders[['Age','Fare']])

In [1]:
# Impute the missing ages with the mean of the observed ages.
meanAge = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(meanAge)

In [1]:
# Most Cabin values are missing, so the absence is treated as information in
# its own right: those rows are labelled 'Unknown'.
train_data['Cabin'] = train_data['Cabin'].fillna(value='Unknown')

In [1]:
# Fill the missing Embarked values with the most frequent port. Embarked is
# categorical, so the mode plays the role of the "mean"; a forward fill would
# instead copy whatever port the previous row happens to hold. This also
# avoids the `fillna(method=...)` keyword, which recent pandas deprecates.
most_common_port = train_data['Embarked'].mode().iloc[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_port)

In [1]:
# Confirm the imputation worked: count the remaining missing values per column.
# isnull() is an alias of isna(), and only a cell's last expression is
# displayed, so a single call is enough.
train_data.isna().sum()

All null values are gone. Let's see how that changes our stats.

In [1]:
# Summary statistics again, now that the missing values have been filled.
train_data.describe()

# EDA

There is a well-known correlation between gender and survival. It is also covered in the Titanic tutorial.

In [1]:
# Survival flags of the female passengers and the fraction of them who survived.
women = train_data.loc[train_data['Sex'] == 'female', "Survived"]
rate_women = women.sum() / len(women)

print("% of women who survived:", rate_women)

In [1]:
# Survival flags of the male passengers and the fraction of them who survived.
men = train_data.loc[train_data['Sex'] == 'male', "Survived"]
rate_men = men.sum() / len(men)

print("% of men who survived:", rate_men)

In [1]:
# Let's do some more eda with plotting stuff
import seaborn as sns
import matplotlib.pyplot as plt

# Correlate the numeric columns only. The frame still holds text columns
# (Name, Sex, Ticket, ...), and calling .corr() on those raises on recent
# pandas; selecting by dtype works on old and new versions alike.
corr = train_data.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

# the plotting
sns.heatmap(corr, mask=mask, center=0, cmap='cool', linewidths=1, annot=True, fmt=".2f")
plt.show()

In [1]:
# Class balance: number of passengers per value of Survived.
sns.countplot(data=train_data, x='Survived')

In [1]:
# Pairwise plots of the training data, coloured by the Survived column.
sns.pairplot(data=train_data,hue='Survived')

In [1]:
# Survival plotted against sex.
sns.barplot(data=train_data, x='Sex', y='Survived')

In [1]:
# Survivor / non-survivor counts broken down by class, sex and port of
# embarkation - one stacked panel per feature.
f, axes = plt.subplots(3, figsize=(9, 15))
#sns.set_style("darkgrid")
for panel, feature in zip(axes, ('Pclass', 'Sex', 'Embarked')):
    sns.countplot(x=feature, hue='Survived', data=train_data, ax=panel)
plt.show()

In [1]:
# There are a lot of distinct cabin values; this can probably be reduced.
unique_cabins = train_data.Cabin.unique()
print(unique_cabins)

In [1]:
# The ticket values can probably be reduced as well.
unique_tickets = train_data.Ticket.unique()
print(unique_tickets)

In [1]:
# Sex is tied to survival, and the title embedded in each name encodes sex
# (among other things), so pull the title out as its own feature.
def _extract_title(full_name):
    """Return the text between the first ',' and the following '.' in `full_name`, stripped."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


train_data['name_title'] = train_data['Name'].map(_extract_title)
train_data['name_title'].value_counts()

In [1]:
# Number of space-separated cabin entries per passenger. Missing cabins were
# already replaced with 'Unknown' above, so those rows count as one entry and
# the NaN branch is only a safeguard.
def _count_cabins(cabin):
    """Return how many space-separated entries `cabin` holds (0 when it is missing)."""
    if pd.isna(cabin):
        return 0
    return len(cabin.split(' '))


train_data['cabin_multiple'] = train_data['Cabin'].map(_count_cabins)
train_data['cabin_multiple'].value_counts()

In [1]:
# Passenger counts per (Survived, cabin_multiple) combination; Ticket is only the column being counted.
pd.pivot_table(train_data, index = 'Survived', columns = 'cabin_multiple', values = 'Ticket' ,aggfunc ='count')

In [1]:
# Keep only the first character of each Cabin value as a coarser feature.
def _first_cabin_char(cabin):
    """Return the first character of `cabin` after converting it to a string."""
    return str(cabin)[0]


train_data['cabin_adv'] = train_data['Cabin'].map(_first_cabin_char)
print(train_data['cabin_adv'].value_counts())

In [1]:
# Passenger counts per (Survived, cabin_adv) combination; Name is only the column being counted.
pd.pivot_table(train_data,index='Survived',columns='cabin_adv', values = 'Name', aggfunc='count')

In [1]:
# Understand the ticket values better: purely numeric tickets versus tickets
# that carry a textual prefix.
def _is_numeric_ticket(ticket):
    """Return 1 when `ticket` consists of numeric characters only, else 0."""
    return int(ticket.isnumeric())


def _ticket_prefix(ticket):
    """Return the normalised prefix of `ticket` (everything before the last
    space-separated token, with '.' and '/' removed, lower-cased), or 0 when
    there is no prefix."""
    prefix_parts = ticket.split(' ')[:-1]
    if not prefix_parts:
        return 0
    return ''.join(prefix_parts).replace('.', '').replace('/', '').lower()


train_data['numeric_ticket'] = train_data['Ticket'].map(_is_numeric_ticket)
train_data['ticket_letters'] = train_data['Ticket'].map(_ticket_prefix)

In [1]:
# How many tickets are purely numeric (1) versus not (0).
train_data['numeric_ticket'].value_counts()

In [1]:
# Show every row of the (long) value_counts output. The fully qualified key
# is required: the bare "max_rows" pattern matches more than one option on
# newer pandas and raises an OptionError.
pd.set_option("display.max_rows", None)
train_data['ticket_letters'].value_counts()

In [1]:
# Passenger counts per (Survived, numeric_ticket) combination.
pd.pivot_table(train_data,index='Survived',columns='numeric_ticket', values = 'Ticket', aggfunc='count')

In [1]:
# Passenger counts per (Survived, ticket_letters) combination.
pd.pivot_table(train_data,index='Survived',columns='ticket_letters', values = 'Ticket', aggfunc='count')

# Models
We prepare the models for building, then build them.

In [1]:
# Drop the columns that are not fed to the models.
unused_columns = ["PassengerId", "Name", "Cabin", "Ticket", "ticket_letters"]
train_reduced = train_data.drop(columns=unused_columns)
train_reduced.head()

In [1]:
# Replace Sex with a single indicator column: drop_first leaves one dummy
# column, which is taken explicitly here.
sex_dummies = pd.get_dummies(train_reduced['Sex'], drop_first=True)
train_reduced['Sex'] = sex_dummies.iloc[:, 0]

In [1]:
# label encoder will be used for Embarked, name_title, and cabin_adv
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

# One encoder instance, re-fitted for each categorical column in turn; after
# the loop it holds the fit for the last column, cabin_adv.
lr = LabelEncoder()
for categorical_column in ('Embarked', 'name_title', 'cabin_adv'):
    train_reduced[categorical_column] = lr.fit_transform(train_reduced[categorical_column])
train_reduced.head()

In [1]:
# Separate the target from the features.
y = train_reduced['Survived']
X = train_reduced.drop("Survived", axis=1)

# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not leak into the mean/variance used for standardisation. The row
# split itself is unchanged: it depends only on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility: the full feature matrix on the same scale.
X_std = scaler.transform(X)

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE

# Linear regression baseline, fitted directly on the Survived labels.
# (The earlier reshape experiments were dropped: fit() is called with the
# 1-D target as is.)
line_reg = LinearRegression()
line_reg.fit(X=X_train, y=y_train)

In [1]:
# k-nearest-neighbours classifier with k = 4.
knn = KNN(n_neighbors=4)
knn.fit(X=X_train, y=y_train)

In [1]:
# Decision tree split on entropy; the seed is fixed for reproducibility.
dt = DecisionTreeClassifier(random_state=0, criterion='entropy')
dt.fit(X=X_train, y=y_train)

In [1]:
# Linear classifier trained with stochastic gradient descent, seeded.
sgd = SGDClassifier(random_state=0)
sgd.fit(X=X_train, y=y_train)

# Evaluation
We will find the predictions of each model and get the accuracy score. The one with the best accuracy will be used.

In [1]:
# The regression model outputs continuous values, so it is not scored with
# accuracy; the root-mean-squared error on the hold-out set is reported instead.
pred_line_reg = line_reg.predict(X_test)
mse_line_reg = MSE(y_test, pred_line_reg)
rmse = np.sqrt(mse_line_reg)
print(rmse)

In [1]:
# Hold-out accuracy of the k-nearest-neighbours model.
pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_true=y_test, y_pred=pred_knn)
print(acc_knn)

In [1]:
# Hold-out accuracy of the decision tree.
pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_true=y_test, y_pred=pred_dt)
print(acc_dt)

In [1]:
# Hold-out accuracy of the SGD classifier.
pred_sgd = sgd.predict(X_test)
acc_sgd = accuracy_score(y_true=y_test, y_pred=pred_sgd)
print(acc_sgd)

# Submission
The best model is knn. We will use that to generate the submission data.

In [1]:
# Column count and names of the test set, followed by a preview of its rows.
test_columns = test_data.columns
print(len(test_columns))
print(test_columns)
test_data.head()

In [1]:
# Count the missing values per column to see what needs to be fixed in the test set.
test_data.isna().sum()

In [1]:
# Impute the test set's missing values the same way as the training set.
meanAge = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(meanAge)
test_data['Cabin'] = test_data['Cabin'].fillna('Unknown')
# Fare is filled from the Fare column itself. Filling from Age would replace
# every fare with the passenger's age and feed the model a wrong feature.
meanFare = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(meanFare)
test_data.isna().sum()

In [1]:
# Add the engineered features to the test set. `inbetween` is an alias of
# test_data, not a copy, so every new column also appears on test_data.
inbetween = test_data


def _title_of(full_name):
    """Return the text between the first ',' and the following '.' in `full_name`, stripped."""
    return full_name.split(',')[1].split('.')[0].strip()


def _cabin_count(cabin):
    """Return how many space-separated entries `cabin` holds (0 when it is missing)."""
    if pd.isna(cabin):
        return 0
    return len(cabin.split(' '))


def _ticket_letters(ticket):
    """Return the normalised ticket prefix, or 0 when the ticket has none."""
    prefix_parts = ticket.split(' ')[:-1]
    if not prefix_parts:
        return 0
    return ''.join(prefix_parts).replace('.', '').replace('/', '').lower()


inbetween['name_title'] = inbetween['Name'].map(_title_of)
inbetween['cabin_multiple'] = inbetween['Cabin'].map(_cabin_count)
inbetween['cabin_adv'] = inbetween['Cabin'].map(lambda cabin: str(cabin)[0])
inbetween['numeric_ticket'] = inbetween['Ticket'].map(lambda ticket: int(ticket.isnumeric()))
inbetween['ticket_letters'] = inbetween['Ticket'].map(_ticket_letters)
inbetween.head()

In [1]:
# Keep the passenger ids for the submission file.
id_col = test_data['PassengerId']

# Replace Sex with its single indicator column, as done for the training features.
inbetween['Sex'] = pd.get_dummies(inbetween['Sex'], drop_first=True).iloc[:, 0]

# Encode the categorical columns with the label -> code mapping learned from
# the TRAINING data. Re-fitting the encoder on the test set would hand out
# different integers for the same label whenever the two sets do not contain
# exactly the same categories, so the model would see scrambled features.
# train_data still holds the raw string labels for these columns (only the
# train_reduced copy was encoded).
for categorical_column in ('Embarked', 'name_title', 'cabin_adv'):
    lr.fit(train_data[categorical_column])
    label_codes = {label: code for code, label in enumerate(lr.classes_)}
    # Labels never seen during training get the sentinel code -1.
    inbetween[categorical_column] = (
        inbetween[categorical_column].map(label_codes).fillna(-1).astype(int)
    )
inbetween.head()

In [1]:
# Build the test feature matrix with the same columns, in the same order, as
# the training feature frame X.
X_semifinal = test_data.drop(["PassengerId","Name","Cabin","Ticket","ticket_letters"],axis=1)
X_semifinal = X_semifinal[X.columns]

# Reuse the scaler that was fitted for training: transform only. Re-fitting on
# the test set would standardise it with different means/variances than the
# ones the model was trained on.
X_final = scaler.transform(X_semifinal)

# NOTE(review): this cell used to carry an "# error here" marker above the
# predict call; the cause is not visible in this notebook - confirm it is gone.
predictions = knn.predict(X_final)

output = pd.DataFrame({'PassengerId': id_col, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# Last expression of the cell, so the preview is actually displayed.
output.head()