# Titanic Machine Learning Practice Project
In this practice project, we use machine learning to predict whether a given passenger survived the sinking of the Titanic. This is an ongoing competition on kaggle.com. I did this project to gain knowledge and experience in machine learning and data science. 

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split

## Loading Data

In the code below I load the data and partition it into variables. Our prediction class (`Survived`) is put into the y variable. Our predictor classes are put into X for training data and X_test for the testing data that we will eventually make our final predictions with. PassengerId, Ticket,
and Name are excluded from the training and testing data because these categories are not expected to contribute helpful information for survival prediction. 

I then split the training data into train and validation sets. 

In [2]:
# Read the competition's train and test CSVs
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# Report the dimensions of each dataframe
for label, frame in (('Training Data:\n', train), ('\n\nTesting Data:\n', test)):
    print(label, frame.shape)

# Target: the Survived column of the training data
y = train['Survived']

# Predictors: every column except PassengerId, Survived, Name, and Ticket
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']
X, X_test = train[features], test[features]

# Hold out part of the labeled data for validation; the fixed seed keeps the
# split reproducible across runs
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

Training Data:
 (891, 12)


Testing Data:
 (418, 11)


In [3]:
# Load gender_submission.csv (presumably the competition's sample submission —
# TODO confirm); in this notebook it is only used to inspect its shape.
gs = pd.read_csv('../input/titanic/gender_submission.csv')

In [4]:
# (rows, columns) of the gender_submission.csv frame
gs.shape

(418, 2)

## Data Exploration


In [5]:
# For each split, print its shape and the columns that contain missing values
# together with how many entries are missing
splits = [
    ('Training Data', X_train),
    ('\nValidation Data', X_val),
    ('\nTesting Data', X_test),
]
for heading, split in splits:
    print(heading)
    print(split.shape)
    miss_val_col = split.isnull().sum()
    print(miss_val_col[miss_val_col>0])

Training Data
(668, 8)
Age         132
Cabin       519
Embarked      2
dtype: int64

Validation Data
(223, 8)
Age       45
Cabin    168
dtype: int64

Testing Data
(418, 8)
Age       86
Fare       1
Cabin    327
dtype: int64


In [6]:
# Cabin is missing for most passengers (519 of the 668 training rows), so the
# column is removed from every split instead of being imputed.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because Cabin has already been dropped.
X_train = X_train.drop(columns='Cabin', errors='ignore')
X_val = X_val.drop(columns='Cabin', errors='ignore')
X_test = X_test.drop(columns='Cabin', errors='ignore')

In [7]:
# Identifying categorical and numerical columns
print(X_train.dtypes)

# Treat every non-numeric column as categorical. Selecting by "not a number"
# instead of comparing dtypes to 'object' also catches columns stored with
# pandas' dedicated string dtype, which `dtypes == 'object'` would miss.
cat_cols = X_train.select_dtypes(exclude='number').columns
print('\nUnique values of categorical columns:\n',X_train[cat_cols].nunique())

# Everything that is not categorical is numerical, so the two groups always
# partition the full feature set.
num_cols = X_train.columns.drop(cat_cols)
print('\nNumerical columns:\n',num_cols)

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

Unique values of categorical columns:
 Sex         2
Embarked    3
dtype: int64

Numerical columns:
 Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


## Preprocessing

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: replace missing entries with the column mean
num_preproc = SimpleImputer(strategy='mean')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode with handle_unknown='ignore'
cat_preproc = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own preprocessor
preproc = ColumnTransformer([
    ('num', num_preproc, num_cols),
    ('cat', cat_preproc, cat_cols),
])

## Building model and pipeline

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier: a 100-tree random forest; random_state is fixed so that
# repeated runs produce the same model.
first_model = RandomForestClassifier(n_estimators=100, random_state=42)


In [10]:
# Baseline pipeline: preprocessing followed by the random forest
first_pipeline = Pipeline([
    ('preprocessor', preproc),
    ('model', first_model),
])

# fit() returns the pipeline itself, so training and the validation-set
# prediction can be chained
y_pred = first_pipeline.fit(X_train, y_train).predict(X_val)

In [11]:
# Baseline score: accuracy of the first pipeline's predictions (y_pred)
# against the validation labels (y_val)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred)

0.7802690582959642

## Analyzing for improved parameters

In [12]:
# Method to build pipeline and compute accuracy based on preprocessing and model parameters
def pipe_acc(num_preprocessor,cat_preprocessor,model):
    """Fit a preprocessing + model pipeline and return its validation accuracy.

    The pipeline is trained on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_val`` / ``y_val``; the column groups come from the
    notebook-level ``num_cols`` and ``cat_cols``.

    Parameters
    ----------
    num_preprocessor : transformer
        Applied to the columns listed in ``num_cols``.
    cat_preprocessor : transformer
        Applied to the columns listed in ``cat_cols``.
    model : estimator
        Classifier to evaluate. It is cloned before fitting, so the object
        passed in is left untouched.

    Returns
    -------
    float
        Accuracy of the fitted pipeline's predictions on the validation set.
    """
    column_preproc = ColumnTransformer(transformers=[
        ('num',num_preprocessor,num_cols),
        ('cat',cat_preprocessor,cat_cols)
    ])
    my_pipeline = Pipeline(steps=[
        ('preprocessor',column_preproc),
        # Pipeline.fit trains its final step in place. Fitting a clone keeps
        # the caller's estimator (e.g. first_model, which first_pipeline also
        # holds) from being silently refit on differently preprocessed data.
        ('model',clone(model))
    ])

    my_pipeline.fit(X_train,y_train)
    return accuracy_score(y_val,my_pipeline.predict(X_val))

### Numerical Preprocessors

In [13]:
# Comparing preprocessors for the numerical columns

# Validation accuracy for each SimpleImputer strategy, with the categorical
# preprocessor and the model held fixed
print('Simple Imputer:')
strategy = ['mean','median','most_frequent','constant']
for s in strategy:
    acc = pipe_acc(SimpleImputer(strategy=s), cat_preproc, first_model)
    print(f'{s}: {acc:.3f}')

Simple Imputer:
mean: 0.780
median: 0.771
most_frequent: 0.776
constant: 0.785


In [14]:
from sklearn.impute import KNNImputer

# Grid over KNNImputer settings: every neighbour count paired with every
# weighting scheme, scored by validation accuracy
print('KNN Imputer:')
n_neighbors = [1, 5, 10, 15, 20]
weights = ['uniform', 'distance']
for n in n_neighbors:
    for w in weights:
        knn_imputer = KNNImputer(n_neighbors=n, weights=w)
        acc = pipe_acc(knn_imputer, cat_preproc, first_model)
        print(f'{w} {n} NN: {acc:.3f}')

KNN Imputer:
uniform 1 NN: 0.771
distance 1 NN: 0.771
uniform 5 NN: 0.771
distance 5 NN: 0.767
uniform 10 NN: 0.794
distance 10 NN: 0.780
uniform 15 NN: 0.794
distance 15 NN: 0.776
uniform 20 NN: 0.803
distance 20 NN: 0.780


In [15]:
# Uniform weights with 20 neighbours scored highest in the KNN imputer
# comparison (0.803 validation accuracy), ahead of every SimpleImputer
# strategy (best 0.785), so that setting is used for the final model.
fin_num_preproc = KNNImputer(n_neighbors=20, weights='uniform')

### Categorical Preprocessors

In [16]:
# List the columns present in the raw test dataframe
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Final Model

In [17]:
# Categorical preprocessing: fill gaps with the most frequent value, then
# one-hot encode
fin_cat_preprocessor = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

# Use an unfitted copy of the baseline model (same hyperparameters and seed).
# Reusing first_model itself would refit it in place and leave first_pipeline
# holding a model that no longer matches its own preprocessor.
fin_model = clone(first_model)

fin_preproc = ColumnTransformer(transformers=[
    ('num',fin_num_preproc,num_cols),
    ('cat',fin_cat_preprocessor,cat_cols)
])

fin_pipeline = Pipeline(steps=[
    ('preprocessor',fin_preproc),
    ('model',fin_model)
])

# The validation split was only needed to compare settings. For the
# submission, train on every labeled row (train + validation) so the final
# model does not discard a quarter of the available data.
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

fin_pipeline.fit(X_full,y_full)
pred_test = fin_pipeline.predict(X_test)

In [18]:
# Shape of the test-set prediction array
pred_test.shape

(418,)

## Saving output for submission

In [19]:
# Two-column frame: PassengerId from the test set alongside the predicted
# Survived label, written without the index column
output = test[['PassengerId']].assign(Survived=pred_test)
output.to_csv('submission.csv', index=False)

In [20]:
# Sanity check: read the written submission file back and show its shape
s = pd.read_csv('./submission.csv')
s.shape

(418, 2)