# Titanic Survival - Exploration + Baseline Model

This is a simple notebook covering data exploration and a baseline model to predict who survived the sinking of the Titanic.

## **Contents**   
[1. Set Environment, Define  Functions, Load Data](#1)  
[2. Examine Basic Structure of Data set and Features](#2)  
&nbsp;&nbsp;&nbsp;&nbsp; [2.1 Basic Dataframe Info](#2.1)  
&nbsp;&nbsp;&nbsp;&nbsp; [2.2 Feature Values](#2.2)  
[3. Hot-Code Categorical Data](#3)  
[4. RF Check of Feature Importance](#4)  
[5. Create Test Data Set](#5)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.1 Check working directory files](#5.1)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.2 Remove any desired working directory files](#5.2)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.3 Remove any desired working directory files](#5.3)  
[6. Impute Missing Feature Values](#6)  
[7. Treat Feature Imbalances](#7)


&nbsp;&nbsp;&nbsp;&nbsp; [5.1 Number of Relatives & CabinYN](#5.1)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.2 Titles (from Name)](#5.2)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.3 Age Categories](#5.3)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.4 Fare Categories](#5.4)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.5 Age | Class](#5.5)  
&nbsp;&nbsp;&nbsp;&nbsp; [5.6 Fare/Person](#5.6)  
[6. 2nd Data Exploration](#6)  
[7. Feature Engineering](#7)  
[8. 3rd Data Exploration](#8)  
[9. Model Testing](#9)  
[10. ???](#10)

## <a id="1">1. Initialize </a>

[//]: # (This syntax works like a comment, and won't appear in any output.)

In [None]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import lightgbm as lgb

## Function to Hot-Code a categorical variable
    # Takes as parameters 1) a dataframe 2) a string variable with the column name to recode
    # Leaves intact the initial variable that was recoded

def HotC(dframe, col):
    """Hot-code one categorical column and append the indicator columns.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame holding the column to recode.
    col : str
        Name of the column to recode; it is also used as the prefix of the
        generated indicator columns (``<col>_<value>``).

    Returns
    -------
    pandas.DataFrame or None
        A new frame made of every column of ``dframe`` (the recoded column
        included) followed by the indicator columns. When an argument has
        the wrong type an error message is printed and None is returned.
    """
    argument_checks = (
        (isinstance(dframe, pd.DataFrame),
         '!!ERROR!! The first variable in the HotC function must be a dataframe'),
        (isinstance(col, str),
         '!!ERROR!! The second variable in the HotC function must be a string representing a column in the dataframe'),
    )
    for is_valid, complaint in argument_checks:
        if not is_valid:
            print(complaint)
            return None

    indicators = pd.get_dummies(dframe[col], prefix=col)
    return pd.concat([dframe, indicators], axis=1)

print('Modules and User Defined Functions set')

# Folder holding the Titanic competition files
path = '../input/titanic-machine-learning-from-disaster/'

# Read the train and test splits
train_df = pd.read_csv(f"{path}train.csv")
test_df = pd.read_csv(f"{path}test.csv")

# Save untouched copies of both frames so they can be downloaded
train_df.to_csv('train_df_raw.csv', index=False)
test_df.to_csv('test_df_raw.csv', index=False)

# Column header names of each frame, as plain Python lists
train_colnames = train_df.columns.values.tolist()
test_colnames = test_df.columns.values.tolist()
print("Data Loaded")



## <a id="2">2.  Initial Data Exploration </a> 
### &nbsp;&nbsp;  <a id="2.1">2.1  Basic DataFrame Info </a>


In [None]:
## Basic DataFrame structure
# Widen the display so printed frames are not truncated
pd.options.display.max_columns=20   ## Maximum number of columns to show
pd.options.display.max_rows=1000     ## Maximum number of rows to show

##     Display train_df
print('BASIC STRUCTURE: train_df')
train_df.info()
print('SAMPLE DATA: train_df')
print(train_df.head(5))

##    Display test_df
print('\n','\n','============================================')
print('BASIC STRUCTURE: test_df')
test_df.info()
print('SAMPLE DATA: test_df')
# Show the test frame here (this section previously repeated the train sample)
print(test_df.head(5))



### &nbsp;&nbsp;  <a id="2.2">2.2  Check Feature Values </a>

In [None]:
## Value counts for features
# Features with fewer than 8 distinct values get a full frequency table;
# for every other feature only the number of distinct values is reported.
# (The dtype test on the removed `np.object` alias and the unreachable
# binned branch are gone; the printed output is the same.)
print('Value Counts for Dataset Features','\n')
for elem in train_df.columns.values:
    print(elem)
    n_unique = train_df[elem].nunique()
    if n_unique < 8:
        print(train_df[elem].value_counts())
    else:
        print('Unique values:', n_unique)
    print('\n')

### &nbsp;&nbsp;  <a id="2.3">2.3  Check Distributions of Numerical Features </a>

In [None]:
## Generate distributions of numerical features
train_df.hist(bins=50, figsize=(20,15))
plt.show()   # call show() so the figure is rendered before the caption is printed
print('Distributions of train_df')

## <a id="3">4. Create Validation Data Set </a>
### &nbsp;&nbsp;  <a id="4.1">4.1  Check files in working directory </a>

In [None]:
## Module to let the user confirm (y/n) before removing data files and
## creating the validation set; also lists the working directory
import os.path


def _ask_yes_no(prompt):
    """Prompt until the reply starts with "y" or "n"; return that letter.

    Each reply is read exactly once, so no answer is silently discarded.
    An empty reply is treated as invalid.
    """
    while True:
        reply = input(prompt).lower().strip()[:1]
        if reply in ("y", "n"):
            return reply
        print('invalid')


def question(q):
    """Prompt with *q* until the reply starts with "y", then return True."""
    while True:
        # [:1] rather than [0] so that an empty reply just re-prompts
        if input(q).lower().strip()[:1] == "y":
            return True


confirmation = _ask_yes_no("Are you sure? (y/n): ")
print('T')   # a valid y/n reply has been captured

fileList = os.listdir("../working/")
print(fileList)

In [None]:
## Take straight percentage of the data set as a validation set
#  To avoid generating a new set every time, save the validation set or set the random_state=# parameter
#  Or, you can save the train and validation sets and reload them next time
from sklearn.model_selection import train_test_split
import os.path
def errorAlert():
    """Tell the user the split files already exist and show what is on disk.

    Prints an explanatory notice, then the entries of the parent directory
    ("../") as a single list, then the entries of "../working/" one per line.
    """
    notice = '''validation_set.csv and train_set.csv already exist.
    No new file will be created. To create a new validation file
    delete validation_set.csv AND train_set.csv in the working direcotry and rerun this module.
    Current directories & files in the working directory for this Notebook are:\n'''
    print(notice)

    print('NOTEBOOK DIRECTORIES')
    print(os.listdir("../"))

    print('\nWORKING DIRECTORY FILES')
    for entry in os.listdir("../working/"):
        print(entry)
def selectPerc(perc):
    """Split ``train_df`` into train / validation sets and save them as CSV.

    Parameters
    ----------
    perc : float
        Share of ``train_df`` held out for validation; passed to
        ``train_test_split`` as ``test_size``.

    Side effects
    ------------
    Prints the structure of both new sets and writes ``test_set.csv``,
    ``train_set.csv`` and ``validation_set.csv`` to the current working
    directory. All three now carry the ``.csv`` extension, matching the
    file names the existence check and ``errorAlert`` refer to.
    """
    # Fixed random_state so that re-running produces the same split
    train_set, validation_set = train_test_split(train_df, test_size=perc, random_state=42)

    # DataFrame.info() prints by itself and returns None, so it is not wrapped in print()
    print('NEWLY CREATED train_set')
    train_set.info()
    print('NEWLY CREATED validation_set')
    validation_set.info()

    # To keep naming conventions similar, expose the test data as test_set
    test_set = test_df

    # Save the newly created datasets
    test_set.to_csv('test_set.csv', index=False)
    train_set.to_csv('train_set.csv', index=False)
    validation_set.to_csv('validation_set.csv', index=False)
    
    
# Only create the split when neither output file is present yet, so an
# existing train / validation split is never overwritten.
# (Both operands used to test validation_set.csv; the second now tests train_set.csv.)
if os.path.isfile('../working/validation_set.csv') or os.path.isfile('../working/train_set.csv'):
    errorAlert()   # files exist: explain and list the directory contents
else:
    selectPerc(0.2)  # no files yet: hold out 20% as the validation set


## <a id="3">3.  Hot-Code Categorical Data </a> 

In [None]:
## Hot Code All Categorical Data
# Work on NaN-free versions of the data; dropna() returns new frames, so
# train_df / test_df themselves are left untouched.
hc_train=train_df.dropna()
hc_test=test_df.dropna()
hc_data=[hc_train,hc_test]


def _hot_code_text_columns(frame):
    """Return *frame* with its text (object dtype) columns made numeric.

    Text columns with fewer than 8 distinct values are replaced by their
    HotC indicator columns; all other text columns are dropped. Non-text
    columns are left as they are. Progress is printed per column.
    """
    # Iterate over a snapshot of the original columns: the indicator
    # columns added along the way must not be visited.
    for elem in list(frame.columns):
        # Plain `object` replaces the `np.object` alias, which NumPy removed
        if frame[elem].dtype != object:
            continue
        if frame[elem].nunique() < 8:
            frame = HotC(frame, elem).drop([elem], axis=1)
            print(elem,'hot-coded')
        else:
            frame = frame.drop([elem], axis=1)
            print('DROPPED',elem)
    return frame


hc_train = _hot_code_text_columns(hc_train)
hc_test = _hot_code_text_columns(hc_test)

# PassengerId is deliberately kept in both frames.

# info() prints by itself and returns None, so it is not wrapped in print()
hc_train.info()
hc_test.info()



In [None]:
## Apply Random Forest to all features for an idea of feature importance,
## with Logistic Regression as a second reference model.
# NOTE: both scores are accuracy measured on the training rows themselves.

# Define training / testing dataframes
X_train = hc_train.drop("Survived", axis=1)
Y_train = hc_train["Survived"]
# hc_train and hc_test are hot-coded independently, so their indicator
# columns can differ; give the test frame exactly the training columns
# (indicator columns absent from the test data are filled with 0).
X_test  = hc_test.reindex(columns=X_train.columns, fill_value=0)

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

results = pd.DataFrame({
    'Model': ['Logistic Regression','Random Forest'],
    'Score': [acc_log,acc_random_forest]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')

# Importances come from the fitted forest (the name `rf` was never defined)
feature_importances = pd.DataFrame(random_forest.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance',ascending=False)
print(result_df)
print(feature_importances)


## <a id="4">4.  Convert Features to Floating Point (or Drop) </a> 

In [None]:
## Fill missing Fare values with 0 and convert Fare from float to int using 'astype()'
data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0).astype(int)

## Hot-Code Sex feature (adds Sex_<value> columns), then drop the text column
train_df=HotC(train_df,'Sex')
train_df=train_df.drop(['Sex'], axis=1)

test_df=HotC(test_df,'Sex')
test_df=test_df.drop(['Sex'], axis=1)

## Hot-Code Embarked and re-name columns
train_df=HotC(train_df,'Embarked')
test_df=HotC(test_df,'Embarked')

## Rename Embarked Hot-codes
# HotC prefixes the indicator columns with the source column name, so the
# columns to rename are Embarked_C / Embarked_Q / Embarked_S (not C / Q / S).
# NOTE(review): index=str also converts the row labels to strings; kept
# unchanged for backward compatibility -- confirm whether it is wanted.
embarked_names = {'Embarked_C':'Emb_C','Embarked_Q':'Emb_Q','Embarked_S':'Emb_S'}
train_df=train_df.rename(index=str, columns=embarked_names)
test_df=test_df.rename(index=str, columns=embarked_names)

## Drop Embarked Column
train_df=train_df.drop(['Embarked'], axis=1)
test_df=test_df.drop(['Embarked'], axis=1)

## Cabin is intentionally NOT dropped here.

print('Converted Values')
print('Train_df')
train_df.info()
print(train_df.head(10))
print('test_df')
test_df.info()
print(test_df.head(10))



## <a id="5">5.  Feature Engineering </a> 
### &nbsp;&nbsp;  <a id="5.1">5.1  Number of Relatives  & CabinYN</a>

In [None]:
## Features for the number of relatives on board, whether the passenger
## travels with company, and whether a cabin number is recorded
data = [train_df, test_df]
for dataset in data:
    # Relatives = siblings/spouses + parents/children
    dataset['Relatives'] = dataset['SibSp'] + dataset['Parch']
    # 1 when at least one relative is on board, 0 when travelling alone
    # (the flag used to be coded the other way round, contradicting its name)
    dataset['Not_alone'] = (dataset['Relatives'] > 0).astype(int)
    # Code existence of a cabin number as 0/1
    dataset['CabinYN'] = dataset['Cabin'].notnull().astype(int)

## Look at relationships with Survived
# x / y / hue are passed by keyword (current seaborn no longer accepts them
# positionally), and the sex indicator created by HotC is named 'Sex_male'.
g = sns.catplot(x="Not_alone", y="Survived", hue='Sex_male', col='Pclass', col_wrap=3, data=train_df, kind="bar", height=6, palette="autumn", aspect=.8, legend=True)
print('IMPACT OF TRAVEL COMPANIONS AND KNOWING CABIN #')
g = sns.catplot(x="Relatives", y="Survived", col='Sex_male', col_wrap=2, data=train_df, kind="bar", height=6, palette="autumn", aspect=.8, legend=True)
g = sns.catplot(x="CabinYN", y="Survived", col='Sex_male', col_wrap=2, data=train_df, kind="bar", height=6, palette="autumn", aspect=.8, legend=True)
plt.show()

In [None]:


### &nbsp;&nbsp;  <a id="2.4">2.4  Missing Data Imputations </a>

In [None]:
## Age and Embarked missing values
# Missing ages are replaced by random whole numbers drawn from
# [mean - std, mean + std) of the training ages.
embarked_common_value='S'
data = [train_df, test_df]

# Take both statistics from the training set, once, before anything is
# filled, so train and test are imputed from the same range.
age_mean = train_df["Age"].mean()
age_std = train_df["Age"].std()

for dataset in data:
    missing_age = dataset["Age"].isnull()
    # One random age per missing value; the bounds are truncated to integers
    rand_age = np.random.randint(int(age_mean - age_std), int(age_mean + age_std), size=missing_age.sum())
    dataset.loc[missing_age, "Age"] = rand_age
    # Convert each dataset's OWN ages (the test ages used to be overwritten
    # with the training ages here)
    dataset["Age"] = dataset["Age"].astype(int)

    # Fill the missing Embarked values with the most common port.
    # Skipped when the column has already been hot-coded and dropped.
    if 'Embarked' in dataset.columns:
        dataset['Embarked'] = dataset['Embarked'].fillna(embarked_common_value)

# Check for remaining missing values in Age and Embarked
print('Sum of missing values in Age and Embarked')
for frame_name, dataset in (('train_df', train_df), ('test_df', test_df)):
    checked_cols = [c for c in ('Age', 'Embarked') if c in dataset.columns]
    print(frame_name)
    print(dataset[checked_cols].isnull().sum())

train_df.info()
test_df.info()

In [None]:



# Print the structure of the train frame, then of the test frame
for frame in (train_df, test_df):
    frame.info()


### &nbsp;&nbsp;  <a id="2.6">2.6  Feature Extraction </a>

In [None]:
## Use the Name feature to extract the passenger's title and build a new numeric feature
data = [train_df, test_df]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Uncommon titles that are pooled into a single 'Rare' group
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings mapped onto the common title
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in data:
    # The title is the word directly followed by a full stop; a raw string
    # is used so that `\.` is not an invalid string escape
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Replace titles with a more common title or with 'Rare'
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(title_synonyms)
    # Convert titles into numbers; titles missing from `titles` become NaN
    dataset['Title'] = dataset['Title'].map(titles)
    # ... and those are coded as 0
    dataset['Title'] = dataset['Title'].fillna(0)
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)






## &nbsp;&nbsp;  <a id="5.1">5.1  Remove any Unwanted Features</a>

In [None]:
# Drop the remaining free-text columns from both data sets:
#   Ticket - not used as a feature
#   Cabin  - already summarised by the CabinYN flag; left in place, its text
#            values and NaNs would make the model fits fail
train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)

# Name has already been removed after title extraction.
# PassengerId is kept in both frames.

In [None]:
## Bucket Age into ordinal categories
# Right-closed age bands: up to 11 -> 0, 12-18 -> 1, 19-22 -> 2, 23-27 -> 3,
# 28-33 -> 4, 34-40 -> 5, and every age above 40 -> 6.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
data = [train_df, test_df]
for dataset in data:
    whole_years = dataset['Age'].astype(int)
    dataset['Age'] = pd.cut(whole_years, bins=age_edges, labels=False).astype(int)

# let's see how it's distributed
print('distribution of train_df')
train_df['Age'].value_counts()


In [None]:
## Bucket Fare into ordinal categories
# Right-closed fare bands: up to 7.91 -> 0, up to 14.454 -> 1, up to 31 -> 2,
# up to 99 -> 3, up to 250 -> 4, anything higher -> 5.
fare_edges = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]
data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=False).astype(int)
    


In [None]:
## Create some additional variables
data = [train_df, test_df]
for dataset in data:
    # Age X Class
    dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']

    # Fare per Person: fare shared by the passenger and their relatives.
    # The column is named 'Relatives' (capital R) where it is created.
    dataset['Fare_Per_Person'] = dataset['Fare'] / (dataset['Relatives'] + 1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)

# Take a last look at both data sets before training the models
print(train_df.head(10))
print(test_df.head(10))

In [None]:
# Look at correlation between key variables
#
# Earlier spot checks, kept for reference (disabled):
#   scipy.stats.pearsonr(train_df['Pclass'].values, train_df['Relatives'].values)[0]
#   train_df.corr().loc['Pclass', 'Relatives']
#   train_df.corr().loc['Relatives', 'Age']

## List with all variable names
col_name=list(train_df.columns.values)

# Restrict to numeric / boolean columns: pandas 2.0+ no longer silently
# skips non-numeric columns in DataFrame.corr()
cor_dataset=train_df[col_name].select_dtypes(include=['number','bool'])
corr_matrix = cor_dataset.corr()   # computed once, reused for table and plot
print('Correlation Matrix')
print(corr_matrix)
sns.heatmap(corr_matrix)
plt.show()

In [None]:
print("Results")
## Fit several models to compare effectiveness
# NOTE: every score below is accuracy measured on the training data itself,
# so it shows how well each model fits, not how well it generalises.
# Y_pred is overwritten by each model in turn; only the Random Forest
# predictions are kept separately (Y_prediction).

# Define training / testing dataframes
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df

# SGD - Stochastic Gradient Descent
sgd = linear_model.SGDClassifier(max_iter=50, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

# K Nearest Neighbor
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)

# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)

# Perceptron
perceptron = Perceptron(max_iter=10)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)

# Linear Support Vector Machine
linear_svc = LinearSVC(max_iter=2000)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

# TODO: LightGBM is not part of the comparison yet. The disabled draft that
# stood here called fit() on the lightgbm module itself and referred to the
# undefined names x_train and lgm_light, so it could not run as written.

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

# Rank the models by training accuracy, best first
results = pd.DataFrame({
    'Model': ['LinearSVC','KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 
              'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

  
    