# Kaggle Titanic - Logistic Regression Deep Dive

After my first test submission, I'm going to do deep dives on a variety of learners to get a handle on Scikit-learn and see how far I can get on the leaderboard before ensembling them all.

## Import Libraries

In [68]:
import numpy as np
import os
import re
import pandas as pd
from patsy import dmatrices
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn import metrics, preprocessing

In [69]:
# Directory that holds train.csv and test.csv. The default is the original author's local path;
# set the TITANIC_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', '/Users/rjf/Desktop/projects/kaggle/tuts/titanic/data/')

# The working directory is switched (not just used as a read prefix) because the submission
# file is later written with a relative path and should land next to the data.
print(os.getcwd())
os.chdir(DATA_DIR)
print(os.getcwd())

# Load train and test datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the test passenger IDs: the PassengerId column is dropped from the feature frames
# before modelling, but the submission file needs it.
PassengerId = test['PassengerId']

/Users/rjf/Desktop/projects/kaggle/tuts/titanic/data
/Users/rjf/Desktop/projects/kaggle/tuts/titanic/data


In [70]:
# Peek at the first five raw training rows (head() defaults to five rows)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
# Keep both frames in one list so the feature-engineering loop can apply identical steps to each.
# The list holds references, so changes made through it modify train and test themselves.
all_data = [train, test]

In [72]:
# From https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
# Define function to extract titles from passenger names
def get_title(name):
    """Extract the title (e.g. 'Mr', 'Mrs', 'Countess') from a passenger name.

    The title is taken to be the first run of ASCII letters that is preceded by a
    space and immediately followed by a period.

    Parameters
    ----------
    name : str
        Passenger name, e.g. "Braund, Mr. Owen Harris".

    Returns
    -------
    str
        The matched title without the trailing period, or "" when the name
        contains no such pattern.
    """
    # Raw string: '\.' inside a normal string literal is an invalid escape sequence,
    # which newer Python versions warn about. The compiled pattern is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [73]:
# NOTE: this cell mutates train and test in place and is not re-runnable on its own
# (e.g. the Sex mapping yields NaN on already-encoded values); reload the CSVs before re-running.

# Take the training-set fare median BEFORE the loop. The loop overwrites train['Fare'] with
# bin codes 0-3, so reading the median inside the loop would give the test set the median of
# the bin codes instead of the median fare.
train_fare_median = train['Fare'].median()

# Infrequent titles are pooled into a single 'Rare' level; spelling variants are merged.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# Age used for passengers with a missing Age, keyed by (recoded) title.
# Values from https://www.kaggle.com/ash316/eda-to-prediction-dietanic, awesomely exhaustive EDA example
title_default_age = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Rare': 46}

# Integer codes for the titles; anything not listed here is coded 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Dedicated, seeded generator so the random age fallback is reproducible across runs.
age_rng = np.random.RandomState(0)

for dataset in all_data:
    # 0/1 indicator: was a cabin recorded for this passenger?
    dataset['hasCabin'] = dataset['Cabin'].notnull().astype(int)

    # Add feature familySize = Siblings + Parch + 1 (for the passenger)
    dataset['familySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Add feature isAlone from familySize reflecting if a passenger is traveling w/o family (0/1 indicator)
    dataset['isAlone'] = (dataset['familySize'] == 1).astype(int)

    # Remove NULLS from the embarked col
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    # Replace NULLS in Fare with the median Fare of the training data (to avoid look-ahead bias)
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

    # Add Title to each dataset, then recode rare titles and spelling variants
    dataset['Title'] = dataset['Name'].apply(get_title)
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(title_synonyms)

    # For the many missing instances of Age, impute the per-title age.
    # The rare bucket is keyed 'Rare' to match the recoding above.
    for title, default_age in title_default_age.items():
        dataset.loc[dataset['Age'].isnull() & (dataset['Title'] == title), 'Age'] = default_age

    # Mapping titles to integer codes
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0).astype(int)

    # Fallback for ages still missing (rows whose title matched none of the keys above):
    # draw a random integer age within one standard deviation of the mean.
    age_missing = dataset['Age'].isnull()
    age_null_count = age_missing.sum()
    if age_null_count:
        age_avg = dataset['Age'].mean()
        age_std = dataset['Age'].std()
        # randint works on integer bounds, so cast them explicitly
        age_null_random_list = age_rng.randint(int(age_avg - age_std), int(age_avg + age_std), size=age_null_count)
        # .loc assignment instead of chained indexing, which raised SettingWithCopyWarning
        dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
    # for EDA only: train['ageQuintile'] = pd.cut(train['Age'], 5)

    # Encode categoricals as integers

    # Map sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

    # Map embarked
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Bin Fare into four ordinal categories (0-3).
    # NOTE(review): cut points are presumably the training-set fare quartiles - confirm.
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Bin Age into five ordinal categories (0-4) of 16 years each.
    # The rules run in ascending order, so a freshly written code is never re-binned.
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [74]:
# Check the engineered and encoded columns on the first five test rows (head() defaults to five)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,hasCabin,familySize,isAlone,Title
0,892,3,"Kelly, Mr. James",1,2,0,0,330911,0,,2,0,1,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,2,1,0,363272,0,,0,0,2,0,3
2,894,2,"Myles, Mr. Thomas Francis",1,3,0,0,240276,1,,2,0,1,1,1
3,895,3,"Wirz, Mr. Albert",1,1,0,0,315154,1,,0,0,1,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,1,1,1,3101298,1,,0,0,3,0,3


In [75]:
# Columns removed before modelling. Name, Cabin and SibSp were already used to derive
# Title, hasCabin and familySize; PassengerId and Ticket are not used as predictors.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']

# Apply the same column drop to both frames and rebind the names to the trimmed copies
train, test = (frame.drop(drop_elements, axis=1) for frame in (train, test))

In [76]:
# Confirm only the response and the numeric predictors remain (head() defaults to five rows)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,hasCabin,familySize,isAlone,Title
0,0,3,1,1,0,0,0,0,2,0,1
1,1,1,0,2,0,3,1,1,2,0,3
2,1,3,0,1,0,1,0,0,1,1,2
3,1,1,0,2,0,3,0,1,2,0,3
4,0,3,1,2,0,1,0,0,1,1,1


In [98]:
# Build the response and predictor frames from the training data with a patsy formula.
# The formula is assembled from a list so predictors can be added or removed in one place.
predictors = ['Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked',
              'hasCabin', 'familySize', 'isAlone', 'Title']
formula = 'Survived ~ ' + ' + '.join(predictors)

y_train, X_train = dmatrices(formula, train, return_type="dataframe")

# Show which columns ended up on each side of the formula
print(X_train.columns)
print(y_train.columns)

Index(['Intercept', 'Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked',
       'hasCabin', 'familySize', 'isAlone', 'Title'],
      dtype='object')
Index(['Survived'], dtype='object')


In [99]:
# scikit-learn expects the response as a flat 1-D array rather than a one-column frame
y_train = np.asarray(y_train).ravel()
print(type(y_train))

<class 'numpy.ndarray'>


In [100]:
# Fit a logistic regression with scikit-learn's default settings (fit() returns the estimator)
model = LogisticRegression().fit(X_train, y_train)

# Accuracy on the same rows the model was trained on, so an optimistic estimate
model.score(X_train, y_train)

0.81481481481481477

In [101]:
# Mean of the flattened Survived response: the baseline to read the training accuracy against
y_train.mean()

0.38383838383838381

In [102]:
# Checking coefficients: pair each design-matrix column with its fitted weight.
# zip() is lazy in Python 3, so the pairs are materialised into a list before DataFrame sees them.
# coef_ is transposed so that iterating over it yields one entry per feature.
feature_weights = [(column, weight) for column, weight in zip(X_train.columns, model.coef_.T)]
pd.DataFrame(feature_weights)

Unnamed: 0,0,1
0,Intercept,[1.15016014633]
1,Pclass,[-0.526525677717]
2,Sex,[-2.07115251523]
3,Age,[-0.59368147554]
4,Parch,[0.216291737465]
5,Fare,[0.270586904942]
6,Embarked,[0.249708848603]
7,hasCabin,[0.806567768608]
8,familySize,[-0.552654632946]
9,isAlone,[-0.271849411433]


In [127]:
#TODO - Generate predictions + probabilities here
#print(metrics.accuracy_score(y_train, preds))
#print(metrics.roc_auc_score(y_test, probs[:, 1]))

## Test Set Predictions

Predict class labels for the test set and class probabilities. First we have to drop Survived from the Training data so the matrix sizes will match.

In [116]:
# Re-Run model as model2

# Convert train to a NumPy array after dropping the Survived response column.
# .values replaces DataFrame.as_matrix(), which is deprecated and removed in pandas 1.0.
X_train = train.drop('Survived', axis=1).values

# Convert test set to a NumPy array (it has no Survived column, so the widths now match)
X_test = test.values
print(type(X_test))
print(type(X_train))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [130]:
# Second fit, on the plain feature array built in the previous cell.
# class_weight='balanced' is the setting tried here, following
# https://stackoverflow.com/questions/28716241/controlling-the-threshold-in-logistic-regression-in-scikit-learn
model2 = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Training-set accuracy of the class-weighted model
model2.score(X_train, y_train)

0.80471380471380471

In [131]:
# Predicted class label for every row of the test feature array
preds = model2.predict(X_test)
#print(preds)

In [132]:
# Per-class predicted probabilities for the test rows (kept for inspection; not written to the submission)
probs = model2.predict_proba(X_test)
#print(probs)

In [133]:
# Generate submission file
# The predictions come back as floats, so cast them to int and write Survived as integer labels.
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": np.asarray(preds).astype(int)
    })

# Written relative to the current working directory; index=False keeps the row index out of the file
submission.to_csv('titanic_logsub_20171207_v2.csv', index=False)
# v1 - Successful submission increased score to 78% over 77%, jumping 1300+ places on the board by using simple log reg over the ensemble
# v2 - balancing classes made things worse :(

In [None]:
# Next - try this: https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python