In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

from patsy import dmatrix, dmatrices

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Single StandardScaler instance; it is fitted on the training design matrix
# further down and reused to transform both the train and test matrices.
scaler = StandardScaler(copy=True)

In [4]:
# Load the raw train and test CSVs (paths are relative to the working directory).
train = pd.read_csv('data/raw/train.csv')
test = pd.read_csv('data/raw/test.csv')

Feature Selection/Manipulation

In [5]:
# Per-column dtype and non-null count for the training frame (shows what needs imputing).
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [6]:
# Per-column dtype and non-null count for the test frame (shows what needs imputing).
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB


In [7]:
# Impute missing ages in both frames with the mean age of the training set.
# TODO: averaging within each class would likely give a better estimate.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(train['Age'].mean())

In [8]:
# Fill missing Embarked values in both frames with the most frequent value
# observed in the training set.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().idxmax())
test['Embarked'] = test['Embarked'].fillna(train['Embarked'].value_counts().idxmax())

In [9]:
# Only the test frame has a missing Fare; fill it with the training-set mean fare.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [10]:
# Missing cabins get an explicit placeholder so that "not reported" becomes its
# own category instead of a NaN.
train['Cabin'] = train['Cabin'].fillna('Not-Reported')
test['Cabin'] = test['Cabin'].fillna('Not-Reported')

In [11]:
# Categorical feature: the first character of the Cabin string
# ('N' for the 'Not-Reported' placeholder).
train['CabinPrefix'] = train['Cabin'].str[0]
test['CabinPrefix'] = test['Cabin'].str[0]

In [12]:
# Crude title feature: take the text after the first comma in Name, drop its
# first character, and keep the next four. Longer titles are therefore truncated
# and shorter ones keep trailing characters (levels such as 'Majo', 'Jonk' and
# 'Mr. ' show up in the design-matrix column names).
train['Salutation'] = [full_name.split(',')[1][1:5] for full_name in train['Name']]
test['Salutation'] = [full_name.split(',')[1][1:5] for full_name in test['Name']]

In [13]:
# Expect 13 columns: the 11 raw ones plus CabinPrefix and Salutation.
test.shape

(418, 13)

In [14]:
# Expect 14 columns: the 12 raw ones (including Survived) plus CabinPrefix and Salutation.
train.shape

(891, 14)

In [15]:
# Right-hand side shared by both design matrices; C(...) marks a term as categorical.
feature_terms = 'C(Pclass) + C(Sex) + Age + Fare + C(Embarked) + Parch + C(CabinPrefix) + C(Salutation)'

# Each frame is dummy-coded from its own category levels, so the two matrices
# can end up with different columns.
y, X = dmatrices('Survived ~ ' + feature_terms, train, return_type='dataframe')
X_test = dmatrix(feature_terms, test, return_type='dataframe')

In [16]:
# Width of the test design matrix.
X_test.shape

(418, 24)

In [17]:
# Width of the training design matrix; it need not match the test matrix because
# each was dummy-coded from its own frame's category levels.
X.shape

(891, 33)

Rectify Columns

In [18]:
def ColumnRect(train, test):
    """Give two design matrices the same columns, in the same sorted order.

    Any column present in only one frame is added to the other as an all-zero
    float column (for a dummy-coded level this means "never occurs here").
    Both frames are then ordered by sorted column name so they line up
    position by position.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to reconcile. Neither input is modified, and each keeps its
        own row index.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New ``(train, test)`` frames sharing one sorted column list.
    """
    train_cols = set(train.columns)
    test_cols = set(test.columns)

    # Report the differences in each frame's own column order.
    for col in train.columns:
        if col not in test_cols:
            print('found one for test', col)
            print('col added to test: ', col)

    for col in test.columns:
        if col not in train_cols:
            print('col added to train: ', col)

    # A single reindex per frame adds every missing column (filled with 0.0)
    # and sorts the columns at once. Unlike concatenating a freshly built
    # zero frame, it does not depend on the rows carrying a default 0..n-1
    # index, and it avoids re-copying the frame once per missing column.
    all_cols = sorted(train_cols | test_cols)
    train = train.reindex(columns=all_cols, fill_value=0.0)
    test = test.reindex(columns=all_cols, fill_value=0.0)

    # Both frames were reindexed to the same list, so they match by construction.
    print('Column Rect. Complete')

    return(train, test)

In [20]:
# Align the two design matrices: afterwards both share one sorted column list,
# zero-filled where a category level was absent from that frame.
X, X_test = ColumnRect(X, X_test)

found one for test C(CabinPrefix)[T.T]
col added to test:  C(CabinPrefix)[T.T]
found one for test C(Salutation)[T.Col.]
col added to test:  C(Salutation)[T.Col.]
found one for test C(Salutation)[T.Don.]
col added to test:  C(Salutation)[T.Don.]
found one for test C(Salutation)[T.Jonk]
col added to test:  C(Salutation)[T.Jonk]
found one for test C(Salutation)[T.Lady]
col added to test:  C(Salutation)[T.Lady]
found one for test C(Salutation)[T.Majo]
col added to test:  C(Salutation)[T.Majo]
found one for test C(Salutation)[T.Mlle]
col added to test:  C(Salutation)[T.Mlle]
found one for test C(Salutation)[T.Mme.]
col added to test:  C(Salutation)[T.Mme.]
found one for test C(Salutation)[T.Sir.]
col added to test:  C(Salutation)[T.Sir.]
found one for test C(Salutation)[T.the ]
col added to test:  C(Salutation)[T.the ]
col added to train:  C(Salutation)[T.Dona]
Column Rect. Complete


In [21]:
# Fit the scaler on the training design matrix, standardize it, and wrap the
# resulting array back into a DataFrame carrying the original column labels.
Xsc = pd.DataFrame(scaler.fit(X).transform(X), columns=X.columns)

In [22]:
# Standardize the test matrix with a scaler fitted on the training matrix only,
# then restore the column labels (identical for both matrices after alignment).
Xsc_test = pd.DataFrame(scaler.fit(X).transform(X_test), columns=X.columns)

In [23]:
Xsc.shape

(891, 34)

In [24]:
Xsc_test.shape

(418, 34)

In [46]:
# Reset the Intercept column to a constant 1.0 in both standardized frames
# (presumably because standardizing a constant column does not leave it at 1 —
# verify). Scalar item assignment sizes itself to the frame being modified and
# always targets the column, never a plain Python attribute.
Xsc['Intercept'] = 1.0
Xsc_test['Intercept'] = 1.0

In [56]:
# Logistic regression tuned by 10-fold cross-validated grid search over the
# regularization strength C (10 log-spaced values from 1e-4 to 1e4) and the
# penalty type.
# The solver is pinned to 'liblinear' (the one shown in the fitted estimator's
# repr) so that the 'l1' candidates in the grid stay valid regardless of the
# library's default solver.
lr = LogisticRegression(fit_intercept = True, solver = 'liblinear', n_jobs = -1)
parameters = {'C': np.logspace(-4, 4, 10), 'penalty' : ['l1', 'l2']}
clf = GridSearchCV(lr, parameters, cv = 10, verbose=False)

In [57]:
# Run the grid search on the standardized training matrix; y is a one-column
# frame, so its Survived column is passed as the 1-D target.
clf.fit(Xsc, y.Survived)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-04,   7.74264e-04,   5.99484e-03,   4.64159e-02,
         3.59381e-01,   2.78256e+00,   2.15443e+01,   1.66810e+02,
         1.29155e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=False)

In [58]:
# Selected hyper-parameter combination and its cross-validated score.
clf.best_params_, clf.best_score_

({'C': 0.35938136638046259, 'penalty': 'l1'}, 0.82267115600448937)

In [59]:
# Predict labels for the standardized test matrix using the fitted grid search.
submit = clf.predict(Xsc_test)

In [60]:
# Peek at the standardized test matrix that was passed to predict().
Xsc_test.head()

Unnamed: 0,Age,C(CabinPrefix)[T.B],C(CabinPrefix)[T.C],C(CabinPrefix)[T.D],C(CabinPrefix)[T.E],C(CabinPrefix)[T.F],C(CabinPrefix)[T.G],C(CabinPrefix)[T.N],C(CabinPrefix)[T.T],C(Embarked)[T.Q],...,C(Salutation)[T.Mr. ],C(Salutation)[T.Mrs.],C(Salutation)[T.Ms. ],C(Salutation)[T.Rev.],C(Salutation)[T.Sir.],C(Salutation)[T.the ],C(Sex)[T.male],Fare,Intercept,Parch
0,0.369449,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,3.251373,...,0.850532,-0.403962,-0.03352,-0.082339,-0.03352,-0.03352,0.737695,-0.490783,1,-0.473674
1,1.331378,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,-0.307562,...,-1.175735,2.47548,-0.03352,-0.082339,-0.03352,-0.03352,-1.355574,-0.507479,1,-0.473674
2,2.485693,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,3.251373,...,0.850532,-0.403962,-0.03352,-0.082339,-0.03352,-0.03352,0.737695,-0.453367,1,-0.473674
3,-0.207709,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,-0.307562,...,0.850532,-0.403962,-0.03352,-0.082339,-0.03352,-0.03352,0.737695,-0.474005,1,-0.473674
4,-0.592481,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,-0.307562,...,-1.175735,2.47548,-0.03352,-0.082339,-0.03352,-0.03352,-1.355574,-0.401017,1,0.76763


In [61]:
# Build the submission table in one step: one row per test passenger, indexed
# by PassengerId, with the predicted label as an integer `Survived` column.
# Ids and predictions are paired positionally, so the result does not depend
# on which row index `test` happens to carry.
submission = pd.DataFrame(
    {'Survived': np.asarray(submit).astype(int)},
    index=pd.Index(test.PassengerId.values, name='PassengerId'),
)

In [62]:
# Preview the submission table.
submission.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [63]:
# Show the estimator configured with the selected hyper-parameters.
clf.best_estimator_

LogisticRegression(C=0.35938136638046259, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [64]:
# Write the submission to the working directory; the PassengerId index is
# emitted as the first CSV column.
submission.to_csv('QP_titanicSubmit.csv')

In [67]:
# Preview the standardized training matrix with its columns in X's column order.
Xsc.reindex(columns=X.columns).head()

Unnamed: 0,Age,C(CabinPrefix)[T.B],C(CabinPrefix)[T.C],C(CabinPrefix)[T.D],C(CabinPrefix)[T.E],C(CabinPrefix)[T.F],C(CabinPrefix)[T.G],C(CabinPrefix)[T.N],C(CabinPrefix)[T.T],C(Embarked)[T.Q],...,C(Salutation)[T.Mr. ],C(Salutation)[T.Mrs.],C(Salutation)[T.Ms. ],C(Salutation)[T.Rev.],C(Salutation)[T.Sir.],C(Salutation)[T.the ],C(Sex)[T.male],Fare,Intercept,Parch
0,-0.592481,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,-0.307562,...,0.850532,-0.403962,-0.03352,-0.082339,-0.03352,-0.03352,0.737695,-0.502445,1,-0.473674
1,0.638789,-0.235981,3.755222,-0.196116,-0.193009,-0.121681,-0.067153,-1.835115,-0.03352,-0.307562,...,-1.175735,2.47548,-0.03352,-0.082339,-0.03352,-0.03352,-1.355574,0.786845,1,-0.473674
2,-0.284663,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,-0.307562,...,-1.175735,-0.403962,-0.03352,-0.082339,-0.03352,-0.03352,-1.355574,-0.488854,1,-0.473674
3,0.407926,-0.235981,3.755222,-0.196116,-0.193009,-0.121681,-0.067153,-1.835115,-0.03352,-0.307562,...,-1.175735,2.47548,-0.03352,-0.082339,-0.03352,-0.03352,-1.355574,0.42073,1,-0.473674
4,0.407926,-0.235981,-0.266296,-0.196116,-0.193009,-0.121681,-0.067153,0.544925,-0.03352,-0.307562,...,0.850532,-0.403962,-0.03352,-0.082339,-0.03352,-0.03352,0.737695,-0.486337,1,-0.473674


In [66]:
# NOTE(review): `Xsort` is not assigned anywhere in the visible notebook and this
# cell raises NameError when executed; it looks like leftover scratch — confirm
# and remove.
Xsort

NameError: name 'Xsort' is not defined

In [34]:
# NOTE(review): this cell's execution count (In [34]) is lower than the cells
# above it, so it looks like a leftover from an earlier session. Run
# top-to-bottom it rebinds `submission`, after the CSV has already been written,
# to a frame holding PassengerId next to the raw predictions — confirm and remove.
submission = pd.concat([pd.DataFrame(test.PassengerId), pd.DataFrame(submit)], axis = 1)

In [41]:
# Row/column count of the current `submission` frame.
submission.shape

(418, 1)

In [141]:
# Row/column count of the test frame.
test.shape

(418, 13)