In [12]:
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

Populating the interactive namespace from numpy and matplotlib


In [13]:
# --------------------------- load train data
titanic_train = pd.read_csv("train_processed.csv", index_col="PassengerId")

# Columns fed to the model, grouped for readability; the concatenation keeps
# the exact column order used throughout the notebook.
passenger_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare", "IsMale"]
# NOTE(review): the Ticket-* columns look like indicator features produced by
# the upstream preprocessing step — confirm against that notebook.
ticket_columns = [
    "Ticket-4digit", "Ticket-5digit", "Ticket-6digit", "Ticket-7digit",
    "Ticket-A", "Ticket-C", "Ticket-F", "Ticket-Others",
    "Ticket-P", "Ticket-S", "Ticket-W",
]
feature_names = passenger_columns + ticket_columns

# Split the frame into the feature matrix and the target column.
Xtrain = titanic_train[feature_names]
ytrain = titanic_train["Survived"]

In [14]:
# --------------------------- load test data
# Read the test file with the same index column as the training file, then
# select the same feature_names list so the test columns match the training
# columns in both content and order.
titanic_test = pd.read_csv("test_processed.csv",index_col="PassengerId")
Xtest = titanic_test[feature_names]

In [15]:
# --------------------------- scale train data
# Fit the scaler on the training features, then apply it to those same
# features; the fitted scaler is kept for reuse on the test set.
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)

In [16]:
# --------------------------- scale test data
# Reuse the scaler already fitted on the training features (transform only, no
# refit) so the test set receives the same scaling as the training set.
Xtest_scaled = scaler.transform(Xtest)

In [17]:
# --------------------------- LR
# Logistic regression with built-in cross-validation, fitted on the scaled
# training features. Cs=50 and cv=10 are the only settings passed explicitly;
# presumably that means a grid of 50 candidate C values and 10 folds — see the
# scikit-learn LogisticRegressionCV documentation to confirm.
lrcv = LogisticRegressionCV(Cs=50,cv=10)
lrcv.fit(Xtrain_scaled,ytrain)

LogisticRegressionCV(Cs=50, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [18]:
# Inspect the fitted C_ attribute; per the scikit-learn docs this holds the C
# value selected by cross-validation.
lrcv.C_

array([ 0.05963623])

In [19]:
# Score on the same scaled data the model was fitted on, so this is a training
# score rather than a held-out estimate.
# NOTE(review): with scoring=None this should be plain accuracy — confirm
# against the installed scikit-learn version.
lrcv.score(Xtrain_scaled,ytrain)

0.81593714927048255

In [20]:
def pretty_print_coef(coefs, names=None, sort=False):
    """Format model coefficients as a readable linear expression.

    Parameters
    ----------
    coefs : sequence of numbers
        Coefficient values, one per feature.
    names : sequence of str, optional
        Feature names paired positionally with ``coefs``. When omitted,
        the names ``X0``, ``X1``, ... are generated.
    sort : bool, default False
        If True, terms are ordered by decreasing absolute coefficient value.

    Returns
    -------
    str
        Terms of the form ``"<coef> * <name>"`` joined by ``" + "``, with
        each coefficient rounded to 3 decimals.
    """
    # Identity test on purpose: `names == None` is evaluated element-wise when
    # names is a NumPy array or pandas Index, and the `if` then raises
    # "truth value of an array is ambiguous".
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    terms = zip(coefs, names)
    if sort:
        # Negated magnitude gives a descending order while keeping ties stable.
        terms = sorted(terms, key=lambda term: -np.abs(term[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name) for coef, name in terms)
# Show the fitted coefficients as a single expression, largest magnitude first
# (third argument sort=True).
pretty_print_coef(lrcv.coef_.ravel(),feature_names,True)

'-1.137 * IsMale + -0.671 * Pclass + -0.395 * Age + -0.32 * SibSp + 0.182 * Ticket-5digit + -0.179 * Ticket-W + 0.155 * Fare + -0.143 * Ticket-A + -0.132 * Ticket-6digit + 0.073 * Ticket-4digit + 0.06 * Ticket-C + -0.046 * Parch + 0.04 * Ticket-S + 0.026 * Ticket-F + -0.011 * Ticket-Others + 0.01 * Ticket-7digit + -0.003 * Ticket-P'

In [22]:
# Coefficient table, one row per feature, ordered by absolute magnitude
# (largest first). The original row labels are kept, so each label still
# gives the feature's position in feature_names.
coefs = pd.DataFrame({"names":feature_names,"coefs":lrcv.coef_.ravel()},columns=["names","coefs"])
# Order rows positionally instead of calling sort_index(by=...): the `by`
# keyword is no longer accepted by current pandas, and this form also avoids
# adding and deleting a scratch "rank" column in place.
# A stable sort keeps tied magnitudes in their original feature order.
magnitude_order = np.argsort(-np.abs(coefs["coefs"].values), kind="mergesort")
coefs = coefs.iloc[magnitude_order]
coefs

Unnamed: 0,names,coefs
5,IsMale,-1.136537
0,Pclass,-0.670794
1,Age,-0.394732
2,SibSp,-0.320244
7,Ticket-5digit,0.181507
16,Ticket-W,-0.179247
4,Fare,0.15468
10,Ticket-A,-0.142501
8,Ticket-6digit,-0.131742
6,Ticket-4digit,0.072625
