In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import sklearn.preprocessing as skpreprocess
import sklearn.linear_model as sklinear

### load training data

In [3]:
# Load the pre-processed training set.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv(index_col=0, parse_dates=True)` is its documented equivalent
# (first CSV column becomes the index, index is tentatively parsed as dates).
titanic_train = pd.read_csv("train_processed.csv", index_col=0, parse_dates=True)

# Columns fed to the model; the same list is used to select test features.
feature_names = ["Pclass","Age","SibSp","Parch","Fare","IsMale","EmbarkC","EmbarkQ","EmbarkS"]

Xtrain = titanic_train[feature_names]   # feature matrix
ytrain = titanic_train["Survived"]      # target labels

### scale the training data

In [4]:
# Standardize each feature column of the training data.
# The scaler is fitted here on the training features only; the fitted object
# is kept in `scaler` so the same transformation can be applied elsewhere.
scaler = skpreprocess.StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)

# Sanity check: report per-column mean and standard deviation after scaling.
# Single-argument print(...) with %-formatting runs on both Python 2 and
# Python 3 and emits the same text as the former print statements.
print("mean of each column:  %s" % (Xtrain_scaled.mean(axis=0),))
print("std of each column:  %s" % (Xtrain_scaled.std(axis=0),))

mean of each column:  [ -8.77213254e-17   2.27277979e-16   4.38606627e-17   5.38289951e-17
   3.98733297e-18  -1.15632656e-16  -1.99366649e-17   0.00000000e+00
  -8.37339924e-17]
std of each column:  [ 1.  1.  1.  1.  1.  1.  1.  1.  1.]


### training with LogisticRegression

In [5]:
# Fit a logistic-regression classifier on the standardized training features.
lr = sklinear.LogisticRegression()
lr.fit(Xtrain_scaled, ytrain)

# NOTE: this accuracy is computed on the same rows the model was fitted on,
# so it is a training score, not an estimate of generalization performance.
train_accuracy = lr.score(Xtrain_scaled, ytrain)

# Single-argument print(...) keeps the cell runnable on Python 2 and Python 3
# while producing the same text as the former print statement.
print("training accuracy is:  %s" % (train_accuracy,))

training accuracy is:  0.799102132435


In [12]:
def pretty_print_coef(coefs, names=None, sort=False):
    """Render linear-model coefficients as a readable ``coef * name`` sum.

    Parameters
    ----------
    coefs : sequence of numbers
        One coefficient per feature.
    names : sequence of str, optional
        Feature names paired positionally with ``coefs``. When omitted,
        placeholder names ``X0``, ``X1``, ... are generated.
    sort : bool, optional
        If true, terms are ordered by decreasing absolute coefficient value;
        terms with equal magnitude keep their original relative order.

    Returns
    -------
    str
        Terms of the form ``"<coef rounded to 3 decimals> * <name>"`` joined
        by ``" + "``.
    """
    # `is None` rather than `== None`: comparing a NumPy array of names with
    # `==` is elementwise and its truth value is ambiguous.
    if names is None:
        names = ["X%s" % i for i in range(len(coefs))]

    terms = zip(coefs, names)
    if sort:
        # A reversed sort stays stable, so ties keep their input order.
        terms = sorted(terms, key=lambda term: abs(term[0]), reverse=True)

    return " + ".join("%s * %s" % (round(coef, 3), name) for coef, name in terms)

# Show the fitted coefficients, largest magnitude first.
pretty_print_coef(lr.coef_.ravel(), names=feature_names, sort=True)

'-1.288 * IsMale + -0.902 * Pclass + -0.497 * Age + -0.35 * SibSp + 0.103 * Fare + -0.092 * EmbarkS + 0.078 * EmbarkC + -0.072 * Parch + 0.038 * EmbarkQ'

### load test data and scale

In [7]:
# Load the pre-processed test set.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv(index_col=0, parse_dates=True)` is its documented equivalent.
titanic_test = pd.read_csv("test_processed.csv", index_col=0, parse_dates=True)
Xtest = titanic_test[feature_names]

# Reuse the already-fitted scaler (transform only, no re-fit) so the test
# features go through the same transformation as the training features.
Xtest_scaled = scaler.transform(Xtest)

### predict and submit

In [8]:
# Predict a label for every test row with the fitted classifier.
predictions = lr.predict(Xtest_scaled)

# Pair each passenger id with its predicted label; the column order is
# spelled out explicitly instead of being left to dict ordering.
submission = pd.DataFrame(
    {"PassengerId": titanic_test["PassengerId"], "Survived": predictions},
    columns=["PassengerId", "Survived"],
)

# Write the two columns to disk without the DataFrame index.
submission.to_csv("submission.csv", index=False)