In [8]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [9]:
# Load the raw training table; the path is relative to the notebook's directory.
ori_train_data = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [10]:
def process_data(data):
    """Turn a raw passenger frame into 0/1 indicator features.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Pclass``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch`` and ``Embarked`` (used to derive features) as well as
        ``PassengerId``, ``Name``, ``Ticket``, ``Fare`` and ``Cabin``
        (dropped). Any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The pass-through columns in their original order, followed by
        ``class_1..3``, ``gender_male``, ``gender_female``, ten decade-wide
        ``age_<lo>-<hi>`` bins covering [0, 100), ``has_relatives`` and
        ``embarked_S/C/Q``. Every derived column is an int64 0/1 flag.

    Raises
    ------
    KeyError
        If a column that the features are derived from is missing.

    Notes
    -----
    A missing ``Age`` (NaN), or an age outside [0, 100), yields 0 in every
    age bin because all range comparisons are false for it.
    """
    # Copy first: assigning new columns directly onto ``data`` would leak
    # them into the caller's frame.
    features = data.copy()

    # One indicator per passenger class.
    pclass = features['Pclass']
    for level in (1, 2, 3):
        features['class_{}'.format(level)] = pclass.eq(level).astype('int64')

    # One indicator per recorded sex.
    sex = features['Sex']
    for label in ('male', 'female'):
        features['gender_' + label] = sex.eq(label).astype('int64')

    # Half-open decade bins: lower <= Age < lower + 10.
    age = features['Age']
    for lower in range(0, 100, 10):
        upper = lower + 10
        in_bin = age.ge(lower) & age.lt(upper)
        features['age_{}-{}'.format(lower, upper)] = in_bin.astype('int64')

    # Flag passengers travelling with any sibling/spouse or parent/child.
    relatives = features['SibSp'] + features['Parch']
    features['has_relatives'] = relatives.ne(0).astype('int64')

    # One indicator per embarkation port code.
    embarked = features['Embarked']
    for port in ('S', 'C', 'Q'):
        features['embarked_' + port] = embarked.eq(port).astype('int64')

    # Remove the raw inputs that were encoded above together with the
    # columns that are not used as features at all.
    raw_columns = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked',
        'PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin',
    ]
    return features.drop(raw_columns, axis=1)

In [11]:
# Encode the raw training rows into the indicator-feature table.
train_data = process_data(data=ori_train_data)

In [12]:
# Pick the label by name instead of by position so the split between
# features and target does not depend on where 'Survived' sits in the frame.
X = train_data.drop(['Survived'], axis=1).values
Y = train_data['Survived'].values

In [13]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

In [14]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): the "_gini" suffix does not describe this estimator (it is a
# logistic regression); the name is kept because later cells refer to it.
clf_gini = LogisticRegression(random_state=0)

# Leaving the fit call as the cell's last expression displays the fitted
# estimator's repr as the cell output.
clf_gini.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Score the model on the held-out rows.
y_pred_gini = clf_gini.predict(X=X_test)

# Accuracy is reported as a percentage.
print('Accuracy is ', 100 * accuracy_score(y_true=y_test, y_pred=y_pred_gini))

# Last expression of the cell, so the matrix is shown as the cell output
# (rows: true label, columns: predicted label).
confusion_matrix(y_true=y_test, y_pred=y_pred_gini)

Accuracy is  80.22388059701493


array([[146,  13],
       [ 40,  69]], dtype=int64)

In [16]:
# Load the raw table to predict on; the path is relative to the notebook's directory.
ori_test_data = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [17]:
# Apply the same encoding that was used for the training rows.
test_data = process_data(data=ori_test_data)

In [18]:
# The model was fitted on a plain ndarray, so hand predict() an ndarray too;
# passing the DataFrame itself mixes input types between fit and predict
# (newer scikit-learn releases warn about the unexpected feature names).
y_test_gini = clf_gini.predict(test_data.values)

In [19]:
# Read the id column by name: calling .values on the whole mixed-dtype frame
# builds an object array of every column just to slice out the first one,
# and silently depends on PassengerId being in position 0.
passenger_ids = ori_test_data['PassengerId'].values

In [20]:
# Pair every passenger id with its predicted label.
df = pd.DataFrame(
    data={
        'PassengerId': passenger_ids,
        'Survived': y_test_gini,
    }
)

In [21]:
# Write the predictions without the index column.
df.to_csv(
    path_or_buf='../output/logistic_regression_prediction.csv',
    index=False,
)