In [1]:
# Data source
# https://www.kaggle.com/c/titanic/data

# References
# http://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html

In [2]:
# Modules

import pandas as pd
from sklearn import linear_model, preprocessing

In [3]:
# Process raw data

# Read csv data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
target = pd.read_csv('gender_submission.csv')

# Get features
# Remove clearly unrelated features such as PassengerId or Name improves accuracy for Decision Tree
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
targets = ['Not Survived', 'Survived']

X = train[features]
y = train['Survived']

x = test[features]
t = target['Survived']

# Encode string values
# http://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.LabelEncoder.html

def label_encode(df, col_names):
    
    les = {}
    
    for col_name in col_names:
    
        le = preprocessing.LabelEncoder()
        le.fit(df[col_name])
        df.loc[:, col_name] = le.transform(df[col_name])
    
        les[col_name] = le
        
        print('Encoded Feature Name: {col_name}'.format(col_name=col_name))
        print('Encoded Feature Classes: {classes}'.format(classes=le.classes_))
    
    return les

# Fill NaN values with string to avoid label encoding issues
# http://stackoverflow.com/questions/36808434/label-encoder-encoding-missing-values

# Try clearly out-of-range value -1.0 for continuous data 'Age'
# http://stackoverflow.com/questions/9365982/missing-values-in-scikits-machine-learning

X['Age'].fillna(-1.0, inplace=True)
X['Embarked'].fillna('None', inplace=True)

x['Age'].fillna(-1.0, inplace=True)
x['Embarked'].fillna('None', inplace=True)
x['Fare'].fillna(-1.0, inplace=True)

col_encodees = ['Sex', 'Embarked']

encoders = label_encode(X, col_encodees)

for col_encodee in col_encodees:
    x[col_encodee] = encoders[col_encodee].transform(x[col_encodee])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Encoded Feature Name: Sex
Encoded Feature Classes: ['female' 'male']
Encoded Feature Name: Embarked
Encoded Feature Classes: ['C' 'None' 'Q' 'S']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
# Show processed data

X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22,1,0,7.2500,3
1,1,0,38,1,0,71.2833,0
2,3,0,26,0,0,7.9250,3
3,1,0,35,1,0,53.1000,3
4,3,1,35,0,0,8.0500,3
5,3,1,-1,0,0,8.4583,2
6,1,1,54,0,0,51.8625,3
7,3,1,2,3,1,21.0750,3
8,3,0,27,0,2,11.1333,3
9,2,0,14,1,0,30.0708,0


In [5]:
# Doc ref: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# [C] Setting to 10.0 results in ~7% accurracy increase as compared to default 1.0
# The C parameters defines the error term, higher the value of C may cause overfitting
# [gamma] Setting to 0.01 results in ~10% accurracy increase as compared to default 'auto'
# gamma defines how much influence a single training example has. Higher the value of gamma may cause overfitting
# A good default value for gamma is 0.1, where gamma is often 0 < gamma < 1
# [kernel] more complex kernels performs better than linear kernels
# Using Polynomial(2) achieves 98% accuracy, which out performs 80% accuracy of Radial Basis function
# Polynomial comes with a performance cost

clf = linear_model.LogisticRegression()
clf = clf.fit(X, y)

In [13]:
# Use test data to calculate accuracy

prediction = clf.predict(X)
comparison = prediction == y
accuracy = comparison.value_counts()[True] / comparison.count() * 100
print('Accuracy: {accuracy}%'.format(accuracy=accuracy))

Accuracy: 80.13468013468014%
