In [39]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [72]:
# read and clean data
def clean_data(csv_filename):
    return (
        pd.read_csv(csv_filename)
        .assign(
            age=lambda x: (
                x.groupby(['Pclass', 'Sex'])['Age']
                .transform(lambda x: x.fillna(x.mean()).astype(int))
        ))    
        .assign(sex=lambda x: x['Sex'].map({'female': 1, 'male': 0}))
        .assign(age_band=lambda x: pd.qcut(x.age, 5, labels=False))
        .assign(family_size=lambda x: x.SibSp + x.Parch + 1)
        .assign(is_alone=lambda x: np.where(x.family_size == 1, 1, 0))
        .assign(
            embarked=lambda x: x.Embarked.fillna(x.Embarked.mode()[0]
        ))
        .assign(
            embarked_int=lambda x: x.embarked.map(
                {p:i for i,p in enumerate(np.sort(x.embarked.unique()))}
        ))
        .assign(fare=lambda x: x.Fare.fillna(x.Fare.median()))
        .assign(fare_band=lambda x: pd.qcut(x.fare, 4, labels=False))
        .rename(columns={
            'Pclass': 'passenger_class',
            'Survived': 'survived',
        })
        .drop([
            'Age','Cabin', 'Ticket', 
            'Name', 'Sex', 'SibSp', 
            'Parch', 'family_size',
            'Embarked', 'Fare', 'fare', 
            'age', 'embarked',], 
            axis=1
        )
    )

train_df = clean_data('train.csv')
test_df = clean_data('test.csv')

train_df.head()

Unnamed: 0,PassengerId,survived,passenger_class,sex,age_band,is_alone,embarked_int,fare_band
0,1,0,3,0,1,0,2,0
1,2,1,1,1,3,0,0,3
2,3,1,3,1,1,1,2,1
3,4,1,1,1,3,0,2,3
4,5,0,3,0,3,1,2,1


In [79]:
# create training and testing dataframes
x_train = train_df.drop(['survived', 'PassengerId'], axis=1)
y_train = train_df.survived

x_test = test_df.drop('PassengerId', axis=1)

# logistic regression

In [80]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
score_log_reg = log_reg.score(x_train, y_train)
print(f'score: {score_log_reg}')

score: 0.792368125701459


In [83]:
(
    pd.DataFrame({
        'feature': train_df.columns.drop(['survived', 'PassengerId']),
        'correlation': pd.Series(log_reg.coef_[0]),
    })
    .sort_values(by='correlation')
)

Unnamed: 0,correlation,feature
0,-0.997558,passenger_class
4,-0.22875,embarked_int
2,-0.213235,age_band
5,0.095799,fare_band
3,0.108466,is_alone
1,2.504031,sex


# support vector machines

In [84]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
score_svc = svc.score(x_train, y_train)
print(f'score: {score_svc}')

score: 0.8282828282828283


# k-nearest neighbors

In [85]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
score_knn = knn.score(x_train, y_train)
print(f'score: {score_knn}')

score: 0.8529741863075196


# naive bayes

In [86]:
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_test)
score_gaussian = gaussian.score(x_train, y_train)
print(f'score: {score_gaussian}')

score: 0.7519640852974186


# perceptron

In [87]:
from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_test)
score_perceptron = perceptron.score(x_train, y_train)
print(f'score: {score_perceptron}')

score: 0.7890011223344556


# linear SVC

In [88]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC()
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_test)
score_linear_svc = linear_svc.score(x_train, y_train)
print(f'score: {score_linear_svc}')

score: 0.792368125701459


# stochastic gradient descent

In [89]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_test)
score_sgd = sgd.score(x_train, y_train)
print(f'score: {score_linear_svc}')

score: 0.792368125701459


# decision tree

In [90]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)
score_decision_tree = decision_tree.score(x_train, y_train)
print(f'score: {score_decision_tree}')

score: 0.8731762065095399


# random forest

In [91]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
score_random_forest = random_forest.score(x_train, y_train)
print(f'score: {score_random_forest}')

score: 0.8731762065095399


# submit!

In [93]:
(
    pd.DataFrame({
        'PassengerId': test_df['PassengerId'],
        'Survived': y_pred,
    })
    .to_csv('submission.csv', index=False)
)