In [1]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory holding the Kaggle Titanic CSVs. Set the TITANIC_DATA_DIR
# environment variable to run on another machine; the default keeps the
# location this notebook was originally run against.
DATA_DIR = os.environ.get("TITANIC_DATA_DIR", "/Users/ssahoo/Downloads")

# MAIN is the labelled training set (has 'Survived'); EVAL is the unlabelled test set.
MAIN = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
EVAL = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [3]:
# Honorifics found in passenger names, bucketed into the coarse title groups
# that are later one-hot encoded.
_TITLE_GROUPS = {
    'shipguy': ('Capt', 'Col', 'Dr', 'Major', 'Rev'),
    'rich': ('Countess', 'Don', 'Dona', 'Jonkheer', 'Lady', 'Sir'),
    'master': ('Master',),
    'miss': ('Miss', 'Mlle', 'Ms'),
    'mrs': ('Mme', 'Mrs'),
    'mr': ('Mr',),
}

# Flatten to honorific -> group; sorting keeps the keys in alphabetical order.
title_map = dict(sorted(
    (honorific, group)
    for group, honorifics in _TITLE_GROUPS.items()
    for honorific in honorifics
))

In [4]:
def name_title(s, mapping=None):
    """Return the coarse title group for a passenger name.

    The honorific is taken to be the last space-separated word before the
    first '.' in ``s`` (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``) and is
    then looked up in ``mapping``.

    Parameters
    ----------
    s : str
        Raw passenger name.
    mapping : dict, optional
        Honorific -> group lookup. Defaults to the module-level ``title_map``.

    Returns
    -------
    The mapped group, or ``None`` when ``s`` is not a string (e.g. NaN) or its
    honorific is not in ``mapping``. In that case ``s`` is printed so the
    unmapped value is visible in the cell output.
    """
    if mapping is None:
        mapping = title_map
    try:
        before_dot = s.split(".")[0]
        honorific = before_dot.split(" ")[-1]
        return mapping[honorific]
    except (AttributeError, KeyError):
        # AttributeError: `s` has no .split (missing name);
        # KeyError: honorific not in the mapping.
        # Anything else is a real bug and is allowed to propagate.
        print(s)
        return None
def feature_so_pretty(data):
    """Convert a raw Titanic passenger frame into a numeric feature matrix.

    Steps, in order:
      1. drop ``Survived`` / ``PassengerId`` when present;
      2. reduce ``Cabin`` to its first character ('U' when missing), derive
         ``Title`` from ``Name`` via ``name_title`` and
         ``Family Size = SibSp + Parch + 1``;
      3. one-hot encode Title, Cabin, Sex, Embarked and Pclass;
      4. impute ``Age`` with the median of its (Sex, Pclass, Title) group and
         ``Fare`` with the overall median, then z-score both;
      5. drop the raw categorical/text columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare,
        Cabin and Embarked. The input frame itself is not modified.

    Returns
    -------
    numpy.ndarray
        2-D float array, one row per input row. Columns are the remaining
        input columns in their original order, then ``Family Size``, then the
        dummy groups in the order Title, Cabin, Gender, Embarked, Class.

    Raises
    ------
    ValueError
        If a column that survives the final drop cannot be converted to float.
    """
    # errors='ignore' handles both the labelled frame (has 'Survived') and the
    # unlabelled one without hiding unrelated failures behind a bare except.
    X = data.drop(columns=['Survived', 'PassengerId'], errors='ignore')

    X['Cabin'] = X['Cabin'].fillna('U').str[0]
    X['Title'] = X['Name'].apply(name_title)
    X['Family Size'] = X['SibSp'] + X['Parch'] + 1

    # (source column, dummy prefix); the order fixes the output column order.
    dummy_specs = [
        ('Title', 'Title'),
        ('Cabin', 'Cabin'),
        ('Sex', 'Gender'),
        ('Embarked', 'Embarked'),
        ('Pclass', 'Class'),
    ]
    dummies = [pd.get_dummies(X[column], prefix=prefix) for column, prefix in dummy_specs]
    X = pd.concat([X, *dummies], axis=1)

    # Impute Age with the median of the passenger's (Sex, Pclass, Title) group.
    # transform() returns a result aligned to X's index, whereas the result of
    # GroupBy.apply may carry the group keys in its index and then fail to
    # align on assignment. Rows whose group has no known age (or no title)
    # fall back to the overall median so no NaN reaches the models.
    overall_age_median = X['Age'].median()
    group_age_median = X.groupby(['Sex', 'Pclass', 'Title'])['Age'].transform('median')
    X['Age'] = X['Age'].fillna(group_age_median).fillna(overall_age_median)
    mean_age, std_age = np.mean(X['Age']), np.std(X['Age'])
    X['Age'] = (X['Age'] - mean_age) / std_age

    # Fare statistics are taken from the observed fares (computed before the
    # median fill, as in the original implementation).
    mean_fare, std_fare = np.mean(X['Fare']), np.std(X['Fare'])
    X['Fare'] = (X['Fare'].fillna(X['Fare'].median()) - mean_fare) / std_fare

    raw_columns = ['Sex', 'Embarked', 'Name', 'Ticket', 'Pclass', 'Title', 'Cabin']
    # Force float: get_dummies may yield bool columns, and a bool/float mix
    # would otherwise produce an object-dtype array.
    return X.drop(columns=raw_columns).to_numpy(dtype=float)

In [5]:
# Stack train (without the label) on top of test so features are engineered
# over both sets at once. DataFrame.append was removed in pandas 2.0;
# pd.concat(..., ignore_index=True) is the equivalent of append + reset_index(drop=True).
DATA = pd.concat([MAIN.drop(columns='Survived'), EVAL], ignore_index=True)

In [6]:
# Engineer features once over the stacked train+test frame so both splits
# share the same dummy columns and the same Age/Fare scaling.
XX = feature_so_pretty(DATA)  

In [7]:
# Labels come straight from the training frame; the first rows of XX are the
# training passengers because MAIN was stacked ahead of EVAL.
y = np.array(MAIN['Survived'])
X = XX[:y.shape[0]]

In [8]:
# Sanity check: (number of training rows, number of engineered features).
X.shape

(891, 28)

In [9]:
from sklearn.model_selection import train_test_split

# Keep 10% of the labelled rows as a hold-out set; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier as RFC

# Train a 100-tree forest on the training split (fit returns the estimator),
# then score it on the hold-out rows.
rfc = RFC(n_estimators=100, random_state=123).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(f"Random Forest Accuracy: {np.mean(y_test==y_pred)*100:.2f}%")

Random Forest Accuracy: 85.56%


In [11]:
# Everything after the first len(MAIN) rows of XX belongs to EVAL (test.csv),
# because MAIN was stacked ahead of EVAL when DATA was built.
X_eval = XX[len(MAIN):]
X_eval.shape

(418, 28)

In [12]:
# Refit the forest on every labelled row, then predict the unlabelled test rows.
y_pred_final = rfc.fit(X, y).predict(X_eval)

In [13]:
# Submission frame: one row per test passenger with the predicted label.
out = EVAL[['PassengerId']].assign(Survived=y_pred_final)
out.to_csv("gender_submission.csv", index=False)

In [14]:
from sklearn.svm import SVC

# `gamma` only parameterises the rbf/poly/sigmoid kernels, so the previous
# gamma=1000 had no effect on this linear model and has been removed.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
# The format string used to end with a stray '"' that was printed after the
# percent sign.
print(f'SVM Accuracy: {np.mean(y_pred==y_test)*100:.2f}%')

# Refit on every labelled row before predicting the unlabelled test rows.
svm.fit(X, y)
y_pred_final = svm.predict(X_eval)

# NOTE: this path is also written by the other model cells in this notebook,
# so the file holds the predictions of whichever cell ran last.
out = EVAL[['PassengerId']].copy()
out['Survived'] = y_pred_final
out.to_csv("gender_submission.csv", index=False)

SVM Accuracy: 85.56%"


In [15]:
%%time
import tensorflow as tf

# Small fully connected classifier: three ReLU hidden layers (128, 5, 10 units)
# feeding a 2-way softmax over the two classes.
hidden_layers = [
    tf.keras.layers.Dense(units, activation='relu') for units in (128, 5, 10)
]
output_layer = tf.keras.layers.Dense(2, activation='softmax')
tf_model = tf.keras.models.Sequential(hidden_layers + [output_layer])

# Integer class labels + softmax output -> sparse categorical cross-entropy.
tf_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
tf_model.fit(X_train, y_train, epochs=100, verbose=0)

# Last expression of the cell: the value returned by evaluate() on the hold-out rows.
tf_model.evaluate(X_test, y_test, verbose=0)

CPU times: user 6.91 s, sys: 1.52 s, total: 8.43 s
Wall time: 6.17 s


[0.497669517993927, 0.87777776]

In [16]:
# Turn the two softmax outputs per row into a hard label: 0 only when the
# first output is strictly larger, otherwise 1.
y_pred = [int(not (p_first > p_second)) for p_first, p_second in tf_model.predict(X_eval)]

# Submission frame: one row per test passenger with the predicted label.
out = EVAL[['PassengerId']].assign(Survived=y_pred)
out.to_csv("gender_submission.csv", index=False)

In [17]:
from xgboost import XGBClassifier

# Fit the model on the training split and score it on the 10% hold-out rows.
xgb = XGBClassifier(gamma=10)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# The format string used to end with a stray '"' that was printed after the
# percent sign.
print(f'XGB Accuracy: {np.mean(y_pred==y_test)*100:.2f}%')

XGB Accuracy: 86.67%"
