In [None]:
# almost certainly will need these imported
import pandas as pd
import numpy as np
import xgboost as xg

# sklearn stuff
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve, f1_score, auc, accuracy_score, log_loss, classification_report,confusion_matrix,roc_curve,roc_auc_score

# will be doing some optimization I'm sure
import hyperopt
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# Read the three CSV files this notebook works with, in a fixed order:
# training set, test set, and the sample submission.
train_data, test_data, sample_result = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

In [None]:
# First few rows of the training data.
train_data.head()


In [None]:
# (rows, columns) of the training data as loaded.
train_data.shape

In [None]:
# Missing-value count for every column of the training data.
for column, nan_count in train_data.isna().sum().items():
    print("variable:", column, "NaN count:", nan_count)

Well, with 687 NaNs out of 891 rows (77%) for 'Cabin', it seems reasonable to just drop that column.  With only 2 NaNs for 'Embarked', I'm going to just drop those rows.  The really problematic variable is 'Age' - which, from a common-sense perspective, seems likely to be relevant to the outcome, and has just enough NaNs to be problematic.  Also, I'm skeptical that these values can be reasonably imputed from the other variables.

But for now, let's drop 'Cabin' and the rows with NaN's for 'Embarked'.

In [None]:
# Drop the mostly-empty 'Cabin' column. Rebinding (instead of inplace=True)
# plus errors="ignore" keeps this cell safe to re-run after the column is gone.
train_data = train_data.drop(columns="Cabin", errors="ignore")

In [None]:
# Column list after dropping 'Cabin'.
train_data.columns

In [None]:
# (rows, columns) after dropping 'Cabin'.
train_data.shape

In [None]:
# Index labels of the rows whose 'Embarked' value is missing.
indexdrop = train_data.index[train_data['Embarked'].isna()]

In [None]:
# Remove the rows with no 'Embarked' value. errors="ignore" tolerates labels
# that were already removed, so re-running this cell does not raise KeyError.
train_data = train_data.drop(index=indexdrop, errors="ignore")
train_data.shape

In [None]:
# What do the sample results (loaded from gender_submission.csv) look like?
sample_result.head()

In [None]:
# And the test set (loaded from test.csv)?
test_data.head()

In [None]:
# Kaggle sample code, kept for reference only: it sits inside a string
# literal, so none of it is executed when this cell runs.
"""
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
"""

In [None]:
# output.head()


In [None]:
# Plan: compare several models (and tune some hyperparameters), collecting
# one row of validation metrics per model in `results`.
results_cols = [
    'model type',
    'hyperparameters',
    'f1',
    'roc_auc',
    'accuracy',
]
results = pd.DataFrame(columns=results_cols)

In [None]:
# choosing all reasonable candidates for predictor
# AGE REMAINS PROBLEMATIC WITH AROUND 20% NAN'S

# predictor_variables = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# can't run anything with NaN's, will omit variable as a first measure

predictor_variables = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Embarked"]

y = train_data["Survived"]
# Explicit copy: X gets its columns overwritten when the categoricals are
# encoded, and assigning into a bare slice of train_data triggers pandas'
# SettingWithCopyWarning.
X = train_data[predictor_variables].copy()


In [None]:
# "Sex" and "Embarked" hold text categories; replace each column with the
# integer codes produced by a LabelEncoder fitted on that column.
for categorical in ("Sex", "Embarked"):
    X[categorical] = LabelEncoder().fit_transform(X[categorical])


In [None]:
# Inspect the predictors after encoding "Sex" and "Embarked".
X.head()

This appears to have worked, despite the above warning.

In [None]:
# Hold out 20% of the labelled rows as a validation set; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=133,
)

In [None]:
# (rows, columns) of the training split.
X_train.shape


This gives us 711 observations for training, and 178 for validation.  Logistic regression seems like a good starting point, as good as any.

In [None]:
# Baseline: logistic regression with scikit-learn's default settings,
# scored on the validation split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

# NOTE: roc_auc here is computed from the hard class predictions in y_pred,
# not from predicted probabilities.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_pred)

# Values are in the same order as results_cols.
item = ["Logistic regression", "N/A", f1, ra, accuracy]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("Logistic regression\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)


Well, I would hope for much better accuracy than that - but then, logistic regression is simple and quick.  Let's move on. (Also, `.concat` doesn't seem to exist as a DataFrame method; concatenation is the top-level function `pd.concat`.)

In [None]:
# Model comparison table so far.
results

In [None]:
# random forest with hand-picked hyperparameters, scored on the validation split

rfc = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=20)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_val)

# NOTE: roc_auc here is computed from the hard class predictions in y_pred,
# not from predicted probabilities.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_pred)

# Values are in the same order as results_cols.
item = ["Random Forest", "n_estimators=200, max_depth=5", f1, ra, accuracy]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("Random Forest Classifier\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)

In [None]:
# Model comparison table so far.
results

I suppose I should use hyperopt (or gridsearch) to choose hyperparameters; there's enough variability in the assessment metrics.

In [None]:
# Search space for the random forest. hp.quniform yields floats, so both
# values are cast to int before they reach the classifier.
rf_space = {"max_depth": hp.quniform("max_depth", 1, 8, 1),
            "n_estimators": hp.quniform("n_estimators", 80, 320, 20)}

def objective(params):
    """Fit a random forest with the sampled hyperparameters and score it.

    Parameters
    ----------
    params : dict
        One sample from ``rf_space`` with keys "n_estimators" and "max_depth".

    Returns
    -------
    dict
        Hyperopt result: 'loss' is the negated validation accuracy (hyperopt
        minimizes), alongside 'accuracy', 'f1', 'roc_auc' and 'status'.
    """
    forest = RandomForestClassifier(
        n_estimators=int(params["n_estimators"]),
        max_depth=int(params["max_depth"]),
        random_state=100)
    forest.fit(X_train, y_train)

    y_pred = forest.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    # NOTE: roc_auc is computed from hard class predictions, not probabilities.
    ra = roc_auc_score(y_val, y_pred)
    return {'loss': -accuracy, 'accuracy': accuracy, 'f1': f1, 'roc_auc': ra, 'status': STATUS_OK}

trials = Trials()

best_hyperparams = hyperopt.fmin(fn=objective,
                                 space=rf_space,
                                 algo=hyperopt.tpe.suggest,
                                 max_evals=100,
                                 trials=trials)

# Report the values as the integers that were actually passed to the forest.
best_hyperparams = {name: int(value) for name, value in best_hyperparams.items()}

# Metrics recorded by the best trial. The names accuracy/f1/ra inside
# objective() are local to it, so the module-level variables of the same
# names still hold the previous cell's values and must not be printed here.
assess = trials.best_trial['result']

# Values are in the same order as results_cols.
item = ["Random Forest (Hyperopt)", best_hyperparams, assess['f1'], assess['roc_auc'], assess['accuracy']]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("Random Forest Classifier - Hyperopt\n", "Hyperparameters:", best_hyperparams,
      "\nAccuracy:", assess['accuracy'], "f1:", assess['f1'], "roc_auc:", assess['roc_auc'])
    

In [None]:
# Most recent rows of the model comparison table.
results.tail()

This got accuracy up to 0.8539, a modest improvement.  I'm just going to optimize from the start for k-NN.

In [None]:
# k-Nearest Neighbors using hyperopt

# hp.quniform yields floats, so n_neighbors is cast to int before use.
knn_space = {"n_neighbors": hp.quniform("n_neighbors", 1, 12, 1),
             "weights": hp.choice("weights", ["uniform", "distance"])}


def objective(params):
    """Fit a k-NN classifier with the sampled hyperparameters and score it.

    Parameters
    ----------
    params : dict
        One sample from ``knn_space`` with keys "n_neighbors" and "weights".

    Returns
    -------
    dict
        Hyperopt result: 'loss' is the negated validation accuracy (hyperopt
        minimizes), alongside 'accuracy', 'f1', 'roc_auc' and 'status'.
    """
    knn = KNeighborsClassifier(
        n_neighbors=int(params["n_neighbors"]),
        weights=params["weights"])

    # Fit and score the k-NN model itself. Using the global `rfc` here would
    # re-score the earlier random forest on every trial, whatever was sampled.
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    # NOTE: roc_auc is computed from hard class predictions, not probabilities.
    ra = roc_auc_score(y_val, y_pred)
    return {'loss': -accuracy, 'accuracy': accuracy, 'f1': f1, 'roc_auc': ra, 'status': STATUS_OK}

trials = Trials()

best_hyperparams = hyperopt.fmin(fn=objective,
                                 space=knn_space,
                                 algo=hyperopt.tpe.suggest,
                                 max_evals=100,
                                 trials=trials)

# fmin reports hp.choice parameters by list index; space_eval maps the result
# back onto the search space so "weights" shows the actual option string.
best_hyperparams = dict(hyperopt.space_eval(knn_space, best_hyperparams))
best_hyperparams["n_neighbors"] = int(best_hyperparams["n_neighbors"])

# Metrics recorded by the best trial (not the stale module-level
# accuracy/f1/ra left behind by earlier cells).
assess = trials.best_trial['result']

# Values are in the same order as results_cols.
item = ["k-Nearest Neighbors (Hyperopt)", best_hyperparams, assess['f1'], assess['roc_auc'], assess['accuracy']]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("k-Nearest Neighbors (Hyperopt)\n", "Hyperparameters:", best_hyperparams,
      "\nAccuracy:", assess['accuracy'], "f1:", assess['f1'], "roc_auc:", assess['roc_auc'])
    




In [None]:
# Most recent rows of the model comparison table.
results.tail()

I find it more than a bit odd that the assessment metrics for the RF model with n_estimators=200 and max_depth=5 are identical with those for the k-NN model just optimized using hyperopt.  There's got to be an explanation for this, other than some kind of lazy data leak, right?

In [None]:
# gaussian Naive Bayes Classifier (no hyperparameters set), scored on the validation split

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_val)

# NOTE: roc_auc here is computed from the hard class predictions in y_pred,
# not from predicted probabilities.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_pred)

# Values are in the same order as results_cols.
item = ["Gaussian Naive Bayes", "N/A", f1, ra, accuracy]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("Gaussian Naive Bayes\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)


In [None]:
# Multi-Layer Perceptron using Hyperopt

# hidden_layer_sizes is sampled as a whole number (returned as a float) and
# converted to a one-element tuple of int before it reaches MLPClassifier.
mlp_space = {'hidden_layer_sizes': hp.quniform("hidden_layer_sizes", 50, 150, 1),
             'activation': hp.choice('activation', ['relu', 'tanh', 'logistic']),
             'learning_rate_init': hp.uniform('learning_rate_init', 0.0001, 0.01)}

def objective(space):
    """Fit an MLP with the sampled hyperparameters and score it.

    Parameters
    ----------
    space : dict
        One concrete sample from ``mlp_space`` with keys
        'hidden_layer_sizes', 'activation' and 'learning_rate_init'.

    Returns
    -------
    dict
        Hyperopt result: 'loss' is the negated validation accuracy (hyperopt
        minimizes), alongside 'accuracy', 'f1', 'roc_auc' and 'status'.
    """
    # Read from the `space` argument (the sampled values). The module-level
    # mlp_space holds unevaluated hyperopt expressions, and handing those to
    # MLPClassifier fails with "len of pyll.Apply either undefined or unknown".
    mlp = MLPClassifier(
        hidden_layer_sizes=(int(space['hidden_layer_sizes']),),
        activation=space['activation'],
        learning_rate_init=space['learning_rate_init'])

    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    # NOTE: roc_auc is computed from hard class predictions, not probabilities.
    ra = roc_auc_score(y_val, y_pred)
    return {'loss': -accuracy, 'accuracy': accuracy, 'f1': f1, 'roc_auc': ra, 'status': STATUS_OK}

trials = Trials()

best_hyperparams = hyperopt.fmin(fn=objective,
                                 space=mlp_space,
                                 algo=hyperopt.tpe.suggest,
                                 max_evals=100,
                                 trials=trials)

# fmin reports hp.choice parameters by list index; space_eval maps the result
# back onto the search space so 'activation' shows the actual option string.
best_hyperparams = dict(hyperopt.space_eval(mlp_space, best_hyperparams))
best_hyperparams['hidden_layer_sizes'] = int(best_hyperparams['hidden_layer_sizes'])

# Metrics recorded by the best trial (not the stale module-level
# accuracy/f1/ra left behind by earlier cells).
assess = trials.best_trial['result']

# Values are in the same order as results_cols.
item = ["Multi-Layer Perceptron (Hyperopt)", best_hyperparams, assess['f1'], assess['roc_auc'], assess['accuracy']]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("Multi-Layer Perceptron (Hyperopt)\n", "Hyperparameters:", best_hyperparams,
      "\nAccuracy:", assess['accuracy'], "f1:", assess['f1'], "roc_auc:", assess['roc_auc'])
    

    

Well, that failed, and "TypeError: len of pyll.Apply either undefined or unknown" isn't producing search results that are immediately useful.  I guess I'll just run MLP with defaults for completeness.

In [None]:

# MLP with scikit-learn's default settings, scored on the validation split.
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_val)

# NOTE: roc_auc here is computed from the hard class predictions in y_pred,
# not from predicted probabilities.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_pred)

# Values are in the same order as results_cols.
item = ["MLP Classifier", "defaults", f1, ra, accuracy]
# DataFrame.append was removed in pandas 2.0; add the row by label instead.
results.loc[len(results)] = item
print("MLP Classifier\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)


In [None]:
# Model comparison table so far.
results

Interestingly, I'm seeing repeated values for accuracy.  I suppose that with 178 observations in the validation set, this makes sense.

In [None]:
# following someone's example I found at 
# https://towardsdatascience.com/top-10-binary-classification-algorithms-a-beginners-guide-feeacbd7a3e2
# here is a homespun neural net using keras

from keras import layers
from keras import models
from keras import optimizers
from keras import losses
from keras import regularizers
from keras import metrics

# Small fully connected binary classifier: two 8-unit ReLU layers, each with
# an L2 kernel penalty and followed by dropout, then one sigmoid output unit.
# The first layer declares 6 input features.
model = models.Sequential([
    layers.Dense(8, kernel_regularizer=regularizers.l2(0.003), activation='relu', input_shape=(6,)),
    layers.Dropout(0.5),
    layers.Dense(8, kernel_regularizer=regularizers.l2(0.003), activation='relu'),
    layers.Dropout(0.6),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=4, batch_size=512, validation_data=(X_val, y_val))

# evaluate() returns [loss, accuracy] for this compile() call; report accuracy.
train_score = model.evaluate(X_train, y_train)[1]
print("score on train: " + str(train_score))
val_score = model.evaluate(X_val, y_val)[1]
print("score on val: " + str(val_score))

Wow, that was a hassle to adapt and the results were simply dismal.  Plainly, I'm failing to understand what's going on here and need to read some more about keras in general.  I'm going to make some predictions and submit them as a first step.

In [None]:
# Full model comparison table.
results

Based on these results, and seeing as how accuracy is the metric used by Kaggle here (is it?), I'm going to implement the RF model with n_estimators=120 and max_depth=4.

In [None]:
# Column names of the test set.
test_data.columns

In [None]:
# Encode the test-set categoricals with encoders fitted on the *training*
# columns, so a given category always gets the same integer code. Fitting a
# fresh encoder on the test set only matches when both sets happen to contain
# exactly the same categories. A category never seen in training makes
# transform() raise ValueError instead of silently receiving a wrong code.
for categorical in ("Sex", "Embarked"):
    # Skip columns that are already numeric so the cell can be re-run safely.
    if not pd.api.types.is_numeric_dtype(test_data[categorical]):
        encoder = LabelEncoder().fit(train_data[categorical])
        test_data[categorical] = encoder.transform(test_data[categorical])

In [None]:
# Missing-value count for every column of the test set.
for column, nan_count in test_data.isna().sum().items():
    print("variable:", column, "NaN count:", nan_count)

In [None]:
# there's a NaN for fare, here's a simple fix: fill it with the mean fare
mean_fare = test_data["Fare"].mean()
# Assign the filled column back. Calling fillna(inplace=True) on the column
# selection is a chained assignment: deprecated in pandas 2.x, and it leaves
# test_data unchanged once Copy-on-Write is enabled.
test_data["Fare"] = test_data["Fare"].fillna(mean_fare)
print("Mean fare:", mean_fare)

In [None]:
# Final model for the submission. Model selection is finished, so fit on all
# labelled rows (X, y) rather than only the 80% training split.
rfc = RandomForestClassifier(n_estimators=120, max_depth=4, random_state=20)
rfc.fit(X, y)

# Predict on the test set using the same predictor columns, in the same order,
# as the training features.
predict = rfc.predict(test_data[predictor_variables])

# Two-column submission file: PassengerId and the predicted Survived label.
output = pd.DataFrame({'PassengerId': test_data["PassengerId"], 'Survived': predict})
output.to_csv('submission.csv', index=False)

In [None]:
# Show the submission frame that was written to submission.csv.
output
