# Spaceship Titanic Kaggle Competition Attempt 1


## Basic Wrangling

In [None]:
# the usual imports
import pandas as pd
import numpy as np
import xgboost as xg

# sklearn stuff
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve, f1_score, auc, accuracy_score, log_loss, classification_report,confusion_matrix,roc_curve,roc_auc_score

# will be doing some optimization I'm sure
import hyperopt
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# Load the competition's training table and the unlabeled table used for submission.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
train_data.head()

It looks at a casual glance like we have more variables than in the Titanic challenge.  Also, more of them are quantitative.  Interesting.

In [None]:
# Columns holding quantitative measurements (age plus the five spending amounts).
quant_list = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

In [None]:
train_data.shape

In [None]:
# Missing values per column of the training set, followed by the grand total.
train_nan_counts = train_data.isna().sum()
for column in train_data.columns:
    print("variable:", column, "NaN count:", train_nan_counts[column])
total = train_nan_counts.sum()
print("total NaN's:", total)

This is problematic.  We have 8693 observations, and 2324 total NaN's.  Probably, there are some rows with multiple NaN's, but at any rate, this is simply too many to discard. __Particularly__ considering this is a Kaggle competition.  It seems obvious to me that this challenge is very much about imputation of missing values.  Of course there are a variety of ways to go about this; I'm going to try to approach it systematically.


In [None]:
# Same missing-value tally, this time for the test set.
test_nan_counts = test_data.isna().sum()
for column in test_data.columns:
    print("variable:", column, "NaN count:", test_nan_counts[column])
total = test_nan_counts.sum()
print("total NaN's:", total)

So the missing values will be a problem for the test set, too.  Simply dropping every record with a NaN (a poor solution at best) isn't even possible there, since every test record needs a prediction.

In [None]:
# it may be relevant whether people are travelling in parties
# multiple people with same last name might be a reasonable proxy for this

# n=1 splits on the first space only, so the result always has exactly two
# columns; without it, a single name containing an extra space would produce
# a third column and the two-column assignment would fail.
train_data[["FirstName", "LastName"]] = train_data["Name"].str.split(" ", n=1, expand=True)


In [None]:
train_data[train_data["Name"].isnull()]

Just wanted to make sure that the null values propagated across that split, instead of something... weird.

In [None]:
train_data.head(10)

In [None]:
train_data["LastName"].value_counts()["Susent"]

In [None]:
# Number of training passengers sharing each passenger's last name.
# (Indexing LastName.value_counts() by the LastName column was tried first, but it
# fails with KeyError: '[nan] not in index' because some last names are missing.)
last_name_groups = train_data.groupby("LastName")["LastName"]
train_data["LastNameCount"] = last_name_groups.transform("count")


In [None]:
train_data.head(20)

We also have some useful data in the deceptively informative 'Cabin' field.  From the Kaggle competition page:

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    
This should be broken up into three columns...


In [None]:
# Cabin has the form deck/num/side; give each part its own column.
cabin_parts = train_data["Cabin"].str.split("/", expand=True)
train_data[["Deck", "Num", "Side"]] = cabin_parts

In [None]:
train_data.head()

In [None]:
# PassengerId is "<group number>_<passenger number>": separate the two parts,
# then record how many training passengers share each group number.
passenger_id_parts = train_data["PassengerId"].str.split("_", expand=True)
train_data[["Group", "Passenger"]] = passenger_id_parts
group_members = train_data.groupby("Group")["Group"]
train_data["GroupCount"] = group_members.transform("count")
train_data.head()

Of course, I'm going to need to expand the test data as well.

In [None]:
# Derive the same engineered columns for the test set as for the training set.

# Cabin has the form deck/num/side.
test_data[["Deck", "Num", "Side"]] = test_data["Cabin"].str.split("/", expand=True)

# n=1 splits on the first space only, so the result always has exactly two
# columns even if some name contains an extra space.
test_data[["FirstName", "LastName"]] = test_data["Name"].str.split(" ", n=1, expand=True)

# Counts are taken within the test set only (not pooled with the training set).
test_data["LastNameCount"] = test_data.groupby("LastName")["LastName"].transform("count")

# PassengerId is "<group number>_<passenger number>".
test_data[["Group", "Passenger"]] = test_data["PassengerId"].str.split("_", expand=True)
test_data["GroupCount"] = test_data.groupby("Group")["Group"].transform("count")

It occurs to me that the "LastNameCount" should arguably be aggregated across both the test and training sets; I'll come back to consider this.

In [None]:
test_data.head()

It also occurs to me that all of the actual information in the "Cabin" and "Name" columns now exists in other columns.  This will cause fitting problems with several methods, and of course is just redundant.  If the original variables are ever needed, they can easily be reconstructed.  Let's drop them.

In [None]:
# Cabin and Name are fully represented by the columns derived from them.
redundant_cols = ["Cabin", "Name"]
for frame in (train_data, test_data):
    frame.drop(columns=redundant_cols, inplace=True)


In [None]:
train_data.head()

Ok, looks like I'm ready to proceed to some basic EDA, which will inform imputation (I suspect this will be important here) and model selection. 

# EDA

In [None]:
# these may very well be needed here
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Level frequencies for each categorical (or small-integer) column.
vc_list = [
    "HomePlanet", "CryoSleep", "Destination", "VIP",
    "LastNameCount", "GroupCount", "Deck", "Side",
]
for col in vc_list:
    level_counts = train_data[col].value_counts()
    print("variable:", col)
    print(level_counts)

In [None]:
# Level frequencies again, split by the outcome variable "Transported".
rows_by_outcome = train_data.groupby("Transported")
for col in vc_list:
    print("variable:", col)
    print(rows_by_outcome[col].value_counts())

In [None]:
# Reversed grouping: for each level of a column, how many were / were not transported.
for col in vc_list:
    outcome_within_level = train_data.groupby(col)["Transported"]
    print(outcome_within_level.value_counts())

In [None]:
# this could be lengthy, but...
# NOTE: set_option changes pandas' display settings for the rest of the session,
# so every later output is shown without row truncation as well.
pd.set_option("display.max_rows", None)
# Transported True/False counts for each last name.
train_data.groupby("LastName")["Transported"].value_counts()

In [None]:
train_data.groupby("FirstName")["Transported"].value_counts()

I was hoping to see a lot more names that were only true or only false.  That's not to say there isn't an association with names, there could very well be.  I should consider what kind of metric might be applicable here.

However, some of these variables can be visualized usefully.


In [None]:
train_data.groupby("HomePlanet")["Transported"].value_counts()

In [None]:
vc = train_data.groupby("HomePlanet")["Transported"].value_counts()

In [None]:
vc

In [None]:
type(vc)

In [None]:
vc[0]

In [None]:
vc.index

In [None]:
vc.index[1][0]

In [None]:
len(vc.index)

In [None]:
len(vc.iloc[::2].index)

In [None]:
# Side-by-side bars: outcome counts for each HomePlanet.
# Pivot to one row per planet and one column per outcome. A planet/outcome
# combination that never occurs becomes 0 instead of being silently absent, so
# both bar series always line up with the tick positions, and the tick labels
# come from the data rather than from a hard-coded list.
counts = vc.unstack(fill_value=0).reindex(columns=[False, True], fill_value=0)
X_axis = np.arange(len(counts))
plt.bar(X_axis - 0.2, counts.iloc[:, 0], 0.4, label="Not Transported")
plt.bar(X_axis + 0.2, counts.iloc[:, 1], 0.4, label="Transported")
plt.xticks(X_axis, counts.index)
plt.xlabel("HomePlanet")
plt.legend()
plt.show()

That worked nicely, and on the first try - can I generalize it?  Side by side bar graphs should serve well for the categorical variables here.

In [None]:
# One side-by-side bar chart (Not Transported vs Transported) per categorical column.
for col in vc_list:
    vc = train_data.groupby(col)["Transported"].value_counts()
    # One row per level, one column per outcome. Filling absent combinations with 0
    # keeps both bar series the same length as the tick positions even when some
    # level has only one outcome, and avoids indexing with the float values that
    # np.arange(len(...) / 2) produces.
    counts = vc.unstack(fill_value=0).reindex(columns=[False, True], fill_value=0)
    X_axis = np.arange(len(counts))
    plt.bar(X_axis - 0.2, counts.iloc[:, 0], 0.4, label="Not Transported")
    plt.bar(X_axis + 0.2, counts.iloc[:, 1], 0.4, label="Transported")
    plt.xticks(X_axis, counts.index)
    plt.xlabel(col)
    plt.legend()
    plt.show()

Great!  It would have been tedious to do all of those manually.  There are some interesting points to note in passing:  HomePlanet seems to have a substantial effect, at least with respect to Earth v Europa.  Obviously, CryoSleep makes Transported far more likely, too.  Destination appears to have something of a weaker effect....


I'm curious, though, about the LastNameCount variable - is there significant evidence that these counts are not from the same distribution?  It's been a while since I've run a formal hypothesis test.

In [None]:
# Frequency of each LastNameCount value, separately for the two outcomes,
# ordered by the count value itself.
transported_lnc = train_data.loc[train_data["Transported"] == True, "LastNameCount"]
not_transported_lnc = train_data.loc[train_data["Transported"] == False, "LastNameCount"]
lnc_t = transported_lnc.value_counts().sort_index()
lnc_f = not_transported_lnc.value_counts().sort_index()

In [None]:
lnc_t

In [None]:
type(lnc_f)

In [None]:
# Two-column table: rows are LastNameCount values, columns are the outcomes.
lnc_chi_sq = pd.concat([lnc_t, lnc_f], axis=1, keys=['Transported', 'Not Transported'])

In [None]:
lnc_chi_sq

In [None]:
# Test whether family size (LastNameCount) and the outcome are independent.
# chi2_contingency is the test for a table of counts like this one; plain
# scipy.stats.chisquare would instead compare each column against a uniform
# distribution over the family sizes, which is a different question.
import scipy.stats
from types import SimpleNamespace

# A family size seen for only one outcome appears as NaN after the concat; it is a count of 0.
chi2_stat, p_value, dof, expected = scipy.stats.chi2_contingency(lnc_chi_sq.fillna(0))
# Older SciPy returns a plain tuple here; wrapping the values keeps r.statistic
# and r.pvalue available whichever version is installed.
r = SimpleNamespace(statistic=chi2_stat, pvalue=p_value, dof=dof, expected_freq=expected)

In [None]:
dir(r)


In [None]:
r.statistic

Somewhat surprisingly, these test statistics are indeed so large as to give a p-value of, effectively, zero... I suppose that when considering the number of observations this should have been obvious.  At any rate, it's reasonable to expect this variable to have some predictive power.

It might also be informative to do a little similar basic EDA for the quantitative variables.

In [None]:
quant_list

In [None]:
tt = train_data['Age'][train_data['Transported'] == True]

In [None]:
tt


In [None]:
# Overlaid histograms of every quantitative column, one series per outcome.
was_transported = train_data["Transported"] == True
was_not_transported = train_data["Transported"] == False
for col in quant_list:
    tt = train_data.loc[was_transported, col]
    nt = train_data.loc[was_not_transported, col]
    plt.hist([tt, nt], label=["Transported", "Not Transported"])
    plt.title(col)
    plt.legend()
    plt.show()

These histograms are simply _atrocious_ - but I'm not terribly interested in perfecting them.  We can see some variation with respect to age.  The amount spent on various amenities generally seems to increase the chances of transportation.

# Imputation of Missing Values

As discussed above, all potentially relevant predictors have a nontrivial proportion of missing values - too many to discard.  Imputation is clearly an important component of this competition, and I'd like to approach it in an appropriate manner.

It might also be interesting to compare results from different categories of imputation methods.  A complication, of course, is that the same method must be applied to the test set.

In [None]:
# knn imputer a good starting point
# you could make a set of columns for logs of shopping mall, etc
# when submitting code to git, reset kernel and clear outputs -- easiest to say git push, but...
# get in the habit of looking for duplicate rows!  and also look for outliers
# when you do take home tests, make sure you have a blurb about each of these...

from sklearn.impute import KNNImputer

from sklearn.impute import SimpleImputer
# apparently required for IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# apparently IterativeImputer isn't stable yet


In [None]:
# here's an important question:  should I impute values for the train and test sets separately, or together?
# maybe I should compare both... let's do separately first

# of course, the parameters are going to depend on the data type
# and I refuse to put together a dataframe of dataframes again, for this thing
# so I'm going to need a naming convention
# TrainSepSimp, TestSepSimp, TrainSepKnn, TestSepKnn, TrainSepIter, TestSepIter
# TrainPoolSimp, TestPoolSimp, TrainPoolKnn, TestPoolKnn, TrainPoolIter, TestPoolIter

# I have made more work for myself than need be, perhaps

In [None]:
# Per training column: name, number of missing values, Python type of the first entry.
for col in train_data.columns:
    column_values = train_data[col]
    n_missing = column_values.isna().sum()
    print(col, n_missing, type(column_values[0]))

In [None]:
# Per test column: name, number of missing values, Python type of the first entry.
for col in test_data.columns:
    column_values = test_data[col]
    n_missing = column_values.isna().sum()
    print(col, n_missing, type(column_values[0]))

In [None]:
# Numeric and categorical columns need different fill rules, and SimpleImputer
# takes a single strategy, so the two groups are listed separately.

quant_cols = [
    "Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    "LastNameCount", "GroupCount",
]
cat_cols = [
    "HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Num", "Side",
    "FirstName", "LastName", "Group", "Passenger",
]

In [None]:
# Naming convention for the imputed frames:
#   TrainSepSimp, TestSepSimp, TrainSepKnn, TestSepKnn, TrainSepIter, TestSepIter
#   TrainPoolSimp, TestPoolSimp, TrainPoolKnn, TestPoolKnn, TrainPoolIter, TestPoolIter
# This cell builds the "Sep" + "Simp" pair: median for numeric columns (few of them
# looked remotely symmetric), most frequent value for categorical columns, with the
# imputer re-fitted on the training and test frames independently.
TrainSepSimp = pd.DataFrame()
TestSepSimp = pd.DataFrame()
for strategy, columns in (("median", quant_cols), ("most_frequent", cat_cols)):
    for col in columns:
        imp = SimpleImputer(strategy=strategy)
        for source, target in ((train_data, TrainSepSimp), (test_data, TestSepSimp)):
            # The imputer wants a 2-D input, hence the single-column reshape.
            column_2d = source[col].values.reshape(-1, 1)
            target[col] = imp.fit_transform(column_2d)[:, 0]

TrainSepSimp['Transported'] = train_data['Transported']

In [None]:
# KNN imputation of the numeric columns, fitted separately on train and test.
# The imputer must see all numeric columns at once: fitted on a single column it
# has no other feature to measure distances on and can only fall back to that
# column's mean, which defeats the purpose of using KNN.
# NOTE(review): distances are computed on unscaled columns, so the large spending
# amounts dominate Age and the count columns — consider scaling first.
TrainSepKnn = pd.DataFrame(
    KNNImputer(n_neighbors=3).fit_transform(train_data[quant_cols]),
    columns=quant_cols,
)
TestSepKnn = pd.DataFrame(
    KNNImputer(n_neighbors=3).fit_transform(test_data[quant_cols]),
    columns=quant_cols,
)

# Categorical columns are filled with their most frequent value, one column at a time.
for col in cat_cols:
    imp = SimpleImputer(strategy='most_frequent')
    TrainSepKnn[col] = imp.fit_transform(train_data[col].values.reshape(-1, 1))[:, 0]
    TestSepKnn[col] = imp.fit_transform(test_data[col].values.reshape(-1, 1))[:, 0]

TrainSepKnn['Transported'] = train_data['Transported']

In [None]:
# Iterative (model-based) imputation of the numeric columns, fitted separately on
# train and test. IterativeImputer predicts each column from the others, so it has
# to be given all numeric columns together; with a single column there is nothing
# to regress on and it simply returns the initial mean fill.
TrainSepIter = pd.DataFrame(
    IterativeImputer(random_state=0).fit_transform(train_data[quant_cols]),
    columns=quant_cols,
)
TestSepIter = pd.DataFrame(
    IterativeImputer(random_state=0).fit_transform(test_data[quant_cols]),
    columns=quant_cols,
)

# Categorical columns are filled with their most frequent value, one column at a time.
for col in cat_cols:
    imp = SimpleImputer(strategy='most_frequent')
    TrainSepIter[col] = imp.fit_transform(train_data[col].values.reshape(-1, 1))[:, 0]
    TestSepIter[col] = imp.fit_transform(test_data[col].values.reshape(-1, 1))[:, 0]

TrainSepIter['Transported'] = train_data['Transported']

In [None]:
# for pooling these, I'm going to need to unpool them later...
# Work on a copy: plain assignment would only create a second name for test_data,
# and the placeholder column below would then be added to the original frame too.
test_data_p = test_data.copy()
# The test set has no labels; NaN marks them as unknown in the pooled frame.
test_data_p["Transported"] = np.nan

In [None]:
# Tag every row with its origin so the pooled frame can be split apart again.
# Copy first so the bookkeeping column is not written into train_data itself.
train_data_p = train_data.copy()
train_data_p["TrainTest"] = "train"
test_data_p["TrainTest"] = "test"

In [None]:
# Stack the tagged training rows on top of the tagged test rows (original row labels kept).
frames_to_pool = [train_data_p, test_data_p]
pooled_data = pd.concat(frames_to_pool, axis=0)

In [None]:
pooled_data

In [None]:
# TrainPoolSimp, TestPoolSimp, TrainPoolKnn, TestPoolKnn, TrainPoolIter, TestPoolIter
# Pooled simple imputation: medians / modes are computed over train and test
# together, then the rows are split back out by their TrainTest tag.
PoolSimp = pd.DataFrame()
PoolSimp[["Transported", "TrainTest"]] = pooled_data[["Transported", "TrainTest"]]
for col in quant_cols:
    imp = SimpleImputer(strategy='median')  # few of these looked remotely symmetric
    PoolSimp[col] = imp.fit_transform(pooled_data[col].values.reshape(-1, 1))[:, 0]

for col in cat_cols:
    imp = SimpleImputer(strategy='most_frequent')
    PoolSimp[col] = imp.fit_transform(pooled_data[col].values.reshape(-1, 1))[:, 0]

# drop() returns new frames, so nothing below writes into a slice of PoolSimp
# (in-place edits on a filtered slice trigger SettingWithCopyWarning).
TrainPoolSimp = PoolSimp[PoolSimp["TrainTest"] == "train"].drop(columns="TrainTest")
# Pooling with the NaN placeholder labels of the test rows leaves Transported as a
# generic object column; restore the boolean dtype for the (fully labeled) training rows.
TrainPoolSimp["Transported"] = TrainPoolSimp["Transported"].astype(bool)
TestPoolSimp = PoolSimp[PoolSimp["TrainTest"] == "test"].drop(columns=["TrainTest", "Transported"])

In [None]:
TrainPoolSimp.head()

I'm stopping this, because it's just _boring_, maybe I'll come back and handle it later, for comparison... I'd like to just make a decent model first.

# Modeling

In [None]:
# Scoreboard for the experiments: one row per fitted model.
# (Submissions are judged on accuracy, as far as I understand.)
results_cols = [
    'model type', 'imputation', 'hyperparameters',
    'f1', 'roc_auc', 'accuracy',
]
results = pd.DataFrame(columns=results_cols)
# TrainSepSimp, TestSepSimp, TrainSepKnn, TestSepKnn, TrainSepIter, TestSepIter
# 'imputation' will be 'simple', 'knn', or 'iterative'

# since this is a binary outcome ('transported') I plan to use:
# logistic regression, kNN, random forest, gaussian naive bayes, and the MLP neural net classifier


In [None]:
# Predictors: every imputed column except the name parts and the target itself.
non_predictors = ('FirstName', 'LastName', 'Transported')
pred_cols = [name for name in TrainSepSimp.columns if name not in non_predictors]
pred_cols

In [None]:
# starting with simple imputation
# training and 'validation' sets as test set is for submission to be assessed

# Copy so the label encoding below modifies X itself, not a slice of TrainSepSimp.
X = TrainSepSimp[pred_cols].copy()
encoded_cols = ["HomePlanet", "Destination", "Deck", "Side"]
# apply() calls fit_transform once per column, so each column gets its own integer coding.
X[encoded_cols] = X[encoded_cols].apply(LabelEncoder().fit_transform)
y = TrainSepSimp["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=13013)

# NOTE(review): the predictors are unscaled, which may be why the default solver
# stops before converging — consider standardizing and/or raising max_iter.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# ROC AUC ranks cases by score, so it needs the predicted probability of the
# positive class, not the thresholded labels.
y_score = lr.predict_proba(X_val)[:, 1]
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_score)
item = ["Logistic regression", "simple", "N/A", f1, ra, accuracy]
itemdict = dict(zip(results_cols, item))
# DataFrame.append was removed in pandas 2.0; concat works on old and new versions alike.
results = pd.concat([results, pd.DataFrame([itemdict])], ignore_index=True)
print("Logistic regression\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)


In [None]:
# exactly the same but with knn imputation
# training and 'validation' sets as test set is for submission to be assessed

# Copy so the label encoding below modifies X itself, not a slice of TrainSepKnn.
X = TrainSepKnn[pred_cols].copy()
encoded_cols = ["HomePlanet", "Destination", "Deck", "Side"]
# apply() calls fit_transform once per column, so each column gets its own integer coding.
X[encoded_cols] = X[encoded_cols].apply(LabelEncoder().fit_transform)
y = TrainSepKnn["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=13013)

# NOTE(review): the predictors are unscaled, which may be why the default solver
# stops before converging — consider standardizing and/or raising max_iter.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# ROC AUC ranks cases by score, so it needs the predicted probability of the
# positive class, not the thresholded labels.
y_score = lr.predict_proba(X_val)[:, 1]
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_score)
item = ["Logistic regression", "knn", "N/A", f1, ra, accuracy]
itemdict = dict(zip(results_cols, item))
# DataFrame.append was removed in pandas 2.0; concat works on old and new versions alike.
results = pd.concat([results, pd.DataFrame([itemdict])], ignore_index=True)
print("Logistic regression\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)


In [None]:
# and again, with iterative imputation
# training and 'validation' sets as test set is for submission to be assessed

# Copy so the label encoding below modifies X itself, not a slice of TrainSepIter.
X = TrainSepIter[pred_cols].copy()
encoded_cols = ["HomePlanet", "Destination", "Deck", "Side"]
# apply() calls fit_transform once per column, so each column gets its own integer coding.
X[encoded_cols] = X[encoded_cols].apply(LabelEncoder().fit_transform)
y = TrainSepIter["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=13013)

# NOTE(review): the predictors are unscaled, which may be why the default solver
# stops before converging — consider standardizing and/or raising max_iter.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# ROC AUC ranks cases by score, so it needs the predicted probability of the
# positive class, not the thresholded labels.
y_score = lr.predict_proba(X_val)[:, 1]
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
ra = roc_auc_score(y_val, y_score)
item = ["Logistic regression", "iterative", "N/A", f1, ra, accuracy]
itemdict = dict(zip(results_cols, item))
# DataFrame.append was removed in pandas 2.0; concat works on old and new versions alike.
results = pd.concat([results, pd.DataFrame([itemdict])], ignore_index=True)
print("Logistic regression\n", "Accuracy:", accuracy, "f1:", f1, "roc_auc:", ra)


In [None]:
results

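It's worrisome that in all cases, we have a failure of convergence.  Interestingly, the knn and iterative imputation methods produced the same assessment metrics for logistic regression.  Is it possible that these imputers yielded exactly the same results?  How can I check this?  A direct comparison such as `TrainSepKnn.equals(TrainSepIter)` would answer it; it's also worth remembering that any imputer fitted on a single column at a time has no other features to learn from and can only fall back on that column's mean, which would make the two datasets identical.  It would be silly to go and fit models to both datasets if they're identical.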