In [None]:
import numpy as np
import pandas as pd

from IPython.core.display import display, HTML
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

import helper1 as h
import seaborn as sns
from importlib import reload
import matplotlib.pyplot as plt
import matplotlib
import warnings

# Configure Jupyter Notebook
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', 500) 
pd.set_option('display.expand_frame_repr', False)
# pd.set_option('max_colwidth', -1)
display(HTML("<style>div.output_scroll { height: 35em; }</style>"))

reload(plt)
%matplotlib inline
%config InlineBackend.figure_format ='retina'

warnings.filterwarnings('ignore')

pio.renderers.default = 'iframe'

pio.templates['ck_template'] = go.layout.Template(
    layout_autosize=False,
    layout_width=800,
    layout_height=600,
    layout_font=dict(family='Calibri Light'),
    layout_title_font = dict(family='Calibri'),
    layout_hoverlabel_font = dict(family='Calibri Light')
)

pio.templates.default = 'ck_template+gridon'

In [None]:
# Load the competition CSVs (numpy/pandas are re-imported so this cell also runs on its own).
import numpy as np
import pandas as pd

BASE_DIR = '../input/spaceship-titanic/'
data = pd.read_csv(BASE_DIR + "train.csv")
test = pd.read_csv(BASE_DIR + "test.csv")
sample_submission = pd.read_csv(BASE_DIR + "sample_submission.csv")

# Keep an independent copy of the raw test frame: it supplies PassengerId for the
# submission file, so it must not change if `test` is later modified in place.
test_for_boost = test.copy()

In [None]:
# Pass both frames through the project helper and rebind the names to its outputs.
# NOTE(review): h.fit_generator lives in helper1 and is not visible here — confirm what it does.
# The cell overwrites its own inputs, so re-running it feeds already-processed frames back in.
data, test = h.fit_generator(data, test)

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data.describe(include='all')

In [None]:
# Passengers under 17 look unlikely to be VIPs, but this is of little practical use
# because most missing VIP values end up classified as False anyway.
import plotly.express as px
has_age_and_vip = data.Age.notnull() & data.VIP.notnull()
df = data.loc[has_age_and_vip]
fig = px.histogram(df, x='Age', color='VIP')
fig.show()

In [None]:
# Children aged 12 or under appear to be barred from spending.
import plotly.express as px
# NOTE(review): TotalSpended is presumably added by h.preprocess_add_features — confirm in helper1.
data, test = h.preprocess_add_features(data, test)
has_age_and_total = data.Age.notnull() & data.TotalSpended.notnull()
df = data.loc[has_age_and_total]
fig = px.histogram(df, x='Age', color='TotalSpended')
fig.show()

In [None]:
# The spend totals per age roughly follow a bell curve, which suggests filling
# missing values from neighbouring rows.
import plotly.express as px
import plotly.graph_objects as go

spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpended']

# Sum only the spend columns; aggregating the whole frame would also try to
# "sum" the text columns, which is slow at best and an error at worst.
df = data.groupby('Age')[spend_columns].sum().reset_index()

fig = go.Figure()
for column in spend_columns:
    # go.Scatter replaces the deprecated go.Line alias; x is the real age
    # instead of the implicit row position.
    fig.add_trace(go.Scatter(x=df['Age'], y=df[column], name=column))

fig.show()

In [None]:
# Rows with a known Age but a missing TotalSpended, back-filled from the next such row.
# The final row is dropped — presumably because back-fill has nothing after it to copy from.
# Built as one chain so nothing is filled in place on a slice of `data`.
df2 = (
    data.loc[data.Age.notnull() & data.TotalSpended.isnull()]
    .bfill()
    .iloc[:-1, :]
)
df2

In [None]:
# Check the back-filled sample: per-age spend totals should keep a similar shape.
sample_spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Sum only the spend columns so text columns are never aggregated.
# Note: this rebinds df2 to the aggregated frame.
df2 = df2.groupby('Age')[sample_spend_columns].sum().reset_index()

fig = go.Figure()
for column in sample_spend_columns:
    # go.Scatter replaces the deprecated go.Line alias; x is the real age.
    fig.add_trace(go.Scatter(x=df2['Age'], y=df2[column], name=column))

fig.show()

In [None]:
# Preview the first rows of the training frame after feature engineering.
data.head()

In [None]:
# Rule-based imputation, applied identically to the train and test frames:
#   1. A missing CryoSleep becomes True when the passenger spent nothing.
#   2. Missing spend values of children (Age <= 12) become 0.
spend_cols = ['FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']

for frame in (data, test):
    no_spend_unknown_sleep = frame['CryoSleep'].isnull() & (frame['TotalSpended'] <= 0)
    frame.loc[no_spend_unknown_sleep, 'CryoSleep'] = True

    # The previous `'FoodCourt' or 'RoomService' or ...` expression evaluated to
    # just 'FoodCourt', so only that column's nulls were ever inspected (and the
    # matching rows had all five columns overwritten). Fill every spend column's
    # missing values, and only the missing ones.
    is_child = frame['Age'] <= 12
    frame.loc[is_child, spend_cols] = frame.loc[is_child, spend_cols].fillna(0)

In [None]:
# Fill each remaining NaN with the next valid value further down the same column.
# Trailing NaNs (nothing below them) are left as they are.
data, test = data.bfill(), test.bfill()

In [None]:
# Inspect the distinct values of the categorical candidate columns.
for column in ('HomePlanet', 'Cabin', 'Destination'):
    print(data[column].unique())

In [None]:
# Per-category row counts, used to choose a fill value for missing entries.
from IPython.display import display
for key in ('HomePlanet', 'Destination'):
    display(data.groupby(key).count())

In [None]:
# Fill whatever is still missing with the most frequent category (values picked
# by hand from the group counts). The frames are rebound rather than calling
# Series.fillna(inplace=True) through attribute access, which does not write back
# to the parent frame under pandas copy-on-write.
mode_fill = {'HomePlanet': 'Earth', 'Destination': 'TRAPPIST-1e'}

data = data.fillna(value=mode_fill)
test = test.fillna(value=mode_fill)

In [None]:
# Re-check the distinct values after filling.
for column in ('HomePlanet', 'Cabin', 'Destination'):
    print(data[column].unique())

In [None]:
# How many distinct Cabin values are there, and is every row's cabin unique?
n_distinct_cabins = len(data.Cabin.unique())
n_distinct_cabins, n_distinct_cabins == len(data)

In [None]:
# Row counts per value of columns A, B and C, used to find each column's mode.
for key in ('A', 'B', 'C'):
    display(data.groupby(key).count())

In [None]:
# Fill missing A / B / C values with hand-picked modes. The frames are rebound
# rather than using chained Series.fillna(inplace=True), which does not write
# back to the parent frame under pandas copy-on-write.
cabin_fill = {'A': 'F', 'B': 0, 'C': 'S'}

data = data.fillna(value=cabin_fill)
test = test.fillna(value=cabin_fill)

In [None]:
# Confirm no NaN is left among the distinct values of A, B and C.
print(*(data[column].unique() for column in ('A', 'B', 'C')))

In [None]:
# Preview the first rows of the training frame after all missing-value fills.
data.head()

In [None]:
# List the available columns before choosing the model features.
data.columns

In [None]:
# Separate the target from the features and keep only the modelling columns.
# NOTE: this cell is not re-runnable — once `data` is narrowed, `Transported` is gone.
labels = data.Transported

# One shared list guarantees train and test keep exactly the same columns in the same order.
feature_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'Consumption', 'AgeBin',
                   'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpended',
                   'A', 'B', 'C']

# .copy() detaches the selections so later column assignments act on independent
# frames instead of slices of the originals.
data = data[feature_columns].copy()
test = test[feature_columns].copy()

In [None]:
# Show the selected feature frame (row output is capped by the display.max_rows option set at the top).
display(data)

In [None]:
# Number of distinct values in column A (a NaN, if present, would count as one value).
len(data.A.unique())

In [None]:
# Store column B as integers in both frames, then review the resulting dtypes.
data['B'] = data['B'].astype('int')
test['B'] = test['B'].astype('int')
data.info()

In [None]:
# One-hot encode the categorical columns.
data = pd.get_dummies(data)
# Encoding the two frames separately can yield different columns when a category
# occurs in only one of them. Align test to the training columns: indicators
# missing from test become 0, and test-only indicators are dropped.
test = pd.get_dummies(test).reindex(columns=data.columns, fill_value=0)

In [None]:
# List the columns produced by one-hot encoding.
data.columns

In [None]:
# Cast every feature to integer before scaling (fractional values are truncated).
data, test = (frame.astype('int') for frame in (data, test))

In [None]:
# Standardise the features: learn mean/variance on the training data only and
# apply that same transform to the test data.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
X = scaler.fit_transform(data)
# transform, not fit_transform: re-fitting on the test set would scale it with
# different statistics than the data the models are trained on.
test = scaler.transform(test)

In [None]:
# CAUTION: train_test_split returns (X_train, X_test, y_train, y_test). With the
# unpacking order used here the names therefore mean:
#   X_train -> training features      y_train -> held-out features
#   X_test  -> training labels        y_test  -> held-out labels
# Every later cell relies on these names, so they are kept as they are.
# random_state pins the split so scores are reproducible across kernel restarts.
X_train, y_train, X_test, y_test = train_test_split(X, labels, test_size = 0.25, shuffle=True, random_state=42)

The data appear to have a binary class structure, loosely separated by the middle blue line.

In [None]:
# Hierarchical (Ward) clustering of the first 200 rows of X_train, to get a feel for the structure.
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

linkage_type = 'ward'
linkage_matrix = linkage(X_train[:200, :], method=linkage_type)

plt.figure(figsize=(22, 18))
dendrogram(linkage_matrix)
plt.show()

In [None]:
# The same 200 rows as a seaborn clustermap, which is easier to read than the bare dendrogram.
import seaborn as sns

sample_rows = X_train[:200, :]
sns.clustermap(sample_rows, method=linkage_type, cmap='viridis', figsize=(12, 18))
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score

# Three linkage strategies with 200 clusters each, all fitted on X_train.
complete = AgglomerativeClustering(n_clusters=200, linkage='complete')
average = AgglomerativeClustering(n_clusters=200, linkage='average')
ward = AgglomerativeClustering(n_clusters=200)

complete_pred, average_pred, ward_pred = (
    clusterer.fit_predict(X_train) for clusterer in (complete, average, ward)
)

# X_test holds the labels of the X_train rows (see the unpacking order of the split).
reference_labels = X_test.astype('int')
complete_score, average_score, ward_score = (
    adjusted_rand_score(reference_labels, cluster_ids)
    for cluster_ids in (complete_pred, average_pred, ward_pred)
)

print('Scores: \nWard:', ward_score, '\nAverage:', average_score, '\nComplete:', complete_score)

In [None]:
from sklearn.cluster import KMeans

def k_fit(k):
    """Fit KMeans on X_train for 1..k clusters and return the absolute score of each fit.

    `KMeans.score` is the negated K-means objective, so its absolute value is
    used here as the "distance" plotted against the number of centroids.
    """
    def objective_for(n_clusters):
        fitted = KMeans(n_clusters).fit(X_train)
        return abs(fitted.score(X_train))

    return [objective_for(n_clusters) for n_clusters in range(1, k + 1)]

k=20
scores = k_fit(k)
centroid_counts = range(1, k + 1)
plt.plot(centroid_counts, scores, '--', marker='o', color='b')
plt.xlabel('Centroids')
plt.ylabel('Average Distance From Centroids')
plt.title('Scree Plot')

A possible reason why unsupervised learning fails to converge to a clear number of clusters is that passengers on board generally follow similar patterns of behavior, unlike, for example, the Iris dataset, whose features vary clearly from one species to another.

In [None]:
# Drop features with low contribution based on feature importance analysis.
# Here y_train is the held-out *feature* matrix (see the unpacking order of the
# split), which is why it loses the same columns as X_train and test.
# Not re-runnable: every run removes three more columns.
del_ = [17, 20, 24]
X_train, test, y_train = (
    np.delete(matrix, del_, axis=1) for matrix in (X_train, test, y_train)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def train(models, X_train, X_test, n_estimators=None, min_samples_leaf=None):
    """Fit one classifier per class in `models` and print its held-out accuracy.

    Parameters
    ----------
    models : iterable of estimator classes (not instances).
    X_train : training feature matrix.
    X_test : training labels. The name follows the notebook's split unpacking,
        where `X_test` holds the labels that belong to `X_train`.
    n_estimators : ensemble size; defaults to 200 when None.
    min_samples_leaf : minimum samples per leaf; defaults to 3 when None.

    Returns
    -------
    list of fitted estimators, in the same order as `models`.

    Accuracy is measured on the notebook-level `y_train` (held-out features)
    and `y_test` (held-out labels).
    """
    # These two arguments used to be accepted but ignored; None keeps the old values.
    n_estimators = 200 if n_estimators is None else n_estimators
    min_samples_leaf = 3 if min_samples_leaf is None else min_samples_leaf

    model_list = []
    for model_cls in models:
        if model_cls is DecisionTreeClassifier:
            model = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, max_depth=200)
        elif model_cls is AdaBoostClassifier:
            base_tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, max_depth=25)
            try:
                # Keyword used by current scikit-learn releases.
                model = model_cls(estimator=base_tree, n_estimators=n_estimators)
            except TypeError:
                # Older releases only know the former keyword name.
                model = model_cls(base_estimator=base_tree, n_estimators=n_estimators)
        else:
            model = model_cls(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf)

        model.fit(X_train, X_test)
        prediction = model.predict(y_train)
        score = accuracy_score(y_test, prediction)
        print(model, score)
        model_list.append(model)
    return model_list

results = train([RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, DecisionTreeClassifier],
      X_train,
      X_test)

In [None]:
# Idea: append each row's cluster id as an extra feature and check whether the models improve.
def _append_cluster_feature(features, n_clusters=10):
    """Return `features` with one extra column holding its standardised Ward-cluster id."""
    cluster_ids = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(features)
    # A fresh scaler keeps the notebook-level `scaler` (fitted on the real features) untouched.
    scaled_ids = StandardScaler().fit_transform(cluster_ids.reshape(-1, 1))
    return np.hstack((features, scaled_ids))

# Fixes over the previous version of this cell:
#   * X_added now uses the 10-cluster ids computed here, not the stale 200-cluster
#     `ward_pred` left over from an earlier cell.
#   * The `del_` columns are not deleted a second time: X_train, y_train and test
#     were already pruned, so a second np.delete removed unrelated columns.
# NOTE(review): each matrix is clustered independently, so a given cluster id does
# not necessarily mean the same thing in X_added, y_added and t_added.
X_added = _append_cluster_feature(X_train)
y_added = _append_cluster_feature(y_train)   # y_train holds the held-out features
t_added = _append_cluster_feature(test)

def train2(models, X_train, X_test, n_estimators=None, min_samples_leaf=None):
    """Like `train`, but evaluates on the cluster-augmented held-out set `y_added`.

    Parameters
    ----------
    models : iterable of estimator classes (not instances).
    X_train : training feature matrix (expected to include the cluster column).
    X_test : training labels (name follows the notebook's split unpacking).
    n_estimators : ensemble size; defaults to 200 when None.
    min_samples_leaf : minimum samples per leaf; defaults to 3 when None.

    Returns
    -------
    list of fitted estimators, in the same order as `models`.

    Accuracy is measured on the notebook-level `y_added` and `y_test`.
    """
    n_estimators = 200 if n_estimators is None else n_estimators
    min_samples_leaf = 3 if min_samples_leaf is None else min_samples_leaf

    model_list = []
    for model_cls in models:
        if model_cls is DecisionTreeClassifier:
            model = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, max_depth=200)
        elif model_cls is AdaBoostClassifier:
            base_tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, max_depth=25)
            try:
                # Keyword used by current scikit-learn releases.
                model = model_cls(estimator=base_tree, n_estimators=n_estimators)
            except TypeError:
                # Older releases only know the former keyword name.
                model = model_cls(base_estimator=base_tree, n_estimators=n_estimators)
        else:
            model = model_cls(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf,
                              min_samples_split=2)

        model.fit(X_train, X_test)
        prediction = model.predict(y_added)
        score = accuracy_score(y_test, prediction)
        print(model, score)
        model_list.append(model)
    return model_list
results2 = train2([RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, DecisionTreeClassifier],
      X_added,
      X_test)

It looks like adding the cluster labels as a feature makes the models more effective.

In [None]:
# Use grid search to find the best random-forest parameters.
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()
parameters = {'n_estimators':[100, 150, 200],
              'min_samples_leaf': [3, 5, 7, 9]}

# Naming follows the split unpacking: X_test = training labels,
# y_train = held-out features, y_test = held-out labels.
train_labels = X_test.astype('int')
holdout_labels = y_test.astype('int')

scorer = make_scorer(f1_score)
grid_obj = GridSearchCV(clf, scoring=scorer, param_grid=parameters)
print('done1')
grid_obj.fit(X_train, train_labels)
print('done2')
# GridSearchCV refits the winning configuration on the full training data by
# default, so best_estimator_ is already fitted and needs no second fit.
best_clf = grid_obj.best_estimator_
print('done3')
prediction = best_clf.predict(y_train)
# Metrics take (y_true, y_pred).
print(accuracy_score(holdout_labels, prediction), f1_score(holdout_labels, prediction))

In [None]:
# Write the first model returned by train() (the random forest) applied to the test set.
rf_model = results[0]
rf_test_predictions = rf_model.predict(test)
np.savetxt('result.txt', rf_test_predictions, fmt='%s')

In [None]:
# Write the grid-searched random forest's test-set predictions, one value per line.
np.savetxt('result2.txt', best_clf.predict(test), fmt='%s')

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train one SVC per random split to see how much accuracy varies with the split.
# The split is held in loop-local names so the notebook-wide
# X_train / y_train / X_test / y_test are no longer silently replaced by
# whichever split the last iteration happened to draw.
epochs = 10
model_list = []
scores = []
for e in range(epochs):
    # train_test_split returns (train features, test features, train labels, test labels).
    svc_fit_X, svc_eval_X, svc_fit_y, svc_eval_y = train_test_split(X, labels, test_size = 0.25, shuffle=True)
    # Drop the same low-importance columns the other models were trained without.
    svc_fit_X = np.delete(svc_fit_X, del_, axis=1)
    svc_eval_X = np.delete(svc_eval_X, del_, axis=1)
    model = SVC()
    model.fit(svc_fit_X, svc_fit_y)
    prediction = model.predict(svc_eval_X)
    score = accuracy_score(svc_eval_y, prediction)
    scores.append(score)
    model_list.append(model)
    print(score)

In [None]:
# Write test-set predictions of `model` — when cells run top to bottom this is the SVC
# fitted in the last iteration of the loop above.
np.savetxt('result3.txt', model.predict(test), fmt='%s')

In [None]:
# Curious about how XGBoost performs.
from xgboost import XGBClassifier
# `min_samples_leaf` is a scikit-learn tree option, not an XGBoost parameter, so
# it was dropped. NOTE(review): XGBoost's closest control is `min_child_weight`;
# add it explicitly if leaf-size regularisation is wanted.
model = XGBClassifier(n_estimators=800, max_depth=50, n_jobs=20)
# Naming follows the split unpacking: X_test = training labels,
# y_train = held-out features, y_test = held-out labels.
model.fit(X_train, X_test)
prediction = model.predict(y_train)
score = accuracy_score(y_test, prediction)
print(model, score, sep='\n')

In [None]:
# Write test-set predictions of `model` — when cells run top to bottom this is the
# XGBClassifier fitted in the previous cell.
np.savetxt('result4.txt', model.predict(test), fmt='%s')

Leaderboard score: 0.80640

In [None]:
# Majority-vote ensembling of the fitted models.
import collections
def optimize(models, test, X_test, t_added=None):
    """Combine the models' predictions by majority vote.

    INPUT:
        models  - fitted estimators exposing ``predict``
        test    - test feature matrix
        X_test  - unused; kept so existing calls keep working
        t_added - optional test matrix with the extra cluster-label column,
                  tried for models that cannot predict on ``test``
    OUTPUT:
        boolean array with one entry per row of ``test``: True where strictly
        more models voted True than False (ties resolve to False)
    """
    n_models = len(models)
    table = np.zeros((n_models, len(test)))
    for i, m in enumerate(models):
        try:
            p = m.predict(test)
        except Exception:
            # Typically a feature-count mismatch for models trained on the
            # augmented matrix. Without a fallback matrix there is nothing to
            # retry with, so surface the original error instead of masking it.
            if t_added is None:
                raise
            p = m.predict(t_added)
        table[i] = np.asarray(p).astype('bool').reshape(-1)
    votes_true = table.sum(axis=0)
    votes_false = n_models - votes_true
    return votes_true > votes_false

In [None]:
# Final ensemble: random forest and gradient boosting (with and without the cluster
# feature), the SVCs from the repeated splits, the grid-searched forest, and the
# most recently fitted `model`.
# A separate name is used so re-running this cell does not keep growing
# `model_list` and skew the vote.
ensemble_models = [results[0], results[2], results2[0], results2[2], *model_list, best_clf, model]
predictions = optimize(ensemble_models, test, X_test, t_added)
submission = pd.DataFrame({
    'PassengerId': test_for_boost['PassengerId'],
    'Transported': predictions,
})
submission.to_csv('submission.csv', index=False)