In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import os, gc, re, warnings
warnings.filterwarnings("ignore")

- PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
- CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- Destination - The planet where the passenger will disembark.
- Age - The age of the passenger.
- VIP - Whether the passenger has paid for special VIP service during the voyage.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- Name - The first and last names of the passenger.
- Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# Load the competition training data.
# read_csv already returns a DataFrame, so it is not wrapped in pd.DataFrame(...) again.
df = pd.read_csv('../input/spaceship-titanic/train.csv')
# Preview the first rows only, like the other cells in this notebook do.
df.head()

In [None]:
# Cabin is encoded as "deck/num/side", where side is P (Port) or S (Starboard).
# Split it into three separate columns on df.
cabin = df['Cabin']
df['CabinDeck'] = cabin.str.get(0)                     # first character of the string
df['CabinNumber'] = cabin.str.split('/').str.get(1)    # middle field, still stored as text
df['CabinSide'] = cabin.str.get(-1)                    # last character of the string
df.head()

In [None]:
def _plot_counts_by_outcome(column, title, ax):
    """Draw a stacked bar chart of passenger counts per `column` value, split by Transported."""
    df.groupby([column, 'Transported']).size().unstack().plot.bar(title=title, stacked=True, ax=ax, rot=0)


def _plot_hist_by_outcome(column, title, ax, value_range=None):
    """Overlay 10-bin histograms of `column` for non-transported and transported passengers.

    Both groups always receive the same options. When `value_range` is given it is
    applied to both, so the two semi-transparent histograms share bin edges; when it
    is omitted each group is binned over its own min/max.
    """
    options = dict(grid=True, alpha=.5, bins=10, ax=ax, legend=True)
    if value_range is not None:
        options['range'] = value_range
    df.loc[df['Transported'] == False, column].plot.hist(title=title, label='Not transported', **options)
    df.loc[df['Transported'] == True, column].plot.hist(label='Transported', **options)


# 5x2 grid: one panel per feature, each broken down by the Transported target.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(5, 2, figsize=(15, 30))

_plot_counts_by_outcome('HomePlanet', 'Home Planet', ax1)
_plot_counts_by_outcome('CryoSleep', 'Cryo-Sleep', ax2)
_plot_counts_by_outcome('Destination', 'Destination', ax3)
_plot_hist_by_outcome('Age', 'Age', ax4)
_plot_counts_by_outcome('VIP', 'VIP', ax5)
# The spending panels are clipped to [0, 4000] for BOTH groups; previously the
# transported Room Service histogram was missing the range and used different bins.
_plot_hist_by_outcome('RoomService', 'Room Service', ax6, value_range=[0, 4000])
_plot_hist_by_outcome('ShoppingMall', 'Shopping Mall', ax7, value_range=[0, 4000])
_plot_hist_by_outcome('VRDeck', 'VR Deck', ax8, value_range=[0, 4000])
# Overall balance of the target.
df['Transported'].value_counts().plot.bar(title='Transported', ax=ax9, rot=0)
# Nine plots on a ten-slot grid: hide the unused last panel instead of showing an empty frame.
ax10.set_visible(False)

plt.tight_layout()

In [None]:
# Passengers with a parsed cabin, computed once and reused by all three panels.
# Only the cabin columns are checked for missing values, so a gap in an unrelated
# column no longer removes a passenger from these charts.
cabin_df = df.dropna(subset=['CabinDeck', 'CabinNumber', 'CabinSide'])
# Masks are built from the filtered frame itself so that mask and data share one index.
not_transported = cabin_df['Transported'] == False
transported = cabin_df['Transported'] == True
# CabinNumber is stored as text; cast it so it can be binned numerically.
cabin_number = cabin_df['CabinNumber'].astype('int')

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 30))
cabin_df.groupby(['CabinDeck', 'Transported']).size().unstack().plot.bar(title='Cabin Deck', stacked=True, ax=ax1, rot=0)
cabin_number[not_transported].plot.hist(grid=True, title='Cabin Number', ax=ax2, alpha=.5, bins=10,
                                        label='Not transported', legend=True)
cabin_number[transported].plot.hist(grid=True, ax=ax2, alpha=.5, bins=10,
                                    label='Transported', legend=True)
cabin_df.groupby(['CabinSide', 'Transported']).size().unstack().plot.bar(title='Cabin Side', stacked=True, ax=ax3, rot=0)
plt.tight_layout()

In [None]:

# Heatmap of pairwise correlations between the numeric and boolean columns.
# The non-numeric columns are excluded explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops them silently and raises when it meets string data.
plt.figure(figsize=(19, 10))
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True)
plt.title('Data correlation heatmap')
plt.show()

In [None]:
# Total amount billed across the five amenities.
# Plain "+" is used on purpose: a missing value in any amenity makes the total missing too.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total = df[spend_columns[0]]
for amenity in spend_columns[1:]:
    total = total + df[amenity]
df['TotalSpent'] = total
df.head()

In [None]:
# Overlay the TotalSpent distributions of non-transported and transported passengers,
# using the same 30 bins over [0, 12500] for both groups.
hist_style = dict(grid=True, alpha=.5, bins=30, range=[0,12500])
is_not_transported = df['Transported'] == False
is_transported = df['Transported'] == True
df.loc[is_not_transported, 'TotalSpent'].plot.hist(title='Total Spent', **hist_style)
df.loc[is_transported, 'TotalSpent'].plot.hist(**hist_style)
plt.tight_layout()

In [None]:
# Show the column labels of df, which by now include the derived Cabin* and TotalSpent columns.
df.columns

In [None]:
from sklearn.model_selection import train_test_split

# Keep only passengers whose cabin number is known, and store that number as a real
# numeric column. It was split out of the Cabin string, so until here it held text and
# the numeric preprocessing downstream would have had to coerce it implicitly.
# assign() returns a new frame, so df itself is left untouched.
df2 = (
    df[df['CabinNumber'].notna()]
    .assign(CabinNumber=lambda frame: pd.to_numeric(frame['CabinNumber']))
)

# Model inputs and prediction target.
features = df2[['HomePlanet', 'CryoSleep', 'Destination', 'Age',
                'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
                'VRDeck', 'CabinDeck', 'CabinNumber', 'CabinSide', 'TotalSpent']]
target = df2['Transported']

# Hold out a test set using train_test_split's default proportions;
# the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    random_state=42)

print(X_train.shape, y_train.shape)

In [None]:
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import f1_score

from sklearn.neural_network import MLPClassifier






# Numeric inputs: fill missing values with the column median, then standardise.
# NOTE(review): 'ShoppingMall' is selected as a feature earlier in the notebook but is
# listed in neither group below, so the ColumnTransformer (whose remainder defaults to
# 'drop') discards it. Confirm whether that omission is intentional.
num_features = ['Age', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'TotalSpent', 'CabinNumber']
num_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical inputs: one-hot encode; categories unseen during fit are ignored at predict time.
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide']
cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

# Preprocessing and classifier live in one pipeline, so the imputer, scaler and encoder
# are fitted on the training split only.
# random_state pins the network's random initialisation so that re-running the
# notebook reproduces the same scores.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ]
)

clf.fit(X_train, y_train)

pred = clf.predict(X_test)
# F1 of the positive (Transported == True) class. The previous micro average is
# identical to accuracy for a single-label binary target, so it merely repeated
# the "model score" line below.
f1 = f1_score(y_test, pred, pos_label=True)

print("model score: %.3f" % clf.score(X_test, y_test))
print('F1 score:', f1)