In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import ML packages
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from catboost import CatBoostClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Intro to Spaceship Titanic Data

In [None]:
# Load the Spaceship Titanic train and test CSV files into DataFrames.
train_csv = '/kaggle/input/spaceship-titanic/train.csv'
test_csv = '/kaggle/input/spaceship-titanic/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

**Create temp set for null cabins**

In [None]:
# Replace missing train cabins with a sentinel that still has the "x/y/z" shape,
# so later cells can both split it and find these rows again.
train["Cabin"] = train["Cabin"].fillna("Z/9999/Z")

# Preliminary EDA

In [None]:
# note: check whether every train passenger flagged as both CryoSleep and VIP
# has Transported == True (an empty selection also yields True).
print("All passengers in CryoSleep & VIP were transported:")
cryo_and_vip = (train['CryoSleep'] == True) & (train['VIP'] == True)
bool((train.loc[cryo_and_vip].Transported == True).all())

**Extract Cabin Features**

In [None]:

# Fallback values drawn at random when a cabin part is missing or unparsable.
list0 = ['F','G','E','B','D','A','T']
list1 = ['S','P']

# Dedicated, seeded generator so the random fills are identical on every run.
_cabin_rng = random.Random(42)


def add_cabin_features(frame):
    """Split ``frame.Cabin`` ("x/y/z") into Cabin_x, Cabin_y and Cabin_z columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``Cabin`` column. The three new columns are written
        onto this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.

    Notes
    -----
    * A null cabin gets a random fill for all three parts. Previously the
      first part could never fail, so a null cabin ended up with the literal
      string ``'nan'`` as its Cabin_x value.
    * Non-null values are parsed exactly as before: part 0 is taken verbatim,
      part 1 must parse as an int, part 2 is taken verbatim; a missing or
      unparsable part is filled at random.
    * Cabin_y is stored as float, matching the dtype produced by the previous
      cell-by-cell assignment.
    """
    decks, numbers, sides = [], [], []
    for value in frame.Cabin:
        parts = [] if pd.isnull(value) else str(value).split('/')

        decks.append(parts[0] if parts else _cabin_rng.choice(list0))

        try:
            numbers.append(int(parts[1]))
        except (IndexError, ValueError):
            # No second part, or it is not a number.
            numbers.append(_cabin_rng.randint(0, 1894))

        sides.append(parts[2] if len(parts) > 2 else _cabin_rng.choice(list1))

    # Assign whole columns at once instead of growing them cell by cell.
    frame['Cabin_x'] = decks
    frame['Cabin_y'] = pd.Series(numbers, index=frame.index, dtype=float)
    frame['Cabin_z'] = sides
    return frame


# train
add_cabin_features(train)

#test
add_cabin_features(test)

In [None]:
# eliminate train nulls
#
# Each filled column is assigned back to the frame. The previous form,
# train[col].fillna(..., inplace=True), operates on a temporary Series and
# leaves `train` untouched when pandas Copy-on-Write is active.

# Numeric columns: fill with the column median.
for col in ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    train[col] = train[col].fillna(train[col].median())

# VIP holds True/False flags, so fill with the most frequent flag rather than
# taking a median that depends on implicit object-to-float coercion.
train["VIP"] = train["VIP"].fillna(train["VIP"].mode()[0])

# Remaining columns: fill with a fixed placeholder value.
placeholder_fills = {
    "Name": "John Doe",
    "HomePlanet": "Earth",
    "CryoSleep": True,  # or False
    "Destination": "TRAPPIST-1e",
}
for col, placeholder in placeholder_fills.items():
    train[col] = train[col].fillna(placeholder)

train.info()

**Cabin: ML Feature Engineering**

In [None]:
# deal with the unknowns using ML
# Rows whose cabin was missing carry the "Z/9999/Z" sentinel; count them.
null_cabin_rows = train.loc[train.Cabin == "Z/9999/Z"]
print(len(null_cabin_rows), "null cabins (i.e. missing features)")

In [None]:
# create cabin_ml datasets
#
# Goal: predict Cabin_x / Cabin_z for the rows whose cabin was missing
# (marked with the "Z/9999/Z" sentinel) from the other passenger features.
cabin_id_cols = ['PassengerId','Name']
cabin_feature_cols = ['HomePlanet','CryoSleep','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported']
cabin_target_cols = ['Cabin_x','Cabin_y','Cabin_z']
cabin_encoded_cols = ['HomePlanet','Destination','CryoSleep','VIP','Transported']

cabin_is_unknown = train.Cabin == "Z/9999/Z"

# Train only on rows with a real cabin. Including the sentinel rows would
# teach the models to predict the placeholder 'Z' for exactly the rows we
# are trying to fill in.
cabin_train = train.loc[~cabin_is_unknown, cabin_id_cols + cabin_feature_cols + cabin_target_cols]
cabin_test = train.loc[cabin_is_unknown, cabin_id_cols + cabin_feature_cols]

# create cabin_train ML sets
lb_make = LabelEncoder()
train_ml = cabin_train.drop(columns=cabin_target_cols).copy()
test_ml = cabin_test.copy()

# categorize/encode
# Fit each encoder once on the full column and apply the same mapping to both
# subsets, so a given category gets the same integer code in each.
for col in cabin_encoded_cols:
    lb_make.fit(train[col])
    train_ml[col] = lb_make.transform(train_ml[col])
    test_ml[col] = lb_make.transform(test_ml[col])

X_train = train_ml.set_index(cabin_id_cols)
X_test = test_ml.set_index(cabin_id_cols)
y_x = cabin_train.Cabin_x.to_numpy()
y_z = cabin_train.Cabin_z.to_numpy()

# normalize
# Both subsets are rescaled with the training min/max so that equal raw values
# map to equal model inputs. A constant column would give a zero span; use 1
# there to avoid dividing by zero.
cabin_feature_min = X_train.min()
cabin_feature_span = (X_train.max() - cabin_feature_min).replace(0, 1)
X_train = (X_train - cabin_feature_min) / cabin_feature_span
X_test = (X_test - cabin_feature_min) / cabin_feature_span


def fit_cabin_classifier(features, target):
    """Fit the gradient-boosting classifier used to predict one cabin part.

    `loss` is deliberately left at the library default: the old spelling
    'deviance' was deprecated in scikit-learn 1.1 and removed in 1.3, and the
    default is the same loss in both old and new releases.
    """
    model = GradientBoostingClassifier(criterion="friedman_mse",
                                       learning_rate=0.1, max_depth=11,
                                       min_samples_leaf=60,
                                       min_samples_split=1200,
                                       n_estimators=100, random_state=42,
                                       subsample=1.0, verbose=0)
    return model.fit(features, target)


# Gradient Boosting to generate cabin null features
clf_x = fit_cabin_classifier(X_train, y_x)
cabin_x_predictions = clf_x.predict(X_test)

clf_z = fit_cabin_classifier(X_train, y_z)
cabin_z_predictions = clf_z.predict(X_test)

# Attach the predicted cabin parts to the passenger ids, then join them onto
# the scaled features and integer-encode the predicted labels for display.
X_result = pd.concat([cabin_test.reset_index(drop=False),pd.DataFrame({'Cabin_x':cabin_x_predictions,'Cabin_z':cabin_z_predictions})],axis=1)[['PassengerId','Name','Cabin_x','Cabin_z']]
X = X_test.merge(X_result,on=['PassengerId','Name'])
X['Cabin_x'] = lb_make.fit_transform(X['Cabin_x'])
X['Cabin_z'] = lb_make.fit_transform(X['Cabin_z'])
X

In [None]:

# Predict the numeric cabin part with a shallow random forest; every distinct
# Cabin_y value is treated as its own class label.
y_y = cabin_train.Cabin_y.to_numpy()
clf_y = RandomForestClassifier(max_depth=2, random_state=0)
clf_y.fit(X_train, y_y)
cabin_y_predictions = clf_y.predict(X_test)
cabin_y_predictions

In [None]:
# Stack the null-cabin rows and the merged prediction frame row-wise for inspection.
pd.concat([cabin_test, X], axis=0)

# Apply ML Modeling

**Encoding**

In [None]:
# LabelEncoder is already imported in the notebook's first cell.
lb_make = LabelEncoder()
train_ml = train.drop(columns=['Cabin']).copy()

# categorize/encode: replace each listed column with integer codes.
# The single encoder is re-fitted per column, in this order.
columns_to_encode = (
    'HomePlanet',
    'Cabin_x',
    'Cabin_y',
    'Cabin_z',
    'Destination',
    'CryoSleep',
    'VIP',
    'Transported',
)
for column in columns_to_encode:
    train_ml[column] = lb_make.fit_transform(train_ml[column])

train_ml

In [None]:
# set up X,y for ML
# Index by passenger identity, split off the target, keep the rest as features.
indexed_train = train_ml.set_index(['PassengerId','Name'])
y = indexed_train['Transported'].to_numpy()
X = indexed_train.drop(columns=['Transported'])
(
    X.sample(10)
    .style.background_gradient(cmap ='coolwarm')
    .set_properties(**{'font-size': '8px'})
)

In [None]:
# Show the encoded Cabin_y column as a raw array.
train_ml['Cabin_y'].values

In [None]:
# normalize X: min-max rescale every feature column with plain pandas arithmetic.
column_min, column_max = X.min(), X.max()
X_norm = (X - column_min) / (column_max - column_min)

print("Scaled Dataset Using Pandas")
(
    X_norm.sample(10)
    .style.background_gradient(cmap ='coolwarm')
    .set_properties(**{'font-size': '8px'})
)

In [None]:
from sklearn.preprocessing import StandardScaler

# define standard scaler
scaler = StandardScaler()

# transform data
# The standardized features live in their own DataFrame (X_std) and no longer
# overwrite X_norm. Overwriting X_norm with the bare array returned by
# fit_transform had two consequences downstream:
#   * X (= X_norm) lost its PassengerId/Name index, so X.reset_index() in the
#     "winning classifier" cell fails on an ndarray;
#   * the models were trained on standardized features while the test set is
#     min-max scaled and its CryoSleep/VIP flags are compared against True.
# X_norm therefore stays the min-max scaled frame; X_std is for comparison.
X_std = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

print("Scaled Dataset Using skLearn")
X_std.sample(10).style.background_gradient(cmap ='coolwarm')\
        .set_properties(**{'font-size': '8px'})

In [None]:
# use normalized X: from here on, X refers to the scaled features held in X_norm
# (later cells read X, e.g. to build train_result).
X = X_norm

In [None]:
# Gradient Boosting
# `loss` is left at the library default: the old spelling 'deviance' was
# deprecated in scikit-learn 1.1 and removed in 1.3, and the default is the
# same loss in both old and new releases.
criterion = "friedman_mse"#, "mse", "mae"
clf = GradientBoostingClassifier(criterion=criterion, init=None,
                             learning_rate=0.1, max_depth=30,
                             max_features=None, max_leaf_nodes=None,
                             min_samples_leaf=60,
                             min_samples_split=1200, min_weight_fraction_leaf=0.0,
                             n_estimators=100, random_state=42,
                             subsample=1.0, verbose=0, warm_start=False).fit(X_norm, y)

# These are scores on the data the model was fitted on, not held-out data.
train_predictions = clf.predict(X_norm)
train_score = clf.score(X_norm, y)
# Reuse the predictions above instead of predicting a second time.
train_matrix = confusion_matrix(y, train_predictions)
fier = 'Gradient Boosting'
print(f"{fier} score: \n",train_score)
print("\nTrain predictions: \n",train_predictions)
print("\nConfusion matrix: \n",train_matrix)

In [None]:
# CatBoost
# Fit on the scaled training features; this rebinds `clf` and `fier`, so later
# cells use this model and label.
catboost_settings = dict(
    n_estimators=600,
    learning_rate=0.01,
    max_depth=10,
    l2_leaf_reg=0.5,
    #iterations = 20000, # How many iterations we should do
    random_state=42,            # reproducibility
    verbose=0,                  # no progress output
    boost_from_average=True,    # initial guess starts from the average of the data
    eval_metric='BalancedAccuracy',
)
clf = CatBoostClassifier(**catboost_settings).fit(X_norm, y)

# Scores below are computed on the same data the model was fitted on.
train_predictions = clf.predict(X_norm)
train_score = clf.score(X_norm, y)
train_matrix = confusion_matrix(y, train_predictions)

fier = 'CatBoosting'
print(f"{fier} score: \n",train_score)
print("\nTrain predictions: \n",train_predictions)
print("\nConfusion matrix: \n",train_matrix)

In [None]:
# Show the model features as a frame with a fresh 0..n-1 index.
features_frame = pd.DataFrame(X)
features_frame.reset_index(drop=True)

In [None]:
# the winning classifier
# Flatten the index back into columns and attach the latest train predictions
# as booleans (rows are matched by position via the default integer index).
train_prediction_col = f'Transported ({fier} Prediction)'
train_result = X.reset_index()
train_result[train_prediction_col] = pd.Series(train_predictions).astype(bool)
(
    train_result.sample(10)
    .style.background_gradient(cmap ='coolwarm')
    .set_properties(**{'font-size': '8px'})
)

In [None]:
# Train predicted stats: transported
# Select the encoded train rows whose prediction is True (aligned on PassengerId)
# and summarise a few numeric columns.
predicted_transported = train_result.set_index('PassengerId')[f'Transported ({fier} Prediction)'] == True
transported = train_ml.set_index('PassengerId').loc[predicted_transported]
#print("Transported: ",transported.Name.values.tolist())
print()
summary_columns = ['Age','RoomService','FoodCourt','ShoppingMall','VRDeck']
(
    transported[summary_columns]
    .describe()
    .style.background_gradient(cmap ='coolwarm')
    .set_properties(**{'font-size': '15px'})
)

# Scatter Train Model Results

**Densely Scattered**

In [None]:
# Pairwise scatter plots of the listed columns, coloured by the predicted label.
hue_column = f'Transported ({fier} Prediction)'
dense_columns = ['Age','FoodCourt','RoomService','ShoppingMall','Spa','VRDeck','Cabin_y', hue_column]
df = train_result.set_index(['PassengerId','Name'])
sns.pairplot(df[dense_columns], hue=hue_column)

**Not-So Densely Scattered**

In [None]:
# Pairwise scatter plots of the listed columns, coloured by the predicted label.
hue_column = f'Transported ({fier} Prediction)'
sparse_columns = ['HomePlanet','CryoSleep','Destination','VIP','Cabin_x','Cabin_z', hue_column]
df = train_result.set_index(['PassengerId','Name'])
sns.pairplot(df[sparse_columns], hue=hue_column)

# Test Model Predictions

In [None]:
# Frequency of each Cabin_z value in the training frame.
train['Cabin_z'].value_counts()

In [None]:
# eliminate test nulls
# try Age.mean()
#
# Each filled column is assigned back to the frame. The previous form,
# test[col].fillna(..., inplace=True), operates on a temporary Series and
# leaves `test` untouched when pandas Copy-on-Write is active.

# Numeric columns: fill with the column median.
for col in ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    test[col] = test[col].fillna(test[col].median())

# VIP holds True/False flags, so fill with the most frequent flag rather than
# taking a median that depends on implicit object-to-float coercion.
test["VIP"] = test["VIP"].fillna(test["VIP"].mode()[0])

# Remaining columns: fill with a fixed placeholder value.
test_placeholder_fills = {
    "Name": "John Doe",
    "HomePlanet": "Earth",
    "CryoSleep": True,  # or False
    "Destination": "TRAPPIST-1e",
}
for col, placeholder in test_placeholder_fills.items():
    test[col] = test[col].fillna(placeholder)

In [None]:
# test encoder

test_ml = test.drop(columns=['Cabin']).copy()

# categorize/encode
test_ml['Name'] = test_ml['Name'].astype("category")

# Encode the test columns with the mapping learned from the TRAINING column.
# Re-fitting an encoder on the test frame gives the same raw value a different
# integer code than the one the model saw during training (most visibly for
# Cabin_y, where the code is the rank among that frame's own distinct values).
#
# `classes_` is sorted, so searchsorted returns the exact training code for
# every value seen in training. A value unseen in training gets the code of
# the next larger training class, clipped to the last code. For Cabin_y that
# is the nearest rank; for categorical columns it is an arbitrary but
# deterministic fallback.
for col in ['HomePlanet','Cabin_x','Cabin_y','Cabin_z','Destination','CryoSleep','VIP']:
    train_classes = LabelEncoder().fit(train[col]).classes_
    codes = np.searchsorted(train_classes, test_ml[col].to_numpy())
    test_ml[col] = np.minimum(codes, len(train_classes) - 1)

test_ml.info()

In [None]:
# Summary statistics of the encoded test frame, rendered with a colour gradient.
test_summary = test_ml.describe()
(
    test_summary
    .style.background_gradient(cmap ='viridis')
    .set_properties(**{'font-size': '10px'})
)

In [None]:
# normalize test_ml
test_features = test_ml.drop(columns = ['PassengerId','Name'])

# Rescale with the TRAINING min/max (train_ml holds the encoded, unscaled
# training columns) instead of the test set's own extremes, so that a given
# raw value maps to the same model input in both sets. A constant training
# column would give a zero span; use 1 there to avoid dividing by zero.
# NOTE(review): this matches the min-max scaling used for X_norm; confirm the
# classifier in `clf` was fitted on min-max scaled features.
reference = train_ml[test_features.columns]
reference_min = reference.min()
reference_span = (reference.max() - reference_min).replace(0, 1)
test_ml_norm = (test_features - reference_min) / reference_span

# `axis` must be passed by keyword: pandas 2.x rejects it as a positional argument.
test_ml_norm = pd.concat((test_ml_norm, test_ml.PassengerId, test_ml.Name), axis=1)

# use normalized test_ml
# comment out, if needed
test_ml = test_ml_norm

In [None]:
# Latest Classifier predictions
# Predict on the test features (identity columns moved into the index) and
# store the result as a boolean column, matched to rows by position.
test_prediction_col = f'Transported ({fier} Prediction)'
test_model_input = test_ml.set_index(['PassengerId','Name'])
test_predictions = clf.predict(test_model_input)
test_ml[test_prediction_col] = pd.Series(test_predictions).astype(bool)
(
    test_ml.sample(10)
    .set_index([test_prediction_col,'PassengerId'])
    .style.background_gradient(cmap ='viridis')
    .set_properties(**{'font-size': '10px'})
)

# Prep Dataframe for Submission
**Final touches using preliminary EDA**


In [None]:
# if they were in CryoSleep & VIP, they were transported
# Use one .loc assignment on the frame itself. The previous chained form,
# test_ml[col][index] = True, writes through an intermediate Series, which
# pandas does not guarantee to propagate (and never does under Copy-on-Write).
override_col = f'Transported ({fier} Prediction)'
cryo_and_vip_test = (test_ml['CryoSleep'] == True) & (test_ml['VIP'] == True)
test_ml.loc[cryo_and_vip_test, override_col] = True

# check that conditions are met before confirming Transported classification
print("All (TEST) passengers in CryoSleep & VIP were transported:")
print(bool((test_ml.loc[cryo_and_vip_test, override_col] == True).all()))

# Submission

In [None]:
# predictions/submission
# Copy the prediction column to the name 'Transported' and write the
# two-column submission file.
test_ml['Transported'] = test_ml[f'Transported ({fier} Prediction)']
submission = test_ml[['PassengerId','Transported']]
submission.to_csv('submission.csv', index=False)
submission

In [None]:
# results
# Report the share of test rows predicted True / False, then summary statistics.
result_col = f'Transported ({fier} Prediction)'
total_rows = len(test_ml)
predicted_true = test_ml.loc[test_ml[result_col] == True]
predicted_false = test_ml.loc[test_ml[result_col] == False]
print(f"Classifier: {fier}")
print("Transported: ",len(predicted_true)/total_rows)
print("Not transported: ",len(predicted_false)/total_rows)
(
    test_ml.describe()
    .style.background_gradient(cmap ='viridis')
    .set_properties(**{'font-size': '10px'})
)

In [None]:
# Pairwise scatter plots of all test columns, coloured by the Transported column.
df = test_ml.set_index(['PassengerId','Name'])
sns.pairplot(data=df, hue='Transported')

In [None]:
# Test predicted stats: transported
# Take the raw test rows whose Transported value is True and summarise a few
# numeric columns.
transported_mask = test_ml.Transported == True
transported = test.loc[transported_mask]
#print("Transported: ",transported.Name.values.tolist())
print()
stat_columns = ['Age','RoomService','FoodCourt','ShoppingMall','VRDeck']
(
    transported[stat_columns]
    .describe()
    .style.background_gradient(cmap ='viridis')
    .set_properties(**{'font-size': '15px'})
)

In [None]:
# end of notebook