# **Setup**

In [None]:
#import required libraries
import numpy as np
import pandas as pd
#from sklearn.svm import SVC

In [None]:
#import training data (labelled passengers; the Transported column is read from it later)
passengers_known = pd.read_csv('../input/spaceship-titanic/train.csv')
#import test data (passengers whose status is predicted at the end of the notebook)
passengers_unknown = pd.read_csv('../input/spaceship-titanic/test.csv')

# **Exploratory Data Analysis**

# **Handling Missing Data**

In [None]:
#Credits - BARBAGRANDE007

# Assumption 1: When in cryo sleep, you will not use any facilities like ShoppingMall, RoomService etc.
# Missing spending values of passengers in cryo sleep will be replaced with 0.0.
# Assumption 2: If no facilities have been used, CryoSleep-NaN will be changed to True.
# Assumption 3: If any facility has been used (e.g. food or room service), CryoSleep-NaN will be changed to False.

def cryosleep(df):
    """Impute spending and CryoSleep values from each other, modifying ``df``.

    Steps, in order:
      1. For passengers with ``CryoSleep == True``, a missing value in any
         of the five spending columns becomes 0.0.
      2. A missing ``CryoSleep`` becomes True when all five spending columns
         equal 0.0.
      3. A still-missing ``CryoSleep`` becomes False when at least one
         spending column is greater than 0.0.

    Rows matching neither rule 2 nor rule 3 keep their missing ``CryoSleep``.

    Parameters:
        df: DataFrame with ``CryoSleep``, ``ShoppingMall``, ``RoomService``,
            ``FoodCourt``, ``Spa`` and ``VRDeck`` columns.

    Returns:
        The same ``df`` object, with the columns above reassigned.
    """
    spend_cols = ['ShoppingMall', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck']

    # CryoSleep is not modified while the spending columns are being filled,
    # so the "known to be asleep" mask only has to be computed once.
    asleep = df['CryoSleep'] == True
    for col in spend_cols:
        df[col] = np.where(df[col].isnull() & asleep, 0.0, df[col])

    # NaN compares False against 0.0, so a row with any remaining missing
    # spend value does not count as "spent nothing".
    spent_nothing = (df[spend_cols] == 0.0).all(axis=1)
    df['CryoSleep'] = np.where(df['CryoSleep'].isnull() & spent_nothing, True, df['CryoSleep'])

    spent_something = (df[spend_cols] > 0.0).any(axis=1)
    df['CryoSleep'] = np.where(df['CryoSleep'].isnull() & spent_something, False, df['CryoSleep'])
    return df

# Apply the cryo-sleep imputation to both datasets. cryosleep() reassigns the
# columns on the frame it is given, so the return values are not captured.
cryosleep(passengers_known)
cryosleep(passengers_unknown)

In [None]:
#fill home planet based on cabins

def fill_home_planet(df):
    """Set ``HomePlanet`` from the cabin deck letter, modifying ``df``.

    Cabins starting with 'A', 'B', 'C' or 'T' are assigned 'Europa' and
    cabins starting with 'G' are assigned 'Earth'. Any ``HomePlanet`` that
    is still missing afterwards becomes the placeholder 'unknown'.

    Rows whose ``Cabin`` is missing are not touched by the deck rules: a
    missing cabin carries no deck information, so an already known
    ``HomePlanet`` on such a row is kept.

    Parameters:
        df: DataFrame with string ``Cabin`` and ``HomePlanet`` columns.

    Returns:
        The same ``df`` object.
    """
    deck_to_planet = {
        'A': 'Europa',
        'B': 'Europa',
        'C': 'Europa',
        'G': 'Earth',
        'T': 'Europa',
    }
    for deck, planet in deck_to_planet.items():
        # na=False: a missing Cabin must not match the deck prefix. With
        # na=True every missing-Cabin row matched every rule and ended up
        # overwritten with the planet of the last rule.
        on_deck = df['Cabin'].str.startswith(deck, na=False)
        df.loc[on_deck, 'HomePlanet'] = planet

    # Assign the result back rather than using fillna(inplace=True) on the
    # df[...] selection, which is not guaranteed to update df itself.
    df['HomePlanet'] = df['HomePlanet'].fillna('unknown')
    return df

# Run the deck-based HomePlanet fill on both datasets; the helper assigns into
# the frame it receives, so the return values are not captured.
fill_home_planet(passengers_known)
fill_home_planet(passengers_unknown)

In [None]:
#fill missing features

def fill_missing_features(df, age_fill=None):
    """Replace the remaining missing values with fixed defaults, modifying ``df``.

    Defaults used: 'unknown' for ``Destination``, 'F/0/S' for ``Cabin``,
    ``age_fill`` for ``Age``, False for ``VIP`` and ``CryoSleep``, and 0 for
    the five spending columns.

    Parameters:
        df: DataFrame containing the columns listed above.
        age_fill: value used for a missing ``Age``. When None (the default),
            the mean ``Age`` of the module-level ``passengers_known`` frame
            is used, so both datasets are filled with the same value.

    Returns:
        The same ``df`` object.
    """
    if age_fill is None:
        age_fill = passengers_known['Age'].mean()

    defaults = {
        "Destination": "unknown",
        "Cabin": "F/0/S",
        "Age": age_fill,
        "VIP": False,
        "CryoSleep": False,
        "RoomService": 0,
        "FoodCourt": 0,
        "ShoppingMall": 0,
        "Spa": 0,
        "VRDeck": 0,
    }
    for column, value in defaults.items():
        # Assign the filled column back instead of calling
        # df[column].fillna(..., inplace=True): the in-place form acts on an
        # intermediate object and does not update df under pandas
        # Copy-on-Write.
        df[column] = df[column].fillna(value)
    return df

# Fill the remaining gaps in both datasets. The helper reads the Age mean from
# passengers_known for either frame, so the test set is filled with the
# training mean.
fill_missing_features(passengers_known)
fill_missing_features(passengers_unknown)

# **Feature Engineering**

In [None]:
def add_group_features(df):
    """Add travel-group features derived from ``PassengerId``, modifying ``df``.

    Adds three columns:
      * ``Group``: the part of ``PassengerId`` before the first '_', as int.
      * ``Group_size``: number of rows in ``df`` sharing that ``Group``.
      * ``IsAlone``: 1 when ``Group_size`` is 1, otherwise 0.

    Parameters:
        df: DataFrame with a string ``PassengerId`` column.

    Returns:
        The same ``df`` object.
    """
    group = df['PassengerId'].str.split('_').str[0].astype(int)
    df['Group'] = group
    # Count each group once and look the size up per row. The previous
    # version recomputed value_counts() inside the per-row lambda, which is
    # quadratic in the number of rows.
    df['Group_size'] = group.map(group.value_counts())
    df['IsAlone'] = np.where(df['Group_size'] == 1, 1, 0)
    return df

def add_billing_features(df):
    """Add two aggregated spending columns, modifying ``df``.

    ``Premium`` is RoomService + Spa + VRDeck and ``Essential`` is
    FoodCourt + ShoppingMall. A missing value in any addend makes the
    corresponding sum missing as well.

    Returns:
        The same ``df`` object.
    """
    premium = df['RoomService'].add(df['Spa']).add(df['VRDeck'])
    essential = df['FoodCourt'].add(df['ShoppingMall'])
    df['Premium'] = premium
    df['Essential'] = essential
    return df

def add_age_features(df):
    """Add an ``IsAdult`` flag derived from ``Age``, modifying ``df``.

    ``IsAdult`` is 1 when ``Age`` is strictly greater than 18 and 0
    otherwise (a missing ``Age`` therefore yields 0).

    Returns:
        The same ``df`` object, like the other feature helpers in this
        notebook (previously this function returned None).
    """
    df['IsAdult'] = np.where((df["Age"] > 18),1,0)
    return df

def encode_features(df):
    """Replace four categorical columns with integer codes, modifying ``df``.

    ``VIP`` and ``CryoSleep`` map False/True to 0/1; ``Destination`` and
    ``HomePlanet`` map their known labels (including the 'unknown'
    placeholder) to 0..3. A value without an entry in the mapping becomes
    missing.

    Returns:
        The same ``df`` object.
    """
    flag_codes = {False:0, True:1}
    codebooks = {
        'VIP': flag_codes,
        'CryoSleep': flag_codes,
        'Destination': {'TRAPPIST-1e':0, 'PSO J318.5-22':1, '55 Cancri e':2, 'unknown':3},
        'HomePlanet': {'Europa':0, 'Earth':1, 'Mars':2, 'unknown':3},
    }
    for column, codes in codebooks.items():
        df[column] = df[column].map(codes)
    return df

# Run every feature helper on both datasets. Each helper assigns its new or
# re-encoded columns on the frame it is given, so return values are not used.
add_group_features(passengers_known)
add_group_features(passengers_unknown)

add_billing_features(passengers_known)
add_billing_features(passengers_unknown)

add_age_features(passengers_known)
add_age_features(passengers_unknown)

# After this step VIP, CryoSleep, Destination and HomePlanet hold integer
# codes instead of their original labels.
encode_features(passengers_known)
encode_features(passengers_unknown)

In [None]:
# Keep the target labels in their own variable before the column is dropped
# from the feature frame further down.
passengers_known_status = passengers_known['Transported']

# Cabin is a '/'-separated string; split it into cabin_param_1, cabin_param_2
# and cabin_param_3 and append the pieces as new columns.
cabin_params = (
    passengers_known['Cabin']
    .str.split('/', expand=True)
    .rename(columns=lambda position: "cabin_param_" + str(position + 1))
)
passengers_known = pd.concat([passengers_known, cabin_params], axis=1)

# Encode the first and third cabin pieces as integers.
passengers_known['cabin_param_1'] = passengers_known['cabin_param_1'].map(
    {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7, 'Z':8}
)
passengers_known['cabin_param_3'] = passengers_known['cabin_param_3'].map(
    {'P':0, 'S':1, 'Z':2}
)


In [None]:
def find_home_planet(df, deck, cabin, unknown="unknown"):
    """Look up a known ``HomePlanet`` for a given deck and cabin number.

    Searches ``df`` for rows whose ``cabin_param_1`` equals ``deck`` and whose
    ``cabin_param_2`` equals ``cabin``, ignoring rows whose ``HomePlanet`` is
    missing or equal to the ``unknown`` placeholder. Previously only missing
    values were ignored, so the lookup could return the placeholder itself,
    for example from the very row being resolved.

    Parameters:
        df: DataFrame with ``cabin_param_1``, ``cabin_param_2`` and
            ``HomePlanet`` columns.
        deck: value to match against ``cabin_param_1``.
        cabin: value to match against ``cabin_param_2``.
        unknown: placeholder that marks an unresolved ``HomePlanet``; it is
            also the value returned when no match is found.

    Returns:
        The ``HomePlanet`` of the first matching row, or ``unknown`` when no
        row matches.
    """
    same_cabin = (df['cabin_param_1'] == deck) & (df['cabin_param_2'] == cabin)
    resolved = df['HomePlanet'].notnull() & (df['HomePlanet'] != unknown)
    matches = df.loc[same_cabin & resolved, 'HomePlanet']
    if matches.empty:
        return unknown
    return matches.iloc[0]

# For every training passenger whose HomePlanet equals 'unknown', look up a
# HomePlanet from a row with the same deck (cabin_param_1) and cabin number
# (cabin_param_2) and write it back, matching the row by PassengerId.
# NOTE(review): encode_features(passengers_known) runs in an earlier cell and
# maps HomePlanet to integer codes ('unknown' -> 3). In top-to-bottom order the
# == 'unknown' filter below therefore presumably selects no rows and the loop
# never executes — confirm, and compare against the encoded value if this fill
# is meant to take effect.
df = passengers_known[passengers_known['HomePlanet'] == 'unknown']
df2 = df[['PassengerId', 'cabin_param_1','cabin_param_2']]
for index, row in df2.iterrows():
    home_planet = find_home_planet(passengers_known, row['cabin_param_1'], row['cabin_param_2'])
    passengers_known.loc[passengers_known['PassengerId'] == row['PassengerId'], 'HomePlanet'] = home_planet

In [None]:
# The target must not stay among the model inputs; it was copied to
# passengers_known_status before this point.
passengers_known = passengers_known.drop(columns=['Transported'])

# Remove identifiers and the raw columns that are not used as model inputs.
passengers_known = passengers_known.drop(
    columns=[
        'PassengerId', 'Name', 'Cabin', 'cabin_param_2', 'Age',
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    ]
)

# Alias (not a copy) used as the training feature frame from here on.
passengers_known_v2 = passengers_known

# **Support Vector Machine Approach**

In [None]:
#learn from training data
#m = SVC(gamma='auto') # 0.70773
#m = SVC(kernel='rbf') # 0.7905
#m = SVC(kernel='rbf', C=2, gamma=1.2) # 0.50268
#m = SVC(kernel='rbf', C=0.1, gamma=10) # 0.50689
#m = SVC(kernel='rbf', C=11, random_state=123) # 0.79448
#m = SVC(kernel='rbf', C=10, random_state=123) # 0.79471
#m.fit(passengers_known_v2,passengers_known_status)
#verify prediction with known data
#passengers_known_pred = m.predict(passengers_known_v2)

# **Cat Boost Classifier Approach**

In [None]:
#from catboost import CatBoostClassifier
#m = CatBoostClassifier(iterations=100000,verbose=5000)
#m.fit(passengers_known_v2,passengers_known_status)
#passengers_known_pred = m.predict(passengers_known_v2)

# **Cat Boost Classifier Approach with Grid Search**

In [None]:
from pandas.api.types import is_numeric_dtype

def get_categorical_indicies(X):
    """Return the column positions of all non-numeric columns of ``X``.

    A column counts as categorical when ``is_numeric_dtype`` is False for it.
    Positions are returned in column order, as given by
    ``X.columns.get_loc``.
    """
    non_numeric = [name for name in X.columns if not is_numeric_dtype(X[name])]
    return [X.columns.get_loc(name) for name in non_numeric]

def convert_cats(X):
    """Cast every non-numeric column of ``X`` to the pandas 'category' dtype.

    The columns are reassigned on ``X`` itself; nothing is returned.
    """
    # Decide which columns to convert before changing any of them.
    non_numeric = [name for name in X.columns if not is_numeric_dtype(X[name])]
    for name in non_numeric:
        X[name] = X[name].astype('category')

In [None]:
import catboost as cb
# CatBoost classifier configured with the Logloss objective and Accuracy as
# the evaluation metric; all other settings are left at library defaults.
m = cb.CatBoostClassifier(
    loss_function='Logloss', 
    eval_metric='Accuracy'
)

# Candidate hyper-parameter values; passed to m.grid_search in a later cell.
grid = {'learning_rate': [0.03, 0.1],
'depth': [4, 6, 10],
'l2_leaf_reg': [1, 3, 5,],
'iterations': [50, 100, 150]}

# Positions of the non-numeric feature columns (handed to cb.Pool as
# cat_features in the next cell), then cast those columns to 'category'.
# The positions are taken before the cast; the cast does not move columns.
categorical_indicies = get_categorical_indicies(passengers_known_v2)
convert_cats(passengers_known_v2)

In [None]:
# Bundle the training features, the Transported labels and the categorical
# column positions into a single CatBoost Pool.
train_dataset = cb.Pool(passengers_known_v2,passengers_known_status, cat_features=categorical_indicies)


In [None]:
# Search the hyper-parameter grid on the training pool.
# NOTE(review): a later cell calls m.predict directly, so this presumably
# relies on grid_search leaving m fitted with the best parameters — confirm
# against the CatBoost documentation.
m.grid_search(grid,train_dataset)

In [None]:
# Display the parameters currently set on the model.
m.get_params()

In [None]:
#passengers_known_pred = m.predict(passengers_known_v2)

# **Predict Transport Status of Remaining Passengers**

In [None]:
# Keep the ids in their own frame: PassengerId is dropped from the features
# below but is needed again for the submission file.
passengers_unknown_ids = pd.DataFrame(passengers_unknown['PassengerId'])

# Cabin is a '/'-separated string; split it into cabin_param_1, cabin_param_2
# and cabin_param_3 and append the pieces as new columns.
cabin_params = (
    passengers_unknown['Cabin']
    .str.split('/', expand=True)
    .rename(columns=lambda position: "cabin_param_" + str(position + 1))
)
passengers_unknown = pd.concat([passengers_unknown, cabin_params], axis=1)

# Encode the first and third cabin pieces as integers.
passengers_unknown['cabin_param_1'] = passengers_unknown['cabin_param_1'].map(
    {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7}
)
passengers_unknown['cabin_param_3'] = passengers_unknown['cabin_param_3'].map(
    {'P':0, 'S':1, 'Z':2}
)

# For every test passenger whose HomePlanet equals 'unknown', look up a
# HomePlanet from a test row with the same deck (cabin_param_1) and cabin
# number (cabin_param_2) and write it back, matching the row by PassengerId.
# NOTE(review): encode_features(passengers_unknown) runs in an earlier cell and
# maps HomePlanet to integer codes ('unknown' -> 3). In top-to-bottom order the
# == 'unknown' filter below therefore presumably selects no rows and the loop
# never executes — confirm, and compare against the encoded value if this fill
# is meant to take effect.
df = passengers_unknown[passengers_unknown['HomePlanet'] == 'unknown']
df2 = df[['PassengerId', 'cabin_param_1','cabin_param_2']]
for index, row in df2.iterrows():
    home_planet = find_home_planet(passengers_unknown, row['cabin_param_1'], row['cabin_param_2'])
    passengers_unknown.loc[passengers_unknown['PassengerId'] == row['PassengerId'], 'HomePlanet'] = home_planet

# Remove the same identifier and raw columns that were removed from the
# training frame.
passengers_unknown = passengers_unknown.drop(
    columns=[
        'PassengerId', 'Name', 'Cabin', 'cabin_param_2', 'Age',
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    ]
)

# Alias (not a copy) used as the prediction feature frame.
passengers_unknown_v2 = passengers_unknown




In [None]:
#predict the Transported status of the test passengers with the model tuned above
passengers_unknown_pred = m.predict(passengers_unknown_v2)


In [None]:
# Wrap the raw predictions in a one-column frame named 'Transported'.
# set_axis already returns a new object; its `inplace` keyword was removed in
# pandas 2.0, so passing inplace=False raises a TypeError there.
passengers_unknown_pred = pd.DataFrame(passengers_unknown_pred)
passengers_unknown_pred = passengers_unknown_pred.set_axis(['Transported'], axis=1)

# **Generate Output**

In [None]:
# Put the ids and the predictions side by side (pd.concat with axis=1 aligns
# the two frames on their index) and write them without the index column.
final_prediction = pd.concat([passengers_unknown_ids, passengers_unknown_pred],axis=1)
final_prediction.to_csv("./submission.csv",index=False)

# **Experiments to Improve Score**

**Support Vector Machine Results**
* `SVC(kernel='rbf', C=10, random_state=123)` - score 0.79471

**Cat Boost Classifier Results**
* 0.79798
* Introduced Group, Group_size, IsAlone features - Improved score to 0.80056
* Introduced Premium and Essential features - Improved score to 0.80219
* Removed one hot encoding for cabin deck and sides and VIP and mapped with numeric values - Improved score to 0.80289
* Used grid search for Catboost - Improved score to 0.80406
* Removed cabin number - Improved score to 0.80523


