# Data Description

**In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.**

# File and Data Field Descriptions

**train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

- **PassengerId** - A unique Id for each passenger. Each Id takes the form `gggg_pp`, where `gggg` indicates a group the passenger is travelling with and `pp` is their number within the group. People in a group are often family members, but not always.
- **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
- **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- **Cabin** - The cabin number where the passenger is staying. Takes the form `deck/num/side`, where `side` can be either `P` for Port or `S` for Starboard.
- **Destination** - The planet the passenger will be debarking to.
- **Age** - The age of the passenger.
- **VIP** - Whether the passenger has paid for special VIP service during the voyage.
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- **Name** - The first and last names of the passenger.
- **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

<a id="1"></a>
# 1. Importing Packages

In [None]:
#Importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
import numpy as np
import warnings
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV
from catboost import CatBoostClassifier
# The bare 'seaborn-*' style names were deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8-*') and later removed. Prefer the new name when this matplotlib
# provides it and fall back to the legacy name on older releases.
plt.style.use('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available else 'seaborn-bright')

<a id="2"></a>
# 2. Loading Files

In [None]:
#Loading Files
# Read the training and test CSVs (in that order) from the competition input folder.
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    )
]

In [None]:
#Shape of the train and test data
# Report (rows, columns) for each frame.
print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

In [None]:
#data info
# Print the column names, dtypes and non-null counts of the training frame.
train_data.info()

<a id="3"></a>
# 3. Exploratory Data Analysis

In [None]:
#Statistics
# Display pandas' default descriptive summary of the training frame.
train_data.describe()

In [None]:
# Split the 'deck/num/side' Cabin string into separate Deck, Num and Side columns
# on both the training and the test frame.
combine_data = [train_data, test_data]
for data in combine_data:
    # A missing Cabin stays missing in all three derived columns.
    cabin_parts = data['Cabin'].str.split('/', expand=True)
    data['Deck'] = cabin_parts[0]
    data['Num'] = cabin_parts[1]
    data['Side'] = cabin_parts[2]

In [None]:
# Convert the Num column to float32 on both frames, then preview the training frame.
combine_data = [train_data, test_data]
for data in combine_data:
    data['Num'] = data['Num'].astype('float32')
train_data.head()

In [None]:
#Transported passengers as counts and as percentages
#(True = Transported, False = Lost)
# Labels and values must come from the same Series: value_counts() is ordered by
# frequency while unique() follows order of appearance, so mixing the two can
# attach a count to the wrong class.
transported_counts = train_data.Transported.value_counts()
class_labels = list(transported_counts.index)

plt.figure(figsize=(13,4))
plt.subplot(121)
# An explicit order pins bar k to class_labels[k], so the annotations below line up.
sbn.barplot(x=class_labels, y=transported_counts.values, order=class_labels)
plt.title("Transported passangers in counts")
for position, count in enumerate(transported_counts.values):
    plt.annotate(str(count), xy=(position, count), xytext=(position, count))
plt.subplot(122)
plt.pie(transported_counts.values, labels=class_labels, autopct='%.2f%%')
plt.title("Transported passangers in %")
plt.show()

In [None]:
#Number of people travelled from HomePlanet to destination
# Passenger counts per HomePlanet, split by Destination.
plt.figure(figsize=(13,3))
route_ax = sbn.countplot(x='HomePlanet', hue='Destination', data=train_data)
route_ax.set_title("Number of people travelled from HomePlanet to destination")
plt.show()

In [None]:
#Visualizing people of different ages travelled from HomePlanet
plt.figure(figsize=(15,4))
# The global warning filter is kept so the rest of the notebook stays as quiet as
# before; this cell no longer relies on it.
warnings.filterwarnings('ignore')
# Take an explicit copy of the rows with a known Age so the integer cast below
# writes to an independent frame rather than to a slice of train_data.
Age_nomiss = train_data[train_data.Age.notna()].copy()
Age_nomiss['Age'] = Age_nomiss['Age'].astype('int')
g = sbn.countplot(data=Age_nomiss, x='Age', hue='HomePlanet')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
plt.title("Visualizing people of different ages travelled from HomePlanet")
plt.show()

In [None]:
#Visualizing people of different ages travelled to Destination
plt.figure(figsize=(15,4))
# The global warning filter is kept so the rest of the notebook stays as quiet as
# before; this cell no longer relies on it.
warnings.filterwarnings('ignore')
# Take an explicit copy of the rows with a known Age so the integer cast below
# writes to an independent frame rather than to a slice of train_data.
Age_nomiss = train_data[train_data.Age.notna()].copy()
Age_nomiss['Age'] = Age_nomiss['Age'].astype('int')
g = sbn.countplot(data=Age_nomiss, x='Age', hue='Destination')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
plt.title("Visualizing people of different ages travelled to Destination")
plt.show()

In [None]:
# Age aggregated (pivot_table default: mean) for each (HomePlanet, Destination) pair.
age_by_route = pd.pivot_table(train_data, values='Age', columns=['HomePlanet', 'Destination'])
age_by_route.plot(kind='bar', width=15)
plt.title("Average Age people travelled from HomePlanet to distination")
plt.show()

In [None]:
# RoomService spend aggregated (pivot_table default: mean) per (HomePlanet, Destination) pair.
room_service_by_route = pd.pivot_table(train_data, values='RoomService', columns=['HomePlanet', 'Destination'])
room_service_by_route.plot(kind='bar')
plt.title("Average ampount spent for Room Service")
plt.show()

In [None]:
# FoodCourt spend aggregated (pivot_table default: mean) per (HomePlanet, Destination) pair.
food_court_by_route = pd.pivot_table(train_data, values='FoodCourt', columns=['HomePlanet', 'Destination'])
food_court_by_route.plot(kind='bar')
plt.title("Average ampount spent for FoodCourt")
plt.show()

In [None]:
# ShoppingMall spend aggregated (pivot_table default: mean) per (HomePlanet, Destination) pair.
shopping_by_route = pd.pivot_table(train_data, values='ShoppingMall', columns=['HomePlanet', 'Destination'])
shopping_by_route.plot(kind='bar')
plt.title("Average ampount spent for ShoppingMall")
plt.show()

In [None]:
# Spa spend aggregated (pivot_table default: mean) per (HomePlanet, Destination) pair.
spa_by_route = pd.pivot_table(train_data, values='Spa', columns=['HomePlanet', 'Destination'])
spa_by_route.plot(kind='bar')
plt.title("Average ampount spent for Spa")
plt.show()

In [None]:
#Visualizing Percentages of people in 'CryoSleep', 'VIP', 'Deck', 'Side'
import matplotlib as mpl
mpl.rcParams['font.size'] = 9.0
fig = plt.figure(figsize=(15,9))
columns=['CryoSleep', 'VIP', 'Deck', 'Side']
# One pie per column, placed left to right in a single row of four axes.
for position, col in enumerate(columns, start=1):
    ax = fig.add_subplot(1, 4, position)
    shares = train_data[col].value_counts()
    wedges, label_texts, pct_texts = ax.pie(shares, labels=shares.index, autopct='%.2f%%')
    # Shrink the percentage labels so they fit inside the wedges.
    for pct_text in pct_texts:
        pct_text.set_fontsize(7.5)
    ax.set_xlabel(col)
plt.show()

In [None]:
#Transported passengers, category-wise
columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
# Three plots per row; ax.flat walks the grid row by row, matching the column order.
n_rows = len(columns) // 3
fig, ax = plt.subplots(n_rows, 3, figsize=(15,7))
for col, axis in zip(columns, ax.flat):
    sbn.countplot(data=train_data, x=col, hue='Transported', ax=axis)
plt.show()

In [None]:
#Visualizing distribution of the data
columns=['Age', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Three histograms per row, plus one extra row for the remainder; unused axes stay empty.
n_rows = len(columns) // 3 + 1
fig, ax = plt.subplots(n_rows, 3, figsize=(16,10))
for col, axis in zip(columns, ax.flat):
    sbn.histplot(data=train_data, x=col, hue='Transported', ax=axis, kde=True, bins=30)
plt.show()

In [None]:
#Visualizing Relationships between variables
columns=['Age', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Every unordered pair of columns, in the same (i, j > i) order as a nested index loop.
column_pairs = [
    (columns[i], columns[j])
    for i in range(len(columns))
    for j in range(i + 1, len(columns))
]
fig, ax = plt.subplots(len(column_pairs) // 3, 3, figsize=(17,25))
for (x_col, y_col), axis in zip(column_pairs, ax.flat):
    sbn.scatterplot(data=train_data, x=x_col, y=y_col, ax=axis, hue='Transported')
plt.show()

In [None]:
#Checking outliers in data
columns=['Age', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# One box plot per numeric column, three per row (the last row is partly empty).
n_rows = len(columns) // 3 + 1
fig, ax = plt.subplots(n_rows, 3, figsize=(15,10))
for col, axis in zip(columns, ax.flat):
    sbn.boxplot(data=train_data, x=col, ax=axis)
plt.show()

<a id="4"></a>
# 4. Data Preparation

In [None]:
#Drop the unwanted columns
# The test frame is deep-copied first, so test_data itself keeps all its columns.
columns=['PassengerId', 'Cabin', 'Name']
test_copy = test_data.copy(deep=True)
combine_data = [train_data, test_copy]
train_data.drop(columns=columns, inplace=True)
test_copy.drop(columns=columns, inplace=True)

In [None]:
train_data.head()

In [None]:
#Handling Outliers
from sklearn.preprocessing import PowerTransformer
columns=['Age','Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Fit one Yeo-Johnson transform per column on the training data only, then apply
# that same fitted transform to both train and test. Each column is transformed
# exactly once: wrapping this in a loop over the frames would re-fit and re-apply
# the transform to values that have already been transformed.
for col in columns:
    pt = PowerTransformer(method='yeo-johnson').fit(train_data[[col]])
    train_data[col] = pt.transform(train_data[[col]])
    test_copy[col] = pt.transform(test_copy[[col]])

In [None]:
#After transformation
columns=['Age', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Same box-plot grid as before the transform, for a side-by-side comparison.
n_rows = len(columns) // 3 + 1
fig, ax = plt.subplots(n_rows, 3, figsize=(15,10))
for col, axis in zip(columns, ax.flat):
    sbn.boxplot(data=train_data, x=col, ax=axis)
plt.show()

In [None]:
#Visualizing % of missing values in training data for each feature
# Count the missing entries per column once and reuse the result for bars and labels.
missing_counts = train_data.isna().sum()
indeces = missing_counts.index
fig = plt.figure(figsize=(13,4))
ax = fig.add_subplot(111)
bars = ax.bar(indeces, missing_counts)
# Spread the bar colours evenly across the jet colormap.
for k, b in enumerate(bars):
    b.set_color(plt.cm.jet(1.0 * k / (len(indeces) - 1)))
# Label each bar with its share of missing rows, in percent.
for feature, n_missing in missing_counts.items():
    pct_label = str(round((n_missing/len(train_data))*100,1))+"%"
    text = ax.annotate(text=pct_label, xy=(feature, n_missing), xytext=(feature, n_missing), va='bottom')
    text.set_fontsize(8)
plt.xticks(rotation=90)
plt.xlabel("Missing Values in Percents")
plt.ylabel("count")
plt.show()

In [None]:
#imputing missing values
#Checking inconsistency in categorical variables for both train and test set
columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side', 'VIP']
combine_data = [train_data, test_copy]
values = ["train", "test"]
# Print the level frequencies of every categorical column, first for train, then for test.
for position, value in enumerate(values):
    data = combine_data[position]
    print("---------------------- ",value ," data----------------------")
    for col in columns:
        level_counts = data[col].value_counts()
        print(level_counts)

In [None]:
columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side']
combine_data = [train_data, test_copy]
# Integer code for every level of each categorical column; the same table is
# applied to both frames so train and test share one encoding.
category_codes = {
    'HomePlanet': {'Europa':0, 'Earth':1, 'Mars':2},
    'CryoSleep': {False:0, True:1},
    'VIP': {False:0, True:1},
    'Destination': {'TRAPPIST-1e':0, 'PSO J318.5-22':1, '55 Cancri e':2},
    'Deck': {'B':1, 'F':5, 'A':0, 'G':6 ,'E':4, 'D':3, 'C':2, 'T':7},
    'Side': {'P':0,  'S':1},
}
for data in combine_data:
    for column_name, codes in category_codes.items():
        data[column_name] = data[column_name].map(codes)

In [None]:
train_data.head()

In [None]:
# Fit a 5-neighbour KNN imputer on the training features (target excluded) and
# use that one fitted imputer to fill the gaps in both the train and test features.
train_features = train_data.drop(columns=['Transported'])
impute = KNNImputer(n_neighbors=5)
impute.fit(train_features)
train_impute = pd.DataFrame(impute.transform(train_features))
test_impute = pd.DataFrame(impute.transform(test_copy))

In [None]:
# Put the feature names back on the imputed frames, then re-attach the target
# column to the training frame.
columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Num', 'Side']
combine_data = [train_impute, test_impute]
train_impute.columns = columns
test_impute.columns = columns
train_impute['Transported'] = train_data['Transported']

In [None]:
# The imputer produced float values for the coded categorical columns, so round
# them to the nearest integer code and store them as ints.
columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side', 'VIP']
combine_data = [train_impute, test_impute]
for data in combine_data:
    for col in columns:
        rounded_codes = data[col].round()
        data[col] = rounded_codes.astype('int')

In [None]:
#Encode the target column as integer class labels
le = LabelEncoder()
train_impute['Transported']=le.fit_transform(train_impute['Transported'])
#One Hot encoding categorical variables
train_impute=pd.get_dummies(train_impute, columns=['HomePlanet', 'Destination', 'Deck'])
test_impute=pd.get_dummies(test_impute, columns=['HomePlanet', 'Destination', 'Deck'])
# get_dummies only creates indicator columns for the levels present in the frame
# it is given, so align the test frame to the training feature columns (same set,
# same order). An indicator missing from the test frame is filled with 0.
test_impute=test_impute.reindex(columns=train_impute.columns.drop('Transported'), fill_value=0)
train_impute.head()

In [None]:
#Correlation Analysis
# Annotated heatmap of the pairwise correlations between all columns of train_impute.
plt.figure(figsize=(18,18))
correlations = train_impute.corr()
corr_ax = sbn.heatmap(correlations, annot=True)
corr_ax.set_title('Correlation Analysis')
plt.show()

In [None]:
#Checking Noise in the data
from sklearn.cluster import DBSCAN
# Clustered on every column of train_impute, which at this point still includes
# the encoded Transported target.
dbscan = DBSCAN(eps=4, min_samples=10)
dbscan.fit(train_impute)
# Show the distinct labels that DBSCAN assigned.
set(dbscan.labels_)

In [None]:
# Keep the DBSCAN label of every row as an extra column for the pair plot below.
train_impute['labels']=dbscan.labels_

In [None]:
# Pair plot of the numeric features, coloured by the DBSCAN label.
pairplot_columns = ['Age', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'labels']
sbn.pairplot(train_impute[pairplot_columns], hue='labels')
plt.show()

In [None]:
#Features and Target
# The model inputs are every column except the target and the DBSCAN label.
targeted_data = train_impute['Transported']
feature_data = train_impute.drop(columns=['Transported', 'labels'])
feature_data.shape, targeted_data.shape

In [None]:
# (rows, columns) of the feature matrix passed to the models.
feature_data.shape

<a id="5"></a>
# 5. Model Building

In [None]:
#Model Selection
# Candidate classifiers, each compared on training accuracy and 10-fold CV accuracy.
model_factory = [
    CatBoostClassifier(learning_rate=0.1, depth=3),
    DecisionTreeClassifier(max_depth=6),
    BaggingClassifier(DecisionTreeClassifier(max_depth=7)),
    KNeighborsClassifier(n_neighbors=21),
    RandomForestClassifier(n_jobs=-1,max_depth=5, n_estimators=350, max_features=16),
    # NOTE(review): both eta and learning_rate are passed here; they look like
    # aliases of the same xgboost setting - confirm which one takes effect.
    XGBClassifier(objective='binary:logistic', max_depth=4, gamma=1, eta=0.003, alpha=0.9, n_estimators=400, use_label_encoder=False, eval_metric='error', learning_rate=0.05, reg_lambda=0.9),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
]
kfold = KFold(n_splits=10, shuffle=True, random_state=10)
for model in model_factory:
    # Accuracy on the data the model was fit on (an optimistic estimate).
    mf = model.fit(feature_data, targeted_data)
    Pred = mf.predict(feature_data)
    train_accuracy = accuracy_score(targeted_data, Pred)
    # Cross-validated accuracy on the shared shuffled K-fold split.
    scores = cross_val_score(model, feature_data, targeted_data, cv=kfold, scoring='accuracy')
    validation_accuracy = np.mean(scores)
    print(type(model).__name__, " : Train Accuracy: ", train_accuracy, " : Validation Accuracy : ", validation_accuracy)

**Based on the cross-validation scores, I have chosen the XGBoost, GradientBoosting and CatBoost classifiers for further tuning.**

<a id="6"></a>
# 6. Tuning GradientBoostingClassifier Parameters

In [None]:
# Grid of learning rates x tree counts, scored by accuracy on the shared K-fold split.
params = {
    'learning_rate':[0.05,0.1,0.3],
    'n_estimators':[100,150,175,200],
}
gs = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
gs.fit(feature_data, targeted_data)

In [None]:
#Displaying CV results for different parameters
cv_results_=gs.cv_results_
# One row per parameter combination with its mean and std CV accuracy.
cv_table = pd.DataFrame({
    'params': pd.Series(cv_results_['params']),
    'mean_test_score': pd.Series(cv_results_['mean_test_score']),
    'std_test_score': pd.Series(cv_results_['std_test_score']),
})
plt.figure(figsize=(13,3))
plt.bar(x=cv_table.index, height=cv_table['mean_test_score'], tick_label=cv_table['params'])
plt.xticks(rotation=45)
plt.xlabel('params')
plt.ylabel('mean_test_score')
plt.show()

In [None]:
# Keep the best GradientBoosting estimator found by the grid search; `gs` is
# reused for the next model further down.
gb = gs.best_estimator_

In [None]:
#Best Parameters
gs.best_params_

In [None]:
# Fit the selected estimator on the full training set.
result_x = gb.fit(feature_data, targeted_data)

In [None]:
# Predict on the same rows the model was fit on.
y_pred = result_x.predict(feature_data)

In [None]:
# Training accuracy (optimistic, since it is measured on the fitting data).
accuracy_score(targeted_data, y_pred)

In [None]:
#Cross Value Score
# Mean accuracy across the folds of the shared K-fold splitter.
np.mean(cross_val_score(gb, feature_data, targeted_data, cv=kfold, scoring='accuracy'))

<a id="7"></a>
# 7. Tuning CatBoostClassifier Parameters

In [None]:
# Grid of tree depth x learning rate x L2 leaf regularisation, scored by accuracy
# on the shared K-fold split.
params = {
    'depth':[2,3,6],
    'learning_rate':[0.01,0.03],
    'l2_leaf_reg': [0.05,0.01],
}
gs = GridSearchCV(
    estimator=CatBoostClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
gs.fit(feature_data, targeted_data)

In [None]:
#Displaying CV results for different parameters
cv_results_=gs.cv_results_
# One row per parameter combination with its mean and std CV accuracy.
cv_table = pd.DataFrame({
    'params': pd.Series(cv_results_['params']),
    'mean_test_score': pd.Series(cv_results_['mean_test_score']),
    'std_test_score': pd.Series(cv_results_['std_test_score']),
})
plt.figure(figsize=(13,3))
plt.bar(x=cv_table.index, height=cv_table['mean_test_score'], tick_label=cv_table['params'])
plt.xticks(rotation=45)
plt.xlabel('params')
plt.ylabel('mean_test_score')
plt.show()

In [None]:
# Keep the best CatBoost estimator found by the grid search; `gs` is reused for
# the next model further down.
cata = gs.best_estimator_

In [None]:
#Best Parameters
gs.best_params_

In [None]:
# Fit the selected estimator on the full training set.
result_x = cata.fit(feature_data, targeted_data)

In [None]:
# Predict on the same rows the model was fit on.
y_pred = result_x.predict(feature_data)

In [None]:
# Training accuracy (optimistic, since it is measured on the fitting data).
accuracy_score(targeted_data, y_pred)

In [None]:
#Cross Value Score
# Mean accuracy across the folds of the shared K-fold splitter.
np.mean(cross_val_score(cata, feature_data, targeted_data, cv=kfold, scoring='accuracy'))

<a id="8"></a>
# 8. Tuning XGboostClassifier Parameters

In [None]:
# Grid over tree depth, tree count and regularisation terms, scored by accuracy
# on the shared K-fold split.
# NOTE(review): 'eta' is searched here while learning_rate=0.05 is fixed on the
# estimator; these look like aliases of the same xgboost setting - confirm the
# eta grid actually changes the fitted models.
params = {
    'max_depth':[3,4],
    'gamma':[1],
    'n_estimators': [250,300],
    'eta':[0.0005,0.001],
    'alpha':[0.9,1],
    'lambda':[0.9,1],
}
xgb_base = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='error', n_jobs=-1, learning_rate=0.05)
gs = GridSearchCV(xgb_base, param_grid=params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
gs.fit(feature_data, targeted_data)

In [None]:
#Displaying CV results for different parameters
cv_results_=gs.cv_results_
# One row per parameter combination with its mean and std CV accuracy.
cv_table = pd.DataFrame({
    'params': pd.Series(cv_results_['params']),
    'mean_test_score': pd.Series(cv_results_['mean_test_score']),
    'std_test_score': pd.Series(cv_results_['std_test_score']),
})
plt.figure(figsize=(13,3))
plt.bar(x=cv_table.index, height=cv_table['mean_test_score'], tick_label=cv_table['params'])
plt.xticks(rotation=45)
plt.xlabel('params')
plt.ylabel('mean_test_score')
plt.show()

In [None]:
# Keep the best XGBoost estimator found by the grid search.
xg_c = gs.best_estimator_

In [None]:
#Best Parameters
gs.best_params_

In [None]:
# Fit the selected estimator on the full training set.
result_x = xg_c.fit(feature_data, targeted_data)

In [None]:
# Predict on the same rows the model was fit on.
y_pred = result_x.predict(feature_data)

In [None]:
# Training accuracy (optimistic, since it is measured on the fitting data).
accuracy_score(targeted_data, y_pred)

In [None]:
#Cross Value Score
# Mean accuracy across the folds of the shared K-fold splitter.
np.mean(cross_val_score(xg_c, feature_data, targeted_data, cv=kfold, scoring='accuracy'))

**I will now ensemble the tuned classifiers above.**

<a id="9"></a>
# 9. Ensembling tuned classifiers

In [None]:
# Combine the three tuned classifiers in a soft-voting ensemble.
estimators = [('xg_c', xg_c), ('gb', gb), ('cat', cata)]
vt = VotingClassifier(estimators, voting='soft')

In [None]:
# Fit the ensemble on the full training set.
result_x = vt.fit(feature_data, targeted_data)

In [None]:
# Predict on the same rows the ensemble was fit on.
y_pred = result_x.predict(feature_data)

<a id="10"></a>
# 10. Model Evaluation

In [None]:
#Accuracy
# Training accuracy of the ensemble (optimistic, since it is measured on the fitting data).
accuracy_score(targeted_data, y_pred)

In [None]:
#Cross Value Score
# Mean accuracy of the ensemble across the folds of the shared K-fold splitter.
np.mean(cross_val_score(vt, feature_data, targeted_data, cv=kfold, scoring='accuracy'))

<a id="11"></a>
# 11. Submitting the Prediction to Kaggle

In [None]:
# Predict on the prepared test features and store the result on test_data, the
# frame that still carries PassengerId.
test_data['Transported']=result_x.predict(test_impute)

In [None]:
# Convert the 0/1 predictions back to the False/True values of the target column.
test_data['Transported']=test_data['Transported'].map({0:False, 1:True})

In [None]:
#Submitting the Prediction to Kaggle
# Write only the id and the prediction, without the row index.
test_data.loc[:,['PassengerId','Transported']].to_csv('Submission.csv', index=False)