In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # For Data Visualization
import xgboost as xgb
from xgboost import XGBClassifier,cv # Model to be used for classification
#and cv for cross_val

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Use the fully-qualified option name: the bare 'max_rows' pattern can match more than
# one pandas option (e.g. 'styler.render.max_rows'), which raises OptionError on recent pandas.
pd.set_option('display.max_rows',1000)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The summary of the whole process can be explained as follows : 
1. Read the Dataset
2. Visualize and fill the missing values with a simple idea

    -> If the data has too many outliers, we take the median
    
    -> If the data appears to be roughly normally distributed (symmetric, with few outliers), we take the mean
    
    -> For categorical data, we usually take the mode
    
    Deleting the data is not an option as we have to provide a result for each given test case
3. Finalize the necessary features (the Cabin value is used by dividing it into three portions: deck, number and side)
4. Perform the necessary encoding (one-hot encoding for non-sequential categories and label encoding for sequential categories) to prepare the data for training
5. Decide a model and tune hyperparameters to get the necessary results

In [None]:
# Load the competition files: training set, test set and the sample submission.
X = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
xtest = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
sample = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
# Preview the first rows of the training data.
X.head()

In [None]:
# Summarize the training frame (column dtypes and non-null counts).
X.info()

In [None]:
# Count the missing values in each column of the raw training data.
X.isna().sum()

In [None]:
# Target column for the training dataset, cast to int.
y = X['Transported'].astype(int)
# Training features: everything except the identifier columns and the target.
xtrain = X.drop(columns=['Name','PassengerId','Transported'])
# Keep the test ids (needed for the submission) before dropping them from the features.
ids = xtest['PassengerId']
xtest = xtest.drop(columns=['Name','PassengerId'])

In [None]:
# Inspect the Age distribution to choose an imputation strategy.
# Decision recorded by the author: fill Age with the mean (applied in a later cell).
plt.hist(xtrain['Age'])

In [None]:
# Bar charts of the categorical columns, used to choose an imputation strategy.
# Author's decisions: Destination, CryoSleep and HomePlanet are filled with their mode;
# Cabin is handled separately ("hybrid mode") and is not plotted here.
for col in ('Destination', 'CryoSleep', 'HomePlanet'):
    xtrain[col].value_counts().plot(kind='bar')
    plt.show()

In [None]:
# Histograms of the five spending columns, one per subplot (the sixth panel stays empty).
# Author's decision: fill these columns with their mode (expected to be zero).
service_cols=['Spa','VRDeck','RoomService','FoodCourt','ShoppingMall']
fig,axes=plt.subplots(3,2,figsize=(10,10))
for ax,col in zip(axes.flat,service_cols):
    ax.hist(xtrain[col],bins=20)
plt.show()
# Author's notes: the Name column is dropped; VIP is filled with its mode (expected to be False).
xtrain['VIP'].value_counts().plot(kind='bar')



In [None]:
#Use the different portions of cabin to make a judgement :
def _split_cabin(cabin):
    """Split a Cabin series formatted as 'deck/num/side' into Deck, Num and Side columns.

    The previous character indexing (str(x)[2]) kept only the first digit of the
    cabin number, so e.g. 'F/123/S' produced Num == '1'. Splitting on '/' keeps the
    whole number. Missing cabins get the 'n' placeholder in every part.
    """
    parts = cabin.str.split('/', expand=True).reindex(columns=[0, 1, 2])
    parts.columns = ['Deck', 'Num', 'Side']
    return parts.fillna('n')

train_cabin = _split_cabin(xtrain['Cabin'])
test_cabin = _split_cabin(xtest['Cabin'])

# Most frequent cabin number among training passengers that actually have a cabin.
# With full numbers the 'n' placeholder can itself be the most common value, so a
# missing Num is filled here; Deck and Side keep 'n' for the imputation cell below.
num_mode = train_cabin.loc[train_cabin['Num'] != 'n', 'Num'].mode()[0]

for frame, parts in ((xtrain, train_cabin), (xtest, test_cabin)):
    frame['Deck'] = parts['Deck']
    frame['Num'] = parts['Num'].replace('n', num_mode)
    frame['Side'] = parts['Side']
xtrain.head()

In [None]:
# Columns whose missing values are replaced by the most frequent training value.
mode_cols=['Spa','VRDeck','ShoppingMall','FoodCourt','RoomService','VIP','Destination','CryoSleep','HomePlanet']
# Compute each training mode once and apply the same values to both frames.
fill_values={}
for col in mode_cols:
    fill_values[col]=xtrain[col].mode()[0]
xtrain.fillna(fill_values,inplace=True)
xtest.fillna(fill_values,inplace=True)
xtrain.isnull().sum()

In [None]:
# Replace the 'n' placeholder (missing Cabin) in each cabin-derived column with the
# most frequent real training value of that same column.
# Fixes: Deck was previously filled with the mode of Num; the mode is now computed
# without the placeholder so it can never be 'n' itself; and the result is assigned
# back instead of relying on a chained in-place replace, which does not update the
# frame when pandas copy-on-write is active.
for col in ('Num', 'Side', 'Deck'):
    observed = xtrain.loc[xtrain[col] != 'n', col]
    fill_value = observed.mode()[0]
    xtrain[col] = xtrain[col].replace('n', fill_value)
    xtest[col] = xtest[col].replace('n', fill_value)


In [None]:
# Encode the two boolean flags as 0/1 integers in both frames.
for frame in (xtrain, xtest):
    for flag in ('VIP', 'CryoSleep'):
        frame[flag]=frame[flag].astype(bool).astype(int)

In [None]:
#Fill Age with the training-set mean
# Assign the filled column back rather than calling fillna(inplace=True) on a column
# selection: that chained in-place call does not modify the frame under pandas
# copy-on-write (and emits a FutureWarning on pandas 2.x).
age_mean=xtrain['Age'].mean()
xtrain['Age']=xtrain['Age'].fillna(age_mean)
xtest['Age']=xtest['Age'].fillna(age_mean)
xtrain.isnull().sum()

In [None]:
# Remaining missing values per column in the test features.
xtest.isna().sum()

In [None]:
# Cast the cabin number from string to integer in both frames.
for frame in (xtrain, xtest):
    frame['Num']=frame['Num'].astype(int)

In [None]:
# Shared encoder instances.
le=LabelEncoder()
# The 'sparse' keyword was renamed to 'sparse_output' in scikit-learn 1.2 and later
# removed; try the new name first and fall back for older releases.
try:
    ohe=OneHotEncoder(sparse_output=False)
except TypeError:
    ohe=OneHotEncoder(sparse=False)


In [None]:
def categorize(column,headers):
    """One-hot encode ``column`` into one 0.0/1.0 column per entry of ``headers``.

    The encoding is driven by ``headers`` rather than by the values that happen to
    be present in ``column``. Re-fitting an encoder on every call (as before) meant
    that a column containing only a subset of the categories, or an unseen one,
    produced a different number or order of columns than ``headers`` describes:
    either an error or silently mislabelled columns, e.g. between train and test.

    Parameters
    ----------
    column : sequence or pandas Series of category values.
    headers : sequence of unique category labels; defines output columns and order.

    Returns
    -------
    pandas.DataFrame of floats with a default integer index and ``headers`` as
    columns. Values not listed in ``headers`` yield an all-zero row.
    """
    labels=list(headers)
    # Fixing the category set guarantees one output column per header, in order.
    as_categorical=pd.Categorical(column,categories=labels)
    ohe_result=pd.get_dummies(as_categorical,dtype=float)
    # Use a plain column index instead of the CategoricalIndex get_dummies returns.
    ohe_result.columns=labels
    return ohe_result

In [None]:
# Sorted category labels of each one-hot encoded column; they become the
# column names of the encoded frames.
destinations, homes, decks, sides = (
    sorted(xtrain[col].unique())
    for col in ('Destination', 'HomePlanet', 'Deck', 'Side')
)


In [None]:
# One-hot encode the four categorical training columns.
ohe_train_destination, ohe_train_home, ohe_train_deck, ohe_train_side = (
    categorize(xtrain[col], headers)
    for col, headers in (
        ('Destination', destinations),
        ('HomePlanet', homes),
        ('Deck', decks),
        ('Side', sides),
    )
)
xtrain.head()


In [None]:
# Append the one-hot encoded columns to the training features (joined on the index).
encoded_train=[ohe_train_destination,ohe_train_home,ohe_train_deck,ohe_train_side]
xtrain=xtrain.join(encoded_train)


In [None]:
# The raw categorical columns are now redundant with their one-hot encodings.
xtrain.drop(columns=['HomePlanet','Destination','Cabin','Deck','Side'],inplace=True)

In [None]:
# Check for missing values left in the encoded training features.
xtrain.isna().sum()

In [None]:
# One-hot encode the same four categorical columns of the test set,
# reusing the header lists derived from the training data.
ohe_test_destination, ohe_test_home, ohe_test_deck, ohe_test_side = (
    categorize(xtest[col], headers)
    for col, headers in (
        ('Destination', destinations),
        ('HomePlanet', homes),
        ('Deck', decks),
        ('Side', sides),
    )
)

In [None]:
# Append the one-hot encoded columns to the test features (joined on the index).
encoded_test=[ohe_test_destination,ohe_test_home,ohe_test_deck,ohe_test_side]
xtest=xtest.join(encoded_test)

In [None]:
# The raw categorical columns are now redundant with their one-hot encodings.
xtest.drop(columns=['HomePlanet','Destination','Cabin','Deck','Side'],inplace=True)

In [None]:
# Check for missing values left in the encoded test features.
xtest.isna().sum()

In [None]:
# Display the encoded training features that are passed to the model below.
xtrain

In [None]:
# XGB Classifier
# 'logloss' is the binary log-loss metric matching the 'binary:logistic' objective;
# the previous 'mlogloss' is the multiclass variant and is not valid for 0/1 labels.
params = {"objective":"binary:logistic",'colsample_bytree': 0.7,'learning_rate': 0.3,
                'max_depth': 20, 'alpha': 5,'eval_metric':'logloss'}
xgb_clf = XGBClassifier(**params)
print(xgb_clf)

In [None]:
# Fit the classifier on the full training set and report its accuracy on that same data.
xgb_clf.fit(xtrain,y)
train_score=xgb_clf.score(xtrain,y)
print(f'Training Score : {train_score:.2f}')
# DMatrix view of the same data, consumed by the cross-validation cell.
dmatrix = xgb.DMatrix(data=xtrain,label=y)

In [None]:
# 3-fold cross-validation of the same parameters, scored with AUC and
# stopped early after 10 rounds without improvement.
cv_settings = dict(
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="auc",
    as_pandas=True,
    seed=420,
)
xgb_cv = cv(dtrain=dmatrix, params=params, **cv_settings)

In [None]:
# Show the first rows of the cross-validation results.
xgb_cv.head()

In [None]:
# Plot the feature importances of the fitted classifier.
# plot_importance creates its own figure when no axes are passed, which left the
# 16x16 figure empty and drew the chart at the default size; pass the axes explicitly.
fig_imp, ax_imp = plt.subplots(figsize = (16, 16))
xgb.plot_importance(xgb_clf, ax=ax_imp)
plt.show()

In [None]:
# Predict the test set and convert the class labels to booleans.
results=xgb_clf.predict(xtest).astype(bool)

In [None]:
# Build the submission file. The id column must be spelled 'PassengerId', matching
# the column read from the test set (it was previously written as 'Passengerid').
final_df=pd.DataFrame({'PassengerId':ids,'Transported':results})
final_df.to_csv('submission.csv',index=False)