In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load data and required libraries

In [None]:
#Load the required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from sklearn.model_selection import train_test_split
%matplotlib inline

# Show every column when displaying wide DataFrames (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so pandas chained-assignment warnings and library deprecation notices are hidden too.
warnings.filterwarnings('ignore')


In [None]:
# Load the training dataset (it carries the 'Transported' target used below) and preview the first rows
df_train=pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df_train.head()

In [None]:
# Load the test dataset (the rows to predict for the submission) and preview the first rows
df_test=pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
df_test.head()

In [None]:
# Load the sample submission to inspect the expected output format
df_submission=pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
df_submission.head()

Observation: The final submission file must have two columns, named 'PassengerId' and 'Transported', and its first row is PassengerId 0013_01 (IDs follow the gggg_pp format).

## EDA

In [None]:
# Column names, dtypes and non-null counts of the training data
df_train.info()

In [None]:
# Column names, dtypes and non-null counts of the test data
df_test.info()

In [None]:
# (rows, columns) of the training and test frames, in that order
df_train.shape, df_test.shape

**Feature Description**:

* PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

* HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

* CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

* Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

* Destination - The planet the passenger will be debarking to.

* Age - The age of the passenger.

* VIP - Whether the passenger has paid for special VIP service during the voyage.

* RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

* Name - The first and last names of the passenger.

* **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# Distribution of values in the target feature.
# The class counts are computed once and reused for both axes; no `data=` argument
# is passed because x and y are already explicit vectors.
target_counts = df_train['Transported'].value_counts()
sns.barplot(x=target_counts.index, y=target_counts.values)
plt.xlabel('Transported to another dimension')
plt.ylabel('Passenger count')
plt.title('Distribution of target feature');

Observation: The training dataset is **balanced** — both target classes occur in roughly equal numbers.

In [None]:
# Target distribution broken down by home planet: one bar group per planet,
# coloured by whether the passenger was transported.
ax = sns.countplot(data=df_train, x='HomePlanet', hue='Transported')
ax.set_xlabel('Home Planet')
ax.set_ylabel('Passenger count')
ax.set_title('Distribution of target feature - homeplanet')

In [None]:
# Distribution of values in the target feature split by VIP status.
# The hue is the 'VIP' column, so the comment and title name VIP status
# rather than "ticket class" (this dataset has no ticket-class feature).
sns.countplot(x='Transported', data=df_train, hue='VIP')
plt.xlabel('Transported to another dimension')
plt.ylabel('Passenger count')
plt.title('Distribution of target feature - VIP status');

In [None]:
# Passenger destination: share of passengers per destination planet.
# Count once and derive both labels and wedge sizes from the same result.
destination_counts = df_train['Destination'].value_counts()
label = destination_counts.index
size  = destination_counts.values
color = ['Red', 'Blue', 'Yellow']
# One offset per wedge: plt.pie raises ValueError when len(explode) != len(x),
# so the list is sized from the data instead of being hard-coded to three entries.
e     = [0.08] * len(size)

plt.pie(size, labels=label, colors=color, radius=1.2, shadow=True, explode=e, autopct='%1.1f%%')
plt.title('Destination of Passengers');

In [None]:
# Raw passenger counts per destination (the wedge sizes used in the pie chart)
df_train['Destination'].value_counts().values

## Data preprocessing

In [None]:
# Combine test and train data so preprocessing is applied identically to both.
# The 'ind' column records each row's origin so the frames can be separated again later.
# ignore_index=True gives the merged frame a unique 0..n-1 index; without it the
# row labels of both inputs are kept and every label below len(df_test) is duplicated.
df_merge=pd.concat([df_test.assign(ind="test"), df_train.assign(ind="train")], ignore_index=True)
df_merge.head()

In [None]:
# (rows, columns) of the combined frame; rows should equal train rows + test rows
df_merge.shape

In [None]:
# Keep the target as generic object dtype so it is treated as non-numeric below
# (the test rows have no target value, so this column contains NaN for them)
df_merge['Transported']=df_merge['Transported'].astype('object')

### Handle missing values

In [None]:
# Function to get the count of missing values in each column
def get_cols_with_missing_values(DataFrame):
    """Return the number of nulls per column, keeping only columns that have any.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; values are null counts, all strictly positive.
        Empty when no column contains a null.
    """
    null_counts = DataFrame.isnull().sum()
    has_missing = null_counts > 0
    return null_counts[has_missing]

In [None]:
# Count of missing values per column before imputation.
# Ignore the 4277 missing 'Transported' records: it is the target feature.
print(get_cols_with_missing_values(df_merge))

In [None]:
# Remove the passenger name column; it is not used as a feature.
df_merge = df_merge.drop(columns=["Name"])

In [None]:
# A missing bill for a luxury amenity is taken to mean the passenger did not use it,
# so each of these columns has its nulls replaced with 0.
bill_luxury_amenity = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
zero_fill_by_column = dict.fromkeys(bill_luxury_amenity, 0)
df_merge = df_merge.fillna(zero_fill_by_column)

In [None]:
# Partition the columns by dtype: object columns (except the target) are treated as
# categorical, everything else as numerical.
categorical_cols = [
    name
    for name, dtype in df_merge.dtypes.items()
    if dtype == 'object' and name != 'Transported'
]
numerical_cols = [
    name
    for name, dtype in df_merge.dtypes.items()
    if dtype != 'object'
]

In [None]:
# Impute categorical features: every null is replaced by that column's most frequent value.
for cat_col in categorical_cols:
    most_frequent = df_merge[cat_col].mode()[0]
    df_merge[cat_col] = df_merge[cat_col].fillna(most_frequent)

In [None]:
# Impute numerical features: every null is replaced by that column's mean.
for num_col in numerical_cols:
    col_mean = df_merge[num_col].mean()
    df_merge[num_col] = df_merge[num_col].fillna(col_mean)

In [None]:
# Verify that all missing values have been handled.
# Ignore the 4277 missing 'Transported' records: it is the target feature.
print(get_cols_with_missing_values(df_merge))

### Feature Engineering

In [None]:
# PassengerId has the form gggg_pp: split it into the travel group and the
# passenger's number within that group, then discard the raw id.
id_parts = df_merge['PassengerId'].str.split('_', expand=True)
df_merge = df_merge.assign(
    Passenger_Group=id_parts[0],
    Passenger_Number_in_Group=id_parts[1],
).drop(columns=["PassengerId"])

In [None]:
# Cabin has the form deck/num/side: expand it into three separate features,
# then discard the raw cabin string.
cabin_parts = df_merge['Cabin'].str.split('/', expand=True)
for position, part_name in enumerate(['Deck', 'Num', 'Side']):
    df_merge[part_name] = cabin_parts[position]
df_merge = df_merge.drop(columns=["Cabin"])

In [None]:
# Preview the frame after the PassengerId and Cabin features were split
df_merge.head()

In [None]:
# Inspect dtypes and non-null counts before casting columns to their final types
df_merge.info()

In [None]:
# Cast each feature to its working dtype in a single pass:
# counts, bills and age become integers; identifiers and flags stay as objects.
int_features = [
    'Num', 'Passenger_Number_in_Group', 'VRDeck', 'Spa',
    'ShoppingMall', 'FoodCourt', 'RoomService', 'Age',
]
object_features = ['Passenger_Group', 'VIP', 'CryoSleep']
dtype_by_feature = {
    **dict.fromkeys(int_features, 'int'),
    **dict.fromkeys(object_features, 'object'),
}
df_merge = df_merge.astype(dtype_by_feature)

### Feature Encoding

In [None]:
# List the columns available for encoding
df_merge.columns

In [None]:
# Recompute the column groups after feature engineering.
# Categorical: object columns with fewer than 10 distinct values, excluding the
# target and the train/test marker. Numerical: every non-object column.
excluded_from_encoding = ('Transported', 'ind')
categorical_cols = [
    name
    for name, dtype in df_merge.dtypes.items()
    if dtype == 'object'
    and name not in excluded_from_encoding
    and df_merge[name].nunique() < 10
]
numerical_cols = [
    name
    for name, dtype in df_merge.dtypes.items()
    if dtype != 'object'
]

In [None]:
# Passenger_Group is not encoded (it is excluded from categorical_cols), so remove it.
df_merge = df_merge.drop(columns=["Passenger_Group"])

In [None]:
# Summary statistics of the numerical features
df_merge[numerical_cols].describe()

In [None]:
# Measure the skewness of every numerical feature and flag those whose
# absolute skew is at least 0.5 as skewed.
skew_values = [scipy.stats.skew(df_merge[feature]) for feature in numerical_cols]
skew_df = pd.DataFrame({'Feature': numerical_cols, 'Skew': skew_values})
skew_df['Absolute Skew'] = skew_df['Skew'].abs()
skew_df['Skewed'] = skew_df['Absolute Skew'] >= 0.5
skew_df

In [None]:
# Apply log(1 + x) to every feature flagged as skewed.
skewed_features = skew_df.loc[skew_df['Skewed'], 'Feature']
for skewed_feature in skewed_features:
    df_merge[skewed_feature] = np.log1p(df_merge[skewed_feature])

In [None]:
# Encode the two boolean flags as 0/1 integers.
bool_to_int = {False: 0, True: 1}
for flag_col in ('CryoSleep', 'VIP'):
    df_merge[flag_col] = df_merge[flag_col].map(bool_to_int)

In [None]:
# One-hot encode the categorical features that were not already mapped to 0/1,
# dropping the first level of each, and swap the raw columns for the dummy columns.
already_encoded = {'CryoSleep', 'VIP'}
cat_remaining_to_encode = [
    col for col in categorical_cols if col not in already_encoded
]
df_merge_dummies = pd.get_dummies(df_merge[cat_remaining_to_encode], drop_first=True)
df_merge = pd.concat(
    [df_merge.drop(columns=cat_remaining_to_encode), df_merge_dummies],
    axis=1,
)

## Split train and test data

In [None]:
# Split the combined frame back into test and train partitions using the 'ind' marker.
# A non-inplace .drop() returns new, independent frames; calling drop(inplace=True)
# on a boolean-filtered slice of df_merge instead triggers pandas'
# SettingWithCopyWarning.
is_test = df_merge["ind"].eq("test")
is_train = df_merge["ind"].eq("train")
# The test partition has no target values, so 'Transported' is removed along with the marker.
test = df_merge[is_test].drop(columns=["Transported", "ind"])
train = df_merge[is_train].drop(columns=["ind"])

In [None]:
# (rows, columns) of the processed train and test frames; train has one extra column, the target
train.shape, test.shape

In [None]:
# Separate the target from the predictors.
y = train['Transported']
X = train.drop(columns='Transported')

In [None]:
# Hold out 30% of the training data as a validation set (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)
# Report the shapes of both partitions.
for partition_label, features, target in (
    ('Train', X_train, y_train),
    ('Test', X_valid, y_valid),
):
    print(partition_label, features.shape, target.shape)

### Building a model

In [None]:
# Gradient-boosted classifier: up to 1000 trees, stopping once the validation metric
# has not improved for 5 consecutive rounds.
# Early stopping is configured on the estimator because the `early_stopping_rounds`
# argument of `fit()` was deprecated in XGBoost 1.6 and removed in 2.0
# (this form requires XGBoost >= 1.6).
model=XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1, early_stopping_rounds=5)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

In [None]:
# Predict on the validation set.
# NOTE(review): y_pred is not used again in the visible notebook; no validation metric is reported.
y_pred=model.predict(X_valid)

In [None]:
#Hyper parameter optimization
# Candidate values for an XGBoost hyper-parameter search.
# NOTE(review): neither `params` nor `classifier` is used again in the visible notebook,
# so no search is actually run and the predictions below come from `model`.
params={
   "learning_rate": [0.05, 0.10, 0.15, 0.20],
    "max_depth": [5,6,7,8,9,10],
    "min_child_weight": [1,3,5],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7]
}

# Untuned estimator, presumably intended as the base estimator for the search
classifier=XGBClassifier()


## Make test predictions

In [None]:
# Predict the target for the test set.
# Recent XGBoost versions return integer class labels (0/1), while the submission
# file expects True/False; casting to bool normalises both cases and is a no-op
# when the predictions are already boolean.
pred_test=model.predict(test).astype(bool)

In [None]:
# Pair every test PassengerId with its prediction and write the submission file
# (without the index column).
output = df_test[['PassengerId']].assign(Transported=pred_test)
output.to_csv('submission.csv', index=False)