In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# *Importing Libraries*

In [None]:
# Libraries for EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import figure

#Libraries for Feature Engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

#Library for model training
from xgboost import XGBClassifier

# *Importing Datasets*

In [None]:
# Read the competition's two splits: `train` carries the label column, `test` is
# the split predictions are later made for.
train, test = (
    pd.read_csv(f'/kaggle/input/spaceship-titanic/{split}.csv')
    for split in ('train', 'test')
)

# *EDA*

### Let's see what our data looks like

In [None]:
# Preview the first rows of the training frame.
train.head()

### Now let's look into shape of our Data

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

##### So our data contains 8693 rows and 14 different columns

### Let's now check whether there are any null values and what the dtypes of the columns in our data are

In [None]:
# Per-column dtype and non-null count, which shows where values are missing.
train.info()

##### The describe() method gives a quick summary of the statistical information of the numerical columns

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numerical columns.
train.describe()

##### Our output column is 'Transported', so let's see how many rows each of its values has

In [None]:
# Number of rows per value of the target column (class balance check).
train['Transported'].value_counts()

##### Almost 50% of the travellers were transported

## Let's see correlations in our Data

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# The frame still contains string columns at this point; DataFrame.corr() raises on
# those in pandas >= 2.0 (older versions silently skipped them), so the frame is
# restricted to numeric/boolean dtypes before the correlations are computed.
plt.figure(figsize=(20, 15))
correlations = train.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlations, cmap="coolwarm", annot=True)
plt.title('Pairwise correlations of the numeric and boolean columns')
plt.show()

### Now let us look at the value counts of our categorical features (columns)

In [None]:
# One bar chart of category counts per categorical feature, laid out on a 2x2 grid.
# The column is selected by keyword (x=..., data=...): in seaborn >= 0.12 the first
# positional argument is `data`, not `x`, so passing the Series positionally no
# longer plots its value counts.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
count_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
# ax.ravel() walks the grid row by row: ax[0][0], ax[0][1], ax[1][0], ax[1][1].
for panel, column in zip(ax.ravel(), count_columns):
    sns.countplot(x=column, data=train, palette='Paired_r', ax=panel)
plt.show()

In [None]:
# Age density of passengers who were not transported (0) vs. transported (1).
# sns.distplot is deprecated; with hist=False it only drew the KDE curve, which
# sns.kdeplot draws directly. Each curve is labelled so they can be told apart.
fig, ax = plt.subplots()
for target_value, curve_label in [(0, 'Not transported'), (1, 'Transported')]:
    ages = train.loc[train['Transported'] == target_value, 'Age'].dropna()
    sns.kdeplot(x=ages, label=curve_label, ax=ax)
ax.set_xlabel('Age')
ax.legend()
plt.show()

#### This graph shows the age density of passengers, split by whether or not they were transported

# Adding columns and handling null values

##### The Cabin feature packs the deck, num and side together, so we will first split it into separate columns. The features RoomService, FoodCourt, ShoppingMall, Spa and VRDeck hold the amount a person spent on each of these services, so we will add another column, TotalBill, holding the sum of all of these bills

In [None]:
def _add_cabin_and_bill_features(frame):
    """Return a copy of ``frame`` with Cabin split up and the spending columns totalled.

    * ``Cabin`` ("deck/num/side") is split on '/' into the columns ``deck``,
      ``num`` and ``side``.
    * ``TotalBill`` is the row-wise sum of the five spending columns; missing
      amounts are skipped by ``sum`` and therefore count as 0.
    * ``Cabin`` and the five spending columns are then dropped.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    out = frame.copy()
    out[['deck', 'num', 'side']] = out['Cabin'].str.split('/', expand=True)
    out['TotalBill'] = out[spend_columns].sum(axis=1)
    # `columns=` instead of a positional axis: pandas >= 2.0 no longer accepts
    # drop(labels, 1, ...).
    return out.drop(columns=['Cabin'] + spend_columns)


# Identical feature construction for both splits.
train = _add_cabin_and_bill_features(train)
test = _add_cabin_and_bill_features(test)

##### Now lets again see into our data if there are any outliers

In [None]:
# Age against total spend, coloured by the target, to eyeball outliers.
# x and y are given by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(data=train, x='Age', y='TotalBill', hue='Transported')

##### Here we can see that there are very few values with a total bill greater than 2500, so we will remove those values; there are also few values with an age above 75, so we will remove them as well (note: the cells below do not actually perform this removal)

### Now lets see how much null values are there in our data

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

##### So we will now fill the null values of each column with its most frequently occurring value

In [None]:
'''CATEGORICAL DATA'''

# Constant used to plug the gaps in each categorical column (per the notebook's
# narrative these are the most frequent categories). The same constants are
# applied to the training and the test split.
categorical_fill = {
    'HomePlanet': 'Earth',
    'Destination': 'TRAPPIST-1e',
    'deck': 'F',
    'side': 'S',
}

for frame in (train, test):
    for column, filler in categorical_fill.items():
        frame[column] = frame[column].fillna(filler)

In [None]:
# Boolean flags: a missing flag is treated as False, then True/False becomes 1/0.
for frame in (train, test):
    for flag_column in ('CryoSleep', 'VIP'):
        filled_flag = frame[flag_column].fillna(False)
        frame[flag_column] = filled_flag.astype(int)

In [None]:
'''NUMERICAL DATA'''

# 'num' comes out of the Cabin split as text, so make it numeric before averaging.
train['num'] = train['num'].astype(float)
test['num'] = test['num'].astype(float)

# Imputation values are learned from the training split only and reused for the
# test split, so both splits are filled consistently. (Previously test Age was
# filled with the test mean while test num was filled with the train mean.)
age_fill = train['Age'].mean()
num_fill = train['num'].mean()

for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(age_fill)
    frame['num'] = frame['num'].fillna(num_fill)

##### We can't fill values of name column , so we will drop it

In [None]:
# Name is removed from both splits; the remaining columns are left untouched.
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

In [None]:
# Remaining missing values per column, training split first, then test split.
for frame in (train, test):
    missing_per_column = frame.isna().sum()
    print(missing_per_column)

##### So now our null values are handled

# Feature Engineering

### Now lets split our data into input and output

In [None]:
# Target column on one side, every other column as model input on the other.
y_train = train['Transported']
X_train = train.drop(columns=['Transported'])

##### test.copy() will copy the test data into X_test so that we can tell the train and test inputs apart easily

In [None]:
# Deep copy, so later changes to X_test leave the `test` frame untouched.
X_test=test.copy(deep=True)

##### Now we want to apply OneHotEncoder to our categorical columns & StandardScaler to numerical data so we will use make_column_transformer() function


In [None]:
# One-hot encode the categorical columns (dropping the first level of each),
# standardise the numeric ones, and pass every remaining column through unchanged.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`; the old
    # `sparse` keyword was deprecated there and removed in later releases.
    categorical_encoder = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword.
    categorical_encoder = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)

# NOTE(review): remainder='passthrough' forwards every column not listed below,
# which presumably includes the passenger id - confirm it is meant to be a feature.
transformer = make_column_transformer(
    (categorical_encoder, ['HomePlanet', 'Destination', 'deck', 'side']),
    (StandardScaler(), ['Age', 'num', 'TotalBill']),
    remainder='passthrough')

#### Now we will fit and transform our train and test data

In [None]:
# Fit the encoders/scaler on the training inputs and transform them in one step.
X_train=transformer.fit_transform(X_train)

In [None]:
# Apply the already-fitted transformer to the test inputs (no re-fitting here).
X_test=transformer.transform(X_test)

##### Our output column is categorical so we will use LabelEncoder

In [None]:
# Encode the target labels as integers; `le` keeps the fitted mapping.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

# Model Training

 ### Here we are using XGBClassifier as our model

In [None]:

# Baseline: XGBoost with default hyper-parameters.
# eval_metric is given to the constructor: passing it to fit() has been deprecated
# since XGBoost 1.6 and is rejected by newer releases.
xgb_model = XGBClassifier(eval_metric='logloss')
model = xgb_model.fit(X_train, y_train)

# Accuracy on the same rows the model was fitted on, so an optimistic figure.
print("Performance on train data:", model.score(X_train, y_train)*100)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Second model: a single decision tree fitted on the same training features.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from run to run.
# NOTE(review): y_train holds encoded class labels, yet a regressor is used, so
# predictions are floats - confirm a classifier was not intended.
reg = DecisionTreeRegressor(random_state=0)
model_2 = reg.fit(X_train, y_train)

### So our model's performance on train data was almost 90%

#### Now lets predict output with optimized model

In [None]:
# Tuned XGBoost model.
# Compared with the earlier argument list (a pasted estimator repr):
#   * eval_metric moved from fit() to the constructor - fit(eval_metric=...) has
#     been deprecated since XGBoost 1.6 and is rejected by newer releases;
#   * `silent` (replaced by `verbosity`), `gpu_id` and `predictor` (deprecated in
#     XGBoost 2.0) and `nthread` (duplicate of n_jobs) are dropped;
#   * arguments that were None or empty (importance_type, verbosity,
#     interaction_constraints, monotone_constraints) are left at their defaults.
optimized_xgb = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.8,
    enable_categorical=False,
    gamma=5,
    learning_rate=0.02,
    max_delta_step=0,
    max_depth=5,
    min_child_weight=5,
    n_estimators=600,
    n_jobs=1,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.8,
    tree_method='exact',
    validate_parameters=1,
    eval_metric='logloss',
)

optimized_model = optimized_xgb.fit(X_train, y_train)

# Accuracy on the same rows the model was fitted on, not on held-out data.
print("Performance on train data:", optimized_model.score(X_train, y_train)*100)

### So the optimized model's performance on the train data is 78%

# Submitting The Data

In [None]:
# Predict the target for the transformed test inputs.
# NOTE(review): the predictions come from model_2 (the DecisionTreeRegressor fitted
# earlier), not from optimized_model - confirm this is the model meant for submission.
y_pred = model_2.predict(X_test)

##### The output values are True/False in the Kaggle submission data, so we also need to convert our output to boolean form to calculate our score

In [None]:
# Display the raw predictions before they are converted to booleans.
y_pred

In [None]:
# Build the submission from the sample file and overwrite its label column with
# the boolean predictions. This cell has to run: later cells use `submission_df`
# and read 'my_submission_final.csv' back from disk.
# NOTE(review): assumes the sample file's rows are in the same order as test.csv.
submission_df = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
submission_df["Transported"] = y_pred.astype('bool')

submission_df.to_csv('my_submission_final.csv', index=False)

In [None]:
# Display the saved submission as a sanity check; requires
# 'my_submission_final.csv' to exist in the working directory.
pd.read_csv('my_submission_final.csv')

In [None]:
from sklearn.metrics import mean_squared_error

# NOTE(review): sample_submission.csv presumably holds placeholder labels rather
# than ground truth, so this number is not a real test score - confirm.
y_true = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

# Compare only the label column, cast to float so the metric can subtract the
# values. Passing the whole frames would also feed the id column into the error.
mean_squared_error(
    y_true['Transported'].astype(float),
    submission_df['Transported'].astype(float),
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Refit a decision tree and write its predictions as a separate submission file.
# Fixes over the previous version of this cell:
#   * the freshly created `reg` is the estimator that gets fitted (it was unused);
#   * the new predictions `y_pred_temp` are written, not the older `y_pred`;
#   * `submission_df` is loaded here, so the cell does not depend on an earlier
#     cell having defined it.
# random_state is pinned so the tree, and hence the file, is reproducible.
reg = DecisionTreeRegressor(random_state=0)
model_2 = reg.fit(X_train, y_train)
y_pred_temp = model_2.predict(X_test)

submission_df = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
submission_df["Transported"] = y_pred_temp.astype('bool')
submission_df.to_csv('my_submission_Temp.csv', index=False)
