In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the Libraries

In [None]:
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train=pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test=pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame (DataFrame.describe defaults).
train.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Show only the training rows that have at least one missing value.
train[train.isna().any(axis=1)]

In [None]:
#unique values of HomePlanet
# NOTE: the next two expressions are neither printed nor the last statement
# of the cell, so their values are discarded and nothing is displayed for them.
train['HomePlanet'].unique()
train['HomePlanet']
# Bar plot of Transported (y) against HomePlanet (x).
sns.barplot(data=train,x='HomePlanet',y='Transported')

In [None]:
#unique values of Destination
# NOTE: this expression is not the last statement of the cell and is not
# printed, so its value is discarded.
train['Destination'].unique()
# Bar plot of Transported (y) against Destination (x).
sns.barplot(data=train,x='Destination',y='Transported')

1. Splitting the Cabin feature into 3 features - Deck, Number, Side

2. Age feature into ranges

In [None]:
# Split "Cabin" on "/" into three separate features: Deck, Number and Side.
cabin_parts = ['Deck', 'Number', 'Side']
for frame in (train, test):
    frame[cabin_parts] = frame['Cabin'].str.split("/", expand=True)

# Bucket Age into left-closed ranges (right=False): [0,13), [13,18), [18,25), [25,200).
bins = [0, 13, 18, 25, 200]
labels = ['<=13', '13-18', '18-25', '>25']
for frame in (train, test):
    frame['AgeGroup'] = pd.cut(frame['Age'], bins=bins, labels=labels, right=False)
train.columns

Replacing name with last name

In [None]:
# Replace the full name with the last name (the last space-separated token).
# The .str accessor keeps missing names as NaN. The previous str(x) round-trip
# converted NaN into the literal string 'nan', which hid missing names from
# isnull() and made any later fill on Name a no-op.
train['Name'] = train['Name'].str.split(" ").str[-1]
test['Name'] = test['Name'].str.split(" ").str[-1]

In [None]:
# Count missing values per column of the training frame.
train.isnull().sum()

# Preprocessing the data and Feature Engineering

Filling the missing values

In [None]:
# Two new variables taken from PassengerId, split on "_":
#   group_id    - group number
#   id_in_group - passenger number within that group
train[['group_id', 'id_in_group']] = train['PassengerId'].str.split("_", expand=True)
test[['group_id', 'id_in_group']] = test['PassengerId'].str.split("_", expand=True)
# total_in_group - number of passengers (train and test combined) that share
# the group_id. The counts are computed once and looked up per row; previously
# the concat + value_counts was re-run inside the lambda for every single row.
group_sizes = pd.concat([train['group_id'], test['group_id']]).value_counts()
train['total_in_group'] = train['group_id'].map(group_sizes)
test['total_in_group'] = test['group_id'].map(group_sizes)
#from below output it is clear that within group numbers have same HomePlanet
# because train['group_id'].nunique()- 201(nan values of Homeplanet)== number of rows (6107)
#every group has only one HomePlanet
#within same group has different destinations

>   Concatenating train and test data to fill NaN values all at once

In [None]:
# Keep the target together with PassengerId so the labels can be joined back
# to the processed features by id.
Y=train[['PassengerId','Transported']]

In [None]:
# Features only: remove the target before stacking train on top of test.
X = train.drop(columns=['Transported'])
# reset_index() without drop keeps the old row labels in a new leading
# 'index' column of total_data.
total_data = pd.concat([X, test]).reset_index(drop=False)

>  As mentioned in the dataset description, passengers from the same group may share the same Deck, Number and Side

> Finding relations between the variables to fill NaN values, because there is no direct relation between the individual predictors and the output.

In [None]:
# Missing Deck / Number / Side: forward-fill within each passenger group.
# GroupBy.ffill() replaces the deprecated groupby(...).fillna(method='ffill');
# the former trailing .fillna(np.nan) was a no-op and has been removed.
cabin_cols = ['Deck', 'Number', 'Side']
total_data[cabin_cols] = total_data.groupby('group_id')[cabin_cols].ffill()

> Fill NaN values in HomePlanet based on the previous conclusion: passengers within the same group have the same HomePlanet

In [None]:
# Propagate the known HomePlanet inside each passenger group. Forward-fill
# alone leaves members that appear *before* the first known value unfilled,
# so a group-wise backward-fill is applied as a second pass.
# GroupBy.ffill()/bfill() replace the deprecated groupby(...).fillna(method=...).
home_planet = total_data.groupby('group_id')['HomePlanet'].ffill()
total_data['HomePlanet'] = home_planet.groupby(total_data['group_id']).bfill()
# Which decks occur for which home planet (used to infer the remaining gaps).
total_data[['Deck','Number','Side','HomePlanet']].groupby(['HomePlanet','Deck']).size().reset_index()

In [None]:
#Deck values Earth- G,E,F
#            Europa- A,B,C,D,E,T
#            Mars- D,E,F
def some(x):
    """Return the HomePlanet for one row, inferring it from Deck when missing.

    A known HomePlanet is returned unchanged. When it is missing, deck 'G'
    maps to 'Earth' and decks 'T', 'A', 'B', 'C' map to 'Europa'; any other
    deck (including a missing one) yields None.
    """
    planet = x['HomePlanet']
    if not pd.isnull(planet):
        return planet
    deck = x['Deck']
    if deck == 'G':
        return 'Earth'
    if deck in ('T', 'A', 'B', 'C'):
        return 'Europa'
    return None
# Row-wise pass: rows with a known HomePlanet keep it; missing ones are
# inferred from Deck where `some` has a rule, otherwise they stay missing.
total_data['HomePlanet'] = total_data.apply(some,axis=1)
#test['HomePlanet']=test.apply(some,axis=1)

In [None]:
# Row counts for every (HomePlanet, VIP) combination.
total_data[['HomePlanet','VIP']].groupby(['HomePlanet','VIP']).size().reset_index()

> Earth has no VIPs

> fill the nan values of VIP based on previous conclusion

In [None]:
# Following the observation that Earth has no VIPs, missing VIP flags of
# Earth passengers are set to False.
from_earth = total_data['HomePlanet'] == 'Earth'
total_data.loc[from_earth, 'VIP'] = total_data.loc[from_earth, 'VIP'].fillna(False)

> making all separate expenses into single column "total_expenses"

In [None]:
#making all separate expenses into single column "total_expenses"
# The five bill columns are selected by name. The previous positional slice
# iloc[:, 7:12] was shifted by the 'index' column that reset_index() prepends
# to total_data; with the competition's column order it appears to cover
# VIP..Spa and to miss VRDeck - TODO confirm against total_data.columns.
# skipna=False keeps the total missing when any single bill is missing.
expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_data['total_expenses'] = total_data[expense_cols].sum(axis=1, skipna=False)
sns.barplot(data=total_data, x='AgeGroup', y='total_expenses')
#clearly children have no bill

In [None]:
# Bar plot of total_expenses (y) against the CryoSleep flag (x).
sns.barplot(data=total_data,x='CryoSleep',y='total_expenses')
#Cryosleep passengers have no bills

> clearly children and CryoSleep passengers have no bill

In [None]:
#for all services
# Children and CryoSleep passengers are taken to have no bill, so their
# missing service amounts are filled with 0.
# Two fixes compared with the earlier version of this cell:
#   * the child bucket is labelled '<=13' ('0-13' is not an AgeGroup label,
#     so the child half of the condition never matched);
#   * VRDeck is now filled from total_data like the other services. It used to
#     read from `train`, so index alignment wrote NaN into every matching row
#     that only exists in total_data (the test rows).
cond5 = (total_data['AgeGroup'] == '<=13') | (total_data['CryoSleep'] == True)
for service in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    total_data.loc[cond5, service] = total_data.loc[cond5, service].fillna(0)

#AgeGroup vs VIP
total_data[['VIP','AgeGroup']].groupby(['VIP','AgeGroup']).size().reset_index()
#all VIPs are adults(>18)

In [None]:
# Remaining missing values per column of the combined frame.
total_data.isnull().sum()

> same group often has same last name - mentioned in data description

In [None]:
# Forward-fill the last name within each passenger group.
# GroupBy.ffill() replaces the deprecated groupby(...).fillna(method='ffill');
# the former trailing .fillna(np.nan) was a no-op and has been removed.
total_data['Name'] = total_data.groupby('group_id')['Name'].ffill()
# Destination counts for zero-bill passengers, broken down by AgeGroup and CryoSleep.
total_data[['AgeGroup','CryoSleep','total_expenses','Destination']].groupby(['AgeGroup','CryoSleep','total_expenses','Destination']).size().reset_index().query('total_expenses==0')

> Passengers who are not children, are not in CryoSleep and have no bill have destination TRAPPIST-1e

In [None]:
# Passengers outside the '<=13' bucket, awake (CryoSleep False) and with a
# zero total bill: a missing Destination is set to 'TRAPPIST-1e'.
cond1 = (
    (total_data['AgeGroup'] != '<=13')
    & (total_data['CryoSleep'] == False)
    & (total_data['total_expenses'] == 0)
)
total_data.loc[cond1, 'Destination'] = (
    total_data.loc[cond1, 'Destination'].fillna('TRAPPIST-1e')
)

In [None]:
# Row counts for every (AgeGroup, HomePlanet, Destination) combination.
total_data[['HomePlanet','AgeGroup','Destination']].groupby(['AgeGroup','HomePlanet','Destination']).size().reset_index()
#Europa with Age<25, never goes to PSO J318.5-22

In [None]:
# Contingency table: HomePlanet (rows) against Destination (columns).
pd.crosstab(total_data['HomePlanet'],total_data['Destination'])
#from Europa less people has destination PSO J318.5-22
#from mars high preferred destination is TRAPPIST-1e
#PSO J318.5-22 has majority from earth

In [None]:
#train.loc[train['HomePlanet']=='Mars','Destination']=train.loc[train['HomePlanet']=='Mars','Destination'].fillna('TRAPPIST-1e')
#test.loc[test['HomePlanet']=='Mars','Destination']=test.loc[test['HomePlanet']=='Mars','Destination'].fillna('TRAPPIST-1e')
#train.loc[train['Destination']=='PSO J318.5-22','HomePlanet']=train.loc[train['Destination']=='PSO J318.5-22','HomePlanet'].fillna('Earth')
#test.loc[test['Destination']=='PSO J318.5-22','HomePlanet']=test.loc[test['Destination']=='PSO J318.5-22','HomePlanet'].fillna('Earth')

In [None]:
# Bucket total_expenses into left-closed spending bands (right=False).
# The last edge is 100000 although the last label reads '800-1000000'.
bins = [0, 400, 800, 100000]
labels = ['0-400', '400-800', '800-1000000']
total_data['expenses_group'] = pd.cut(
    total_data['total_expenses'],
    bins=bins,
    labels=labels,
    right=False,
)
#test['expenses_group'] = pd.cut(test['total_expenses'], bins=bins, labels=labels, right=False)

In [None]:
#train[['HomePlanet','expenses_group','Destination']].groupby(['expenses_group','HomePlanet']).size().reset_index()
#400-800 range, Europa and mars are very less
#train.loc[train['expenses_group']=='400-800','HomePlanet']=train.loc[train['expenses_group']=='400-800','HomePlanet'].fillna('Europa')
#test.loc[test['expenses_group']=='400-800','HomePlanet']=test.loc[test['expenses_group']=='400-800','HomePlanet'].fillna('Europa')

> Additionally, Outliers

In [None]:
# Locate RoomService outliers with a fence of 1.5 * range around two
# percentiles. Note that "q3" is the 98th percentile here, not the 75th.
x = total_data['RoomService'].dropna()
q1, q3 = np.percentile(x, [25, 98])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
# Positions (within x) of the values that fall outside [lower_bound, upper_bound].
qq = np.where(~x.between(lower_bound, upper_bound))[0]

In [None]:
# Missing values per column of `train` (this is the original training frame,
# not the combined total_data that the fills above operate on).
train.isnull().sum()

In [None]:
# Column names of the combined frame.
total_data.columns

In [None]:
# Fill the remaining missing bills with the median of the passenger's HomePlanet.
for service in ['ShoppingMall', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck']:
    planet_median = total_data.groupby('HomePlanet')[service].transform('median')
    total_data[service] = total_data[service].fillna(planet_median)

In [None]:
# Remaining missing values per column after the median fill of the bills.
total_data.isnull().sum()

In [None]:
# Mean of Transported for every (HomePlanet, group_id) pair in the training frame.
train[['group_id','Transported','HomePlanet']].groupby(['HomePlanet','group_id']).mean().reset_index()

In [None]:
# Collapse the group size into a flag: 0 for a size of exactly 1, 1 otherwise.
# NOTE: this overwrites the column in place, so re-running the cell flips the flag.
group_size_is_one = total_data["total_in_group"] == 1
total_data["total_in_group"] = np.where(group_size_is_one, 0, 1)

In [None]:
#conversion of datatypes
total_data['Number'] = total_data['Number'].astype(float)
total_data['group_id'] = total_data['group_id'].astype(float)
total_data['id_in_group'] = total_data['id_in_group'].astype(int)
# NaN is truthy, so a plain astype(bool) silently turned every missing
# CryoSleep / VIP value into True. Missing flags are filled with the column's
# most frequent value first (False when the column has no known value at all).
for flag in ['CryoSleep', 'VIP']:
    most_common = total_data[flag].mode(dropna=True)
    fill_value = most_common.iloc[0] if not most_common.empty else False
    total_data[flag] = total_data[flag].fillna(fill_value).astype(bool)

In [None]:
# Correlation heatmap of the training frame.
# Only numeric and boolean columns are correlated: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently skipping them, and
# select_dtypes works the same way on older and newer versions.
plt.figure(figsize=(18, 18))
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), annot=True, cmap="RdYlGn")
plt.show()

In [None]:
# Distribution of the cabin number, split by Transported.
# train['Number'] comes from str.split and is still text, so it is converted
# to a numeric column (on a temporary copy) before binning and the KDE.
number_numeric = train.assign(Number=pd.to_numeric(train['Number'], errors='coerce'))
sns.histplot(x='Number', data=number_numeric, hue='Transported', kde=True, bins=20)

Values are right skewed

In [None]:
# log10-transform the cabin number; exact zeros are kept as 0.
cabin_number = total_data['Number']
total_data['Number'] = cabin_number.where(cabin_number == 0, np.log10(cabin_number))

# Filling remaining NaN values

In [None]:
# Missing values still left per column before the generic imputation below.
total_data.isnull().sum()

In [None]:
#numerical data
def numerical_(df):
    """Return the float64/int64 columns of ``df`` with NaNs replaced by the column median.

    The result keeps ``df``'s index and the original column names, and every
    column is float64 (as with the previous imputer-based version).

    Column names are preserved by construction. The earlier implementation
    renamed the imputer's output positionally, which mislabels columns
    whenever the imputer drops an all-NaN column. Here such a column is kept
    and simply stays NaN.
    """
    data_num = df.select_dtypes(['float64', 'int64'])
    medians = data_num.median()
    return data_num.fillna(medians).astype('float64')

In [None]:
#dtype-Object
def object_(df):
    """Encode the object/category columns of ``df`` as integers and one-hot two of them.

    For every object or category column:
      * missing values are replaced by the column's most frequent value;
      * each distinct value gets an integer code 1..k in order of first
        appearance (a value that is still missing gets code 0).
    ``HomePlanet`` and ``Destination`` are then expanded into dummy columns
    named ``HomePlanet_<code>`` / ``Destination_<code>``.

    The encoding is done per column on a private copy. The earlier version
    called ``replace`` on the whole frame for each column's mapping, so a
    mapping could rewrite codes already stored in other columns, and it
    filled NaNs through a chained ``inplace`` call on a slice of ``df``.
    """
    obj_data = df.select_dtypes(['object', 'category']).copy()
    for col in obj_data.columns:
        values = obj_data[col].astype(object)
        modes = values.mode()
        if not modes.empty:
            values = values.fillna(modes.iloc[0])
        # factorize numbers values by first appearance starting at 0 (NaN -> -1).
        codes, _ = pd.factorize(values)
        obj_data[col] = codes + 1
    return pd.get_dummies(obj_data, columns=['HomePlanet', 'Destination'])

In [None]:
#boolean data
def boolean_(df):
    """Return the bool columns of ``df`` as int64 with False -> 0 and True -> 1.

    A direct cast keeps the encoding fixed. The earlier LabelEncoder numbered
    whatever values were present, so a column containing only True was
    encoded as 0.
    """
    bool_data = df.select_dtypes(['bool'])
    return bool_data.astype('int64')

In [None]:
# Median-impute every numeric column of the combined frame.
total_num = numerical_(total_data)

# Re-derive AgeGroup from the imputed Age so that it has no missing buckets.
# The explicit edge list must be passed: bins=4 made pd.cut build four
# equal-width bins over the observed age range and then attach these labels
# to them, so the buckets no longer meant what the labels say.
bins = [0, 13, 18, 25, 200]
labels = ['<=13', '13-18', '18-25', '>25']
total_data['AgeGroup'] = pd.cut(total_num['Age'], bins=bins, labels=labels, right=False)
test['AgeGroup'] = pd.cut(test['Age'], bins=bins, labels=labels, right=False)

# Recompute total_expenses from the imputed bills, then bucket it.
total_data['total_expenses'] = total_num[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
#test['total_expenses']=test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
bins = [0, 400, 800, 1200, 100000]
labels = ['0-400', '400-800', '800-1200', '1200-1000000']
total_data['expenses_group'] = pd.cut(total_data['total_expenses'], bins=bins, labels=labels, right=False)
#test['expenses_group'] = pd.cut(test['total_expenses'], bins=bins, labels=labels, right=False)

In [None]:
# Encode the categorical and boolean columns, then place the three encoded
# parts side by side (column-wise concat aligned on the row index).
total_cat = object_(total_data)
total_bool = boolean_(total_data)
encoded_parts = [total_num, total_cat, total_bool]
final_data = pd.concat(encoded_parts, axis=1)

In [None]:
# Column names of the fully encoded frame.
final_data.columns

In [None]:
#Onehotencoder
#enc = OneHotEncoder()
#transformed = enc.fit_transform(train_data[['AgeGroup']])
#train_data[enc.categories_[0]] = transformed.toarray()

In [None]:
# Put the original PassengerId values back into final_data so they can be
# used as the join key in the merges that follow.
final_data['PassengerId']=total_data['PassengerId']
#final_data['Number']=np.where(final_data['Number']==0,0,np.log2(final_data['Number']))

In [None]:
# Split the encoded frame back into its two parts by PassengerId:
# rows with a label in Y form train_data, rows whose id is in `test` form test_data.
train_data = final_data.merge(Y, on='PassengerId', how='inner')
test_data = final_data.merge(test['PassengerId'], on='PassengerId', how='inner')

# Predictions using RandomForestClassifier

In [None]:
# Features used for modelling: everything in final_data except the columns
# excluded below and the leftover 'index' column created by reset_index().
# 'index' is removed by name. The former positional `.columns[1:]` dropped
# whichever column happened to be first, which silently discards a real
# feature as soon as the column order changes.
excluded_cols = ['VIP', 'total_expenses', 'total_in_group', 'Name', 'Age', 'PassengerId', 'Cabin']
predictor_cols = (
    final_data
    .drop(columns=excluded_cols)
    .drop(columns=['index'], errors='ignore')
    .columns
)

In [None]:
#predictor_cols=['HomePlanet_1','HomePlanet_2','HomePlanet_3','Destination_1','Destination_2','Destination_3','RoomService', 
#                'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','CryoSleep','id_in_group','group_id',
#               'Side','Number','total_expenses','AgeGroup','total_in_group','VIP']

In [None]:
# Model inputs: feature matrix and target vector (both names are rebound here).
X = train_data.loc[:, predictor_cols]
Y = train_data.loc[:, 'Transported']

In [None]:
# Hold out 20% of the labelled rows for validation (shuffled, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=60,shuffle=True)

In [None]:
# Wide search space; it is superseded immediately by the reduced grid below.
params = {
    "n_estimators": list(range(100, 600, 50)),
    "max_depth": list(range(6, 20, 2)),
    "min_samples_leaf": list(range(15, 40, 3)),
    "min_samples_split": list(range(5, 240, 10)),
}
# Grid that is actually used: only the number of estimators is tuned.
params = {"n_estimators": list(range(200, 500, 50))}

In [None]:
# Grid-search the random forest with 10-fold stratified CV on the training split.
# The estimator is seeded so that the reported CV score and the selected
# parameters are the same on every run.
rfc = RandomForestClassifier(random_state=1001)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1001)
grid = GridSearchCV(rfc, params, cv=skf, scoring='accuracy', return_train_score=False, verbose=1)
#grid = RandomizedSearchCV(estimator = rfc, param_distributions = params, cv = 5, scoring = 'accuracy', n_jobs = -1)
grid_search = grid.fit(x_train, y_train)
# Best mean cross-validated accuracy, as a percentage.
accuracy = grid_search.best_score_ * 100
print(accuracy)

In [None]:
#grid = RandomizedSearchCV(estimator = rcla, param_distributions = params, cv = 5, scoring = 'accuracy', n_jobs = -1)
# Parameter combination with the best cross-validated score in the search just run.
best_param_rf = grid_search.best_params_
print(best_param_rf)

In [None]:
#rcla = RandomForestClassifier(**best_param_rf)
# Default-parameter random forest (the tuned parameters are not applied here).
# Seeded so that the validation score and test predictions are repeatable.
rcla = RandomForestClassifier(random_state=60)
rcla.fit(x_train, y_train)

In [None]:
# Accuracy of the random forest on the held-out validation split.
# The probability of the positive class is rounded to a 0/1 prediction.
# accuracy_score takes (y_true, y_pred), and the message now names the
# validation data it is actually computed on.
predicted = rcla.predict_proba(x_val)[:, 1]
train_score = accuracy_score(y_val, predicted.round()) * 100
print("Accuracy using Random Forest on validation data is {} %".format(train_score))

In [None]:
# Random-forest predictions for the test rows. NOTE: predicted_val is
# reassigned by the XGBoost prediction cell before the submission is built.
predicted_val = rcla.predict(test_data[predictor_cols])
predicted_val=predicted_val.round()

# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Grid-search XGBoost with 10-fold stratified CV on the training split.
xg = XGBClassifier()
skf = StratifiedKFold(n_splits=10, shuffle=False)
#grid = RandomizedSearchCV(estimator = xg, param_distributions = params, cv = 10, scoring = 'accuracy', n_jobs = -1)
# The search must receive the XGBoost estimator `xg`; it was previously given
# the random forest `rfc`, so this section re-tuned the forest and never
# evaluated XGBoost.
grid = GridSearchCV(xg, params, cv=skf, scoring='accuracy', return_train_score=False, verbose=1)
grid_search = grid.fit(x_train, y_train)
# Best mean cross-validated accuracy, as a percentage.
accuracy = grid_search.best_score_ * 100
print(accuracy)

In [None]:
#grid = RandomizedSearchCV(estimator = rcla, param_distributions = params, cv = 5, scoring = 'accuracy', n_jobs = -1)
# Best parameters of the most recent grid search; the variable name
# best_param_rf is reused from the random-forest section.
best_param_rf = grid_search.best_params_
print(best_param_rf)

In [None]:
#xgc = XGBClassifier(**best_param_rf)
# XGBoost model with hand-set hyperparameters (the grid-search result is not used here).
xgc=XGBClassifier(n_estimators=1000,learning_rate=0.05,random_state=30,n_jobs=-1)
xgc.fit(x_train, y_train)

In [None]:
# Feature importances of the fitted XGBoost model, multiplied by 1000.
xgc.feature_importances_ *1000

In [None]:
# Feature names the XGBoost model was fitted with.
xgc.feature_names_in_

In [None]:
# Accuracy of the XGBoost model on the held-out validation split.
# accuracy_score takes (y_true, y_pred); the message now names the model and
# the data split it is actually computed on.
predicted = xgc.predict_proba(x_val)[:, 1]
train_score = accuracy_score(y_val, predicted.round()) * 100
print("Accuracy using XGBoost on validation data is {} %".format(train_score))

In [None]:
# Accuracy of the XGBoost model on the data it was trained on (to compare
# against the validation accuracy). accuracy_score takes (y_true, y_pred);
# the message now names the model that is being scored.
predicted = xgc.predict_proba(x_train)[:, 1]
train_score = accuracy_score(y_train, predicted.round()) * 100
print("Accuracy using XGBoost on training data is {} %".format(train_score))

In [None]:
# XGBoost predictions for the test rows; this is the last assignment to
# predicted_val before the submission frame is built.
predicted_val = xgc.predict(test_data[predictor_cols])
predicted_val=predicted_val.round()

# MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Three-hidden-layer MLP trained on the training split; prints the accuracy
# on the training split first and on the validation split second.
clf = MLPClassifier(
    random_state=1,
    max_iter=400,
    hidden_layer_sizes=(150, 100, 50),
    activation="relu",
    learning_rate_init=0.05,
)
clf.fit(x_train, y_train)
pred = clf.predict(x_train)
pred2 = clf.predict(x_val)
for predictions, truth in ((pred, y_train), (pred2, y_val)):
    print(accuracy_score(predictions, truth))

In [None]:
#predicted_val=clf.predict(test_data[predictor_cols])

In [None]:
# Build the submission frame: one row per test passenger with the predicted
# Transported value, converted to bool.
my_submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': predicted_val,
})
my_submission['Transported'] = my_submission['Transported'].astype(bool)

In [None]:
# Display the submission frame.
my_submission

In [None]:
# Write the submission to submission.csv in the working directory, without the row index.
my_submission.to_csv('submission.csv', index=False)