In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load Datasets

In [None]:
#load datasets
#read the training and test CSV files into two DataFrames
train,test=(
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    )
)

Explore the Dataset

In [None]:
#Exploratory data analysis
#preview the first 20 training rows; other quick checks that can be swapped in
#here are test.describe(), train.info() and train.isnull().sum()
train.iloc[:20]


Visualize Data

In [None]:
#import visualization modules
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#visualize data
#count plot of each categorical feature, split by the Transported target
for feature in ("HomePlanet","Destination","VIP"):
    sns.catplot(x=feature,data=train,kind='count',hue='Transported')
#overall count of each Transported value
sns.catplot(x="Transported",data=train,kind='count')


Data Cleaning and Feature Engineering

In [None]:
#split the Cabin string on '/' into three new columns
cabin_parts=train['Cabin'].str.split('/',expand=True)
train[['Dec','Num','Side']]=cabin_parts

#split the PassengerId string on '_' into a group number and a per-group counter
id_parts=train['PassengerId'].str.split('_',expand=True)
train[['Group_No','People']]=id_parts

#People comes out of the split as text; store it as a float column
train['People']=train['People'].astype('float64')


In [None]:
#fill null values with mean, mode
#The filled column is assigned back to the frame instead of calling
#train[col].fillna(..., inplace=True): that form mutates a temporary Series,
#is deprecated in recent pandas and does nothing once Copy-on-Write is enabled.

#categorical / boolean columns -> most frequent value
mode_filled=['HomePlanet','CryoSleep','Destination','VIP','Side','Dec']
#numeric columns -> column mean
mean_filled=['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Age']

for col in mode_filled:
    train[col]=train[col].fillna(train[col].mode()[0])

for col in mean_filled:
    train[col]=train[col].fillna(train[col].mean())

In [None]:
#create new columns
#bucket Age into four labelled ranges: (0,17], (17,25], (25,65], (65,99]
category=pd.cut(train.Age,bins=[0.0,17,25,65,99],labels=['Child','Mini_Adult','Adult','Elderly'])
#ages outside every bin (e.g. exactly 0, since the left edge is exclusive) come
#back as NaN; label them 'Child' as before. This is done on the new Series
#rather than with a chained inplace fillna on train['Group'], which is
#unreliable in recent pandas.
category=category.fillna('Child')

#keep the column at position 5, but overwrite instead of inserting when the
#cell is re-run (DataFrame.insert raises if the column already exists)
if 'Group' in train.columns:
    train['Group']=category
else:
    train.insert(5,'Group',category)

#Combine services  paid
train['Paid']=(
    train['RoomService']
    +train['FoodCourt']
    +train['ShoppingMall']
    +train['Spa']
    +train['VRDeck']
)

In [None]:
#categorical encoding using ordinal encoder
from sklearn.preprocessing import OrdinalEncoder
ordinal=OrdinalEncoder()
columns=['HomePlanet','CryoSleep','VIP','Dec','Side','Destination','Transported']
for col in columns:
    #the encoder is refit for every column; fit_transform returns an (n, 1)
    #array, so flatten it before storing it as a single column
    train[col]=ordinal.fit_transform(train[col].values.reshape(-1,1)).ravel()

#encode the age-group labels by their natural order.
#An explicit mapping assigned back to the frame replaces the chained
#train['Group'].replace(..., inplace=True), which is unreliable in recent
#pandas and left the column's dtype dependent on the pandas version.
#Values that are not one of the four labels (already-encoded numbers on a
#re-run, NaN) are passed through unchanged.
group_codes={'Child':0,'Mini_Adult':1,'Adult':2,'Elderly':3}
train['Group']=(
    train['Group']
    .astype(object)
    .map(lambda label: group_codes.get(label,label))
    .astype('float64')
)

In [None]:
#drop columns
#identifiers and the raw columns whose information now lives in derived features
unused_columns=['PassengerId','Name','Cabin','Group_No','Age','Num']
train=train.drop(columns=unused_columns)



In [None]:
#import modelling modules
#in this case we will use Logistic regression, Random forest and Decision Tree
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process.kernels import RBF
import mlxtend
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance

#one [model name, train score, test score, |train - test|] row is appended per fitted model
result: list = []

In [None]:
#visualization matrix
#pairwise correlation heatmap of the numeric columns.
#Non-numeric columns are filtered out explicitly: older pandas dropped them
#silently inside corr(), while pandas >= 2.0 raises on them.
fig,ax=plt.subplots(figsize=(10,9))
numeric_train=train.select_dtypes(include=['number','bool'])
ax=sns.heatmap(numeric_train.corr(), annot=True, fmt=".2f",cmap='cool',ax=ax)
ax.set_title('Feature correlation matrix')

#render the figure (printing the Axes object only showed its repr)
plt.show()


Modelling

In [None]:
#modelling
#in this case we will use Logistic regression, Random Forest and Decision Tree
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import mlxtend
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance

#start from an empty score list; each model cell appends one
#[model name, train score, test score, |train - test|] row
result: list = []

In [None]:
#normalize data
#min-max scaling per column: (value - column min) / (column max - column min)
col_min=train.min()
col_span=train.max()-col_min
ntrain=(train-col_min)/col_span

In [None]:
#split training dataset
#target is the Transported column; features are every other normalized column
Y=ntrain["Transported"]
X=ntrain.drop(columns="Transported")
#hold out 30% of the rows for evaluation, with a fixed seed
X_train,X_test,Y_train,Y_test=train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [None]:
#logistic Regression
#fit with default settings, then record accuracy on both splits and their gap
logmodel=LogisticRegression().fit(X_train,Y_train)
y_pred=logmodel.predict(X_test)

log_train_acc=logmodel.score(X_train,Y_train)
log_test_acc=logmodel.score(X_test,Y_test)
score=["Logistic Regression",log_train_acc,log_test_acc,abs(log_train_acc-log_test_acc)]
result.append(score)


In [None]:
# Decision Tree
#parameter tuning
dct_param={
    'max_depth':[5,8,10,15],
    'min_samples_split':[3,5,10,15],
    #weight fractions equivalent to 0, 1, 2 and 3 training rows
    'min_weight_fraction_leaf':[0,1/len(train),2/len(train),3/len(train)],
    #'auto' is omitted: for classifiers it was an alias of 'sqrt' (a duplicate
    #candidate) and it was removed in scikit-learn 1.3, where it makes every fit fail
    'max_features':['sqrt','log2'],
}
#seed the tree as well as the search so the selected model is reproducible
shc=HalvingGridSearchCV(
    DecisionTreeClassifier(random_state=100),
    dct_param,
    factor=3,
    random_state=100,
).fit(X_train,Y_train)

#best_estimator_ has already been refit on all of X_train by the search
#(refit=True is the default), so no second fit is needed
dct=shc.best_estimator_

y_pred=dct.predict(X_test)
dct_train_acc=dct.score(X_train,Y_train)
dct_test_acc=dct.score(X_test,Y_test)
score1=["Decision Tree",dct_train_acc,dct_test_acc,abs(dct_train_acc-dct_test_acc)]
result.append (score1)

In [None]:
#Random forest
#parameter tuning
random_param={
    'max_depth':[8,10,15],
    'min_samples_split':[5,10,20,50],
    #'auto' is omitted: for classifiers it was an alias of 'sqrt' (a duplicate
    #candidate) and it was removed in scikit-learn 1.3, where it makes every fit fail
    'max_features':['sqrt','log2'],
    #warm_start is no longer searched: it only matters when fit() is called
    #again on the same estimator, and the search fits a fresh clone per
    #candidate, so True/False were identical models that doubled the grid
}
#seed the forest as well as the search so the selected model is reproducible
shc=HalvingGridSearchCV(
    RandomForestClassifier(random_state=100),
    random_param,
    factor=3,
    random_state=100,
).fit(X_train,Y_train)

#best_estimator_ has already been refit on all of X_train by the search
#(refit=True is the default), so no second fit is needed
random_forest=shc.best_estimator_

y_pred=random_forest.predict(X_test)
rf_train_acc=random_forest.score(X_train,Y_train)
rf_test_acc=random_forest.score(X_test,Y_test)
score2=["Random Forest",rf_train_acc,rf_test_acc,abs(rf_train_acc-rf_test_acc)]
result.append (score2)


In [None]:
#compare performance of the models
#Build the table under its own name: rebinding `result` to a DataFrame meant
#that re-running any model cell afterwards called .append on a DataFrame
#instead of on the score list.
result_table=pd.DataFrame(data=result,columns=['Model','Train Set Score', 'Test Set Score',' Error'])
result_table

In [None]:
#clean the test data
#Applies the same feature engineering as the training frame so the fitted
#models can score it.
#NOTE(review): imputation values and ordinal codes are still derived from the
#test set itself; the codes only line up with training if both sets contain
#the same category values -- confirm.

#strip cabin data
test[['Dec','Num','Side']]=test['Cabin'].str.split('/',expand=True)
#strip group number from passenger id
test[['Group_No','People']]=test['PassengerId'].str.split('_',expand=True)
test['People']=test['People'].astype('float64')

#keep the passenger ids for the submission file
index=pd.DataFrame(test['PassengerId'])

#fill null values with mean, mode
#(assigned back rather than chained fillna(inplace=True), which is deprecated
#and has no effect under pandas Copy-on-Write)
for col in ['HomePlanet','CryoSleep','Destination','VIP','Side','Dec']:
    test[col]=test[col].fillna(test[col].mode()[0])
for col in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Age']:
    test[col]=test[col].fillna(test[col].mean())

#categorical encoding using ordinal encoder
from sklearn.preprocessing import OrdinalEncoder
ordinal=OrdinalEncoder()
columns=['HomePlanet','CryoSleep','VIP','Dec','Side','Destination']
for col in columns:
    #fit_transform returns an (n, 1) array; flatten it into a single column
    test[col]=ordinal.fit_transform(test[col].values.reshape(-1,1)).ravel()

#create new column: age group, encoded by its natural order
category=pd.cut(test['Age'],bins=[0.0,17,25,65,99],labels=['Child','Mini_Adult','Adult','Elderly'])
#ages outside every bin (e.g. exactly 0) come back as NaN; label them 'Child' as before
category=category.fillna('Child')
group_codes={'Child':0,'Mini_Adult':1,'Adult':2,'Elderly':3}
group_encoded=category.astype(object).map(group_codes).astype('float64')
if 'Group' in test.columns:
    test['Group']=group_encoded
else:
    test.insert(5,'Group',group_encoded)

#Combine services  paid
test['Paid']=test['RoomService']+test['FoodCourt']+test['ShoppingMall']+test['Spa']+test['VRDeck']

#drop columns
test=test.drop(['PassengerId',"Name",'Cabin','Group_No','Num','Age'],axis=1)

#use exactly the training feature columns, in the training order
feature_cols=train.columns.drop('Transported')
test=test[feature_cols]

#normalize with the TRAINING min/max: the models were fitted on features scaled
#by the training statistics, so the test set has to be put on that same scale
train_min=train[feature_cols].min().astype('float64')
train_span=train[feature_cols].max().astype('float64')-train_min
ntest=(test-train_min)/train_span


In [None]:
#summary of the cleaned test frame (columns, dtypes, non-null counts) as a final check
test.info()

Submission

In [None]:
# we will use Random forest as it performed best with the train dataset
# (the prediction previously came from the logistic-regression model, which
# contradicted the choice stated above)
#the models were trained on a 0.0/1.0 target, so cast the predicted labels to booleans
transported=random_forest.predict(ntest).astype(bool)

#pair each PassengerId with its prediction; booleans are written to the CSV as
#True/False, so no chained inplace replace with strings is needed
prediction=index.assign(Transported=transported)
prediction.to_csv("submissionsPT2.csv",index=False)