![](https://imgk.timesnownews.com/story/artist_impression_spaceship.jpg?tr=w-600,h-450,fo-auto)

## Importing Data and preprocessing

In [None]:
#import necessary libraries 
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import plotly as py 
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
# Global seaborn styling for every figure drawn below: dark-grid style, fonts scaled by 1.4
sns.set(style='darkgrid', font_scale=1.4)

In [None]:
# Read the train and test CSV files of the spaceship-titanic dataset
df_train = pd.read_csv("../input/spaceship-titanic/train.csv")
df_test = pd.read_csv("../input/spaceship-titanic/test.csv")
# Preview the first rows of the training frame
df_train.head()

In [None]:
# Number of distinct Cabin values in the training data (a missing value counts as one value)
df_train['Cabin'].nunique(dropna=False)

In [None]:
#Preview the first rows of the test data
df_test.head()

In [None]:
#Summarise the training data: column dtypes and non-null counts
df_train.info()

In [None]:
#Summarise the test data: column dtypes and non-null counts
df_test.info()

In [None]:
# Check how the target label (Transported) is distributed in the training data.
# The column is passed by keyword (data= / x=), like every other countplot in this
# notebook: recent seaborn releases interpret the first positional argument as
# `data`, so passing the Series positionally no longer plots the intended counts.
sns.countplot(data=df_train, x='Transported')
plt.show()

In [None]:
# Share of transported vs. not-transported passengers, drawn as a pie chart
plt.figure(figsize=(6, 6))

transported_counts = df_train['Transported'].value_counts()
pie_ax = transported_counts.plot.pie(
    explode=[0.1, 0.1],
    autopct='%1.1f%%',
    shadow=True,
    textprops={'fontsize': 16},
)
pie_ax.set_title("Transported")

#### We notice that our label is almost equally distributed

In [None]:
# Age distribution of the passengers, split by the Transported label
plt.figure(figsize=(10, 4))

# One-year-wide bins with a KDE curve overlaid for each label value
age_ax = sns.histplot(
    data=df_train,
    x='Age',
    hue='Transported',
    binwidth=1,
    kde=True,
)

# Title and x-axis label
age_ax.set_title('AGE Distrbution')
age_ax.set_xlabel('Age vs Survivors')

#### Passengers between 1 and 20 years old have a higher chance of being transported, especially very young children (1-5 years old), while the peak of passengers who were not transported lies between 20 and 25 years old

In [None]:
# Transported vs. not-transported passenger counts for each home planet
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='HomePlanet', hue='Transported', data=df_train, ax=ax)

#### We found that people from Earth have a lower chance of being transported, whereas people from Europa have a higher chance; people from Mars have a roughly equal chance!

In [None]:
# Transported vs. not-transported passenger counts by CryoSleep status
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='CryoSleep', hue='Transported', data=df_train, ax=ax)

#### Passengers who chose to be put into cryosleep have a higher chance of being transported than those who chose not to

In [None]:
# Transported vs. not-transported passenger counts for each destination
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='Destination', hue='Transported', data=df_train, ax=ax)

#### People who were heading to "TRAPPIST-1e" have a lower chance of being transported, while people heading to "PSO J318.5-22" have an equal chance. Finally, people heading to "55 Cancri e" have a higher chance of being transported

In [None]:
# Transported vs. not-transported passenger counts by VIP status
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='VIP', hue='Transported', data=df_train, ax=ax)

#### We notice that the VIP service makes no difference to whether a passenger is transported or not!

## Preprocessing

In [None]:
#Keep the test PassengerId column aside: it is dropped from df_test during preprocessing
#but is needed again to build the submission file
df_test_id = df_test['PassengerId']

In [None]:
# PassengerId is split on "_" and the first part is kept as a new "Group" column;
# the raw id column is then dropped from both frames.
# The split is vectorised with the .str accessor and assigned as a whole column.
# The previous element-wise `df['Group'][i] = ...` writes were chained assignments:
# they are slow, raise SettingWithCopyWarning, and do not reach the frame at all
# when pandas copy-on-write is active.
df_train['Group'] = df_train['PassengerId'].str.split('_').str[0]
df_test['Group'] = df_test['PassengerId'].str.split('_').str[0]

df_train = df_train.drop(columns='PassengerId')
df_test = df_test.drop(columns='PassengerId')

In [None]:
#Re-check column dtypes and non-null counts of the training data
df_train.info()

In [None]:
# Replace missing Cabin values with the placeholder 'N/0/N' so that every cabin
# string still has three '/'-separated parts.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# operates on a selected column (chained in-place call), which pandas deprecates
# and which leaves the frame untouched under copy-on-write.
df_train['Cabin'] = df_train['Cabin'].fillna('N/0/N')
df_test['Cabin'] = df_test['Cabin'].fillna('N/0/N')

In [None]:
# Cabin is split on "/": the first part becomes the "Deck" column and the third
# part becomes the "Side" column; the raw Cabin column is then dropped.
# The split is done once per frame with the vectorised .str accessor instead of
# element-wise chained assignments (`df['Deck'][i] = ...`), which are slow and
# do not update the frame when pandas copy-on-write is active.
train_cabin_parts = df_train['Cabin'].str.split('/', expand=True)
test_cabin_parts = df_test['Cabin'].str.split('/', expand=True)

df_train['Deck'] = train_cabin_parts[0]
df_train['Side'] = train_cabin_parts[2]
df_test['Deck'] = test_cabin_parts[0]
df_test['Side'] = test_cabin_parts[2]

df_train = df_train.drop(columns='Cabin')
df_test = df_test.drop(columns='Cabin')

### Dealing with missing Values

In [None]:
#Check column dtypes and non-null counts of the training data before imputing
df_train.info()

In [None]:
#Count the missing values in each training column
df_train.isnull().sum()

In [None]:
# Impute missing ages with the mean age of the training data.
# The training mean is reused for the test frame so that both frames are imputed
# with the same statistic. The result is assigned back to the column because
# `df[col].fillna(..., inplace=True)` is a chained in-place call that pandas
# deprecates and that does not update the frame under copy-on-write.
age_mean = df_train['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(age_mean)
df_test['Age'] = df_test['Age'].fillna(age_mean)

In [None]:
# Missing values in the expense columns are filled with zero.
# All five columns are filled in one assignment per frame; this replaces ten
# copy-pasted `df[col].fillna(..., inplace=True)` calls, which are chained
# in-place operations that do not update the frame under pandas copy-on-write.
EXPENSE_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_train[EXPENSE_COLUMNS] = df_train[EXPENSE_COLUMNS].fillna(0)
df_test[EXPENSE_COLUMNS] = df_test[EXPENSE_COLUMNS].fillna(0)

In [None]:
# The Name column is not used as a feature, so remove it from both frames in place
for frame in (df_train, df_test):
    frame.drop(columns='Name', inplace=True)

In [None]:
#Count the remaining missing values in each training column
df_train.isnull().sum()

In [None]:
# For HomePlanet, CryoSleep, Destination and VIP, missing values are filled with
# the most frequent value of the column in the training data.
# The training mode is reused for the test frame so both frames are imputed the
# same way, and each result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`, a chained in-place call that does not
# update the frame under pandas copy-on-write.
for column in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    most_frequent = df_train[column].mode()[0]
    df_train[column] = df_train[column].fillna(most_frequent)
    df_test[column] = df_test[column].fillna(most_frequent)

### We are now done dealing with missing values

## Encoding Categorical Values

In [None]:
#Check column dtypes of the training data to see which columns still need encoding
df_train.info()

In [None]:
# One-hot encode the low-cardinality categorical features (HomePlanet, Destination,
# Deck, Side). The high-cardinality "Group" feature is label-encoded in a later cell.
CATEGORICAL_COLUMNS = ['HomePlanet', 'Destination', 'Deck', 'Side']

# dtype=int keeps the indicator columns numeric; with boolean indicators the frame
# mixes bool and numeric columns and converts to an object-dtype array.
one_hot_encoded_training_predictors = pd.get_dummies(df_train[CATEGORICAL_COLUMNS], dtype=int)

# The test indicators are re-indexed to the training indicator columns: a category
# absent from the test data becomes an all-zero column, and a category seen only in
# the test data is dropped, so both frames end up with identical feature columns.
one_hot_encoded_testing_predictors = pd.get_dummies(
    df_test[CATEGORICAL_COLUMNS], dtype=int
).reindex(columns=one_hot_encoded_training_predictors.columns, fill_value=0)

df_train = df_train.drop(columns=CATEGORICAL_COLUMNS).join(one_hot_encoded_training_predictors)
df_test = df_test.drop(columns=CATEGORICAL_COLUMNS).join(one_hot_encoded_testing_predictors)

In [None]:
#Check the column dtypes of the training data after one-hot encoding
df_train.info()

In [None]:
# Cast the boolean columns to integers (the test frame has no Transported column)
for flag in ('CryoSleep', 'VIP', 'Transported'):
    df_train[flag] = df_train[flag].astype(int)
for flag in ('CryoSleep', 'VIP'):
    df_test[flag] = df_test[flag].astype(int)

In [None]:
# Finally, label-encode "Group".
# The encoder is fitted once on the group ids of BOTH frames and then applied to
# each of them. Re-fitting it on the test frame (as was done before) assigns codes
# from an unrelated mapping, so a code in the test data would not mean the same
# thing as the same code in the training data.
le = preprocessing.LabelEncoder()
le.fit(pd.concat([df_train['Group'], df_test['Group']]))
df_train['Group'] = le.transform(df_train['Group'])
df_test['Group'] = le.transform(df_test['Group'])

In [None]:
#Check the column dtypes of the fully preprocessed test data
df_test.info()

## Model

In [None]:
# Separate the label from the features, then hold out 20% of the labelled rows
# (fixed seed) for evaluating the models below
y = df_train['Transported']
X = df_train.drop(columns="Transported")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

### Logistic Regression

In [None]:
# Fit a logistic regression on the training split and predict the hold-out split
logmodel = LogisticRegression().fit(X_train, y_train)
predicted_lr = logmodel.predict(X_test)

# Report the confusion matrix, the per-class metrics and the accuracy, in that order
lr_confusion = confusion_matrix(y_test, predicted_lr)
print(lr_confusion)
lr_report = classification_report(y_test, predicted_lr)
print(lr_report)
lr_accuracy = logmodel.score(X_test, y_test)
print(lr_accuracy)

### K-Nearest Neighbor Classifier

In [None]:
# K-nearest-neighbours baseline with the library's default number of neighbours
KN_calssifier = KNeighborsClassifier().fit(X_train, y_train)
predicted_kn = KN_calssifier.predict(X_test)

# Confusion matrix first, then the per-class metrics
knn_confusion = confusion_matrix(y_test, predicted_kn)
print(knn_confusion)
knn_report = classification_report(y_test, predicted_kn)
print(knn_report)

In [None]:
# Sweep K from 1 to 59 and record the hold-out misclassification rate for each K
error_rate = []
for i in range(1, 60):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    misclassified = pred_i != y_test
    error_rate.append(np.mean(misclassified))

In [None]:
# Plot the misclassification rate recorded for every K of the sweep
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 60),
    error_rate,
    color='blue',
    linestyle='--',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Error Rate vs K')
ax.set_xlabel('K')
ax.set_ylabel("Error Rate")

#### We found the best K-value = 34

In [None]:
# Refit KNN with the K chosen from the error-rate plot (K = 34)
KN_calssifier = KNeighborsClassifier(n_neighbors=34).fit(X_train, y_train)
predicted_kn = KN_calssifier.predict(X_test)

# Per-class metrics first, then the confusion matrix, then the accuracy
tuned_report = classification_report(y_test, predicted_kn)
print(tuned_report)
tuned_confusion = confusion_matrix(y_test, predicted_kn)
print(tuned_confusion)
tuned_accuracy = KN_calssifier.score(X_test, y_test)
print(tuned_accuracy)

### Decision Tree Classifier

In [None]:
# Decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so that the fitted tree
# and the reported scores are reproducible after a kernel restart.
dtree = DecisionTreeClassifier(random_state=40)
dtree.fit(X_train, y_train)
predictions_dt = dtree.predict(X_test)
# Confusion matrix, per-class metrics and accuracy on the hold-out split
print(confusion_matrix(y_test, predictions_dt))
print(classification_report(y_test, predictions_dt))
print(dtree.score(X_test, y_test))

### Random Forest Classifier

In [None]:
# Random forest with 300 trees.
# random_state is fixed (same seed as the train/test split) so that the bootstrap
# samples, the fitted forest and the reported scores are reproducible.
rfc = RandomForestClassifier(n_estimators=300, random_state=40)
rfc.fit(X_train, y_train)
predictions_rf = rfc.predict(X_test)
# Confusion matrix, per-class metrics and accuracy on the hold-out split
print(confusion_matrix(y_test, predictions_rf))
print(classification_report(y_test, predictions_rf))
print(rfc.score(X_test, y_test))

### Naive Bayes Classifier

In [None]:
# Gaussian naive Bayes fitted on the training split, evaluated on the hold-out split
NB = GaussianNB().fit(X_train, y_train)
predictions_nb = NB.predict(X_test)

nb_confusion = confusion_matrix(y_test, predictions_nb)
print(nb_confusion)
nb_report = classification_report(y_test, predictions_nb)
print(nb_report)
nb_accuracy = NB.score(X_test, y_test)
print(nb_accuracy)

### Support Vector Classifier

In [None]:
# Support vector classifier; the features are standardised inside the pipeline
# before they reach the SVC
svc = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
).fit(X_train, y_train)
predictions_svc = svc.predict(X_test)

svc_confusion = confusion_matrix(y_test, predictions_svc)
print(svc_confusion)
svc_report = classification_report(y_test, predictions_svc)
print(svc_report)
svc_accuracy = svc.score(X_test, y_test)
print(svc_accuracy)

### Stochastic Gradient Descent

In [None]:
# Linear classifier trained with stochastic gradient descent on standardised features.
# random_state is fixed (same seed as the train/test split): SGD shuffles the data
# between epochs, so without a seed the reported scores change on every run.
sgd = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, random_state=40))
sgd.fit(X_train, y_train)
predictions_sgd = sgd.predict(X_test)
# Confusion matrix, per-class metrics and accuracy on the hold-out split
print(confusion_matrix(y_test, predictions_sgd))
print(classification_report(y_test, predictions_sgd))
print(sgd.score(X_test, y_test))

### Neural Network

In [None]:
# Fully connected binary classifier: four ReLU hidden layers (128-64-32-16 units),
# dropout of 0.2 after hidden layers 2-4, and a single sigmoid output unit
model = Sequential([
    # Hidden layer 1, sized to the number of feature columns on its input side
    Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
    # Hidden layer 2
    Dense(64, activation="relu"),
    Dropout(0.2),
    # Hidden layer 3
    Dense(32, activation="relu"),
    Dropout(0.2),
    # Hidden layer 4
    Dense(16, activation="relu"),
    Dropout(0.2),
    # Output layer
    Dense(1, activation="sigmoid"),
])

model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimiser, tracking accuracy
model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)
# Stop once val_loss has not improved for 10 epochs and restore the best weights
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
)
# The hold-out split serves as the validation data monitored by early stopping
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=200,
    batch_size=64,
    callbacks=[early_stop],
)

In [None]:
# Loss and accuracy of the trained network on the hold-out split, rounded to 3 decimals
validation_loss, validation_accuracy = model.evaluate(X_test, y_test, batch_size=32)
rounded_loss = np.round(validation_loss, 3)
rounded_accuracy = np.round(validation_accuracy, 3)
print(f"Loss: {rounded_loss}")
print(f"Accuracy: {rounded_accuracy}")

In [None]:
#Compare the hold-out accuracy (in percent) of all models in one table
from tabulate import tabulate
table = [
    ['Model', 'Accuracy %'],
    ['Logistic Regression', logmodel.score(X_test, y_test) * 100],
    ['KNN-Classifier', KN_calssifier.score(X_test, y_test) * 100],
    ['Decision Tree', dtree.score(X_test, y_test) * 100],
    ['Random Forest', rfc.score(X_test, y_test) * 100],
    ['Naive Bayese', NB.score(X_test, y_test) * 100],
    ['Support Vector Classifier', svc.score(X_test, y_test) * 100],
    ['Stochastics Gradient Descent', sgd.score(X_test, y_test) * 100],
    ['Neural Network', validation_accuracy * 100],
]
# headers="firstrow" makes tabulate render the first row as the column header;
# without it the header row is printed as if it were one more model.
print(tabulate(table, headers="firstrow"))

#### We notice that the highest accuracy (about 81%) comes from the SVC algorithm, so we will go with it

## Predicting Test Data

In [None]:
#Predicting Test data with the selected SVC pipeline.
# The test frame is re-indexed to exactly the feature columns (and column order)
# the model was fitted on: an indicator column missing from the test data becomes
# all zeros and a column unknown to the model is dropped, so prediction cannot fail
# on a feature mismatch between the two frames.
Transort_predicted = svc.predict(df_test.reindex(columns=X_train.columns, fill_value=0))

In [None]:
#Start the submission table from the saved test PassengerId series (converted to a one-column DataFrame)
submission = df_test_id.to_frame()

In [None]:
# Turn the predicted 1/0 labels into Python booleans (True/False)
Transort_predicted = list(map(bool, Transort_predicted))

In [None]:
# Attach the boolean predictions to the submission table as the 'Transported' column
submission['Transported'] = Transort_predicted

In [None]:
# Display the submission indexed by PassengerId. The result is not assigned, so
# `submission` itself keeps PassengerId as a regular column.
submission.set_index('PassengerId')

In [None]:
#Exporting file as .csv
# The file name now carries the .csv extension announced above (it was previously
# written as an extension-less file called 'Submission'). index=False keeps the
# row index out of the file; PassengerId is a regular column and is written.
submission.to_csv('submission.csv', index = False)

## Thanks