###### Reference
https://www.kaggle.com/code/javigallego/titanic-spaceship-eda/notebook?scriptVersionId=90210061


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)
pd.set_option('display.expand_frame_repr', False)
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)


from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
# Show every row when a frame is displayed. The fully qualified name is used
# (as for 'display.max_columns' above) because the bare 'max_rows' pattern can
# match more than one pandas option and is then rejected as ambiguous.
pd.set_option('display.max_rows', None)

In [None]:
#Helper functions

#Create function for missing data analysis
def draw_missing_data_table(df):
    """Summarise the missing values of every column of `df`.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null cells) and 'Percent' (null cells divided by the
    number of rows, i.e. a fraction between 0 and 1). Both are sorted in
    descending order before being combined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [None]:
#Plot learning curve
def plot_learning_curve(estimator, title, x,y,ylim=None, cv=None,n_jobs=1,train_sizes=np.linspace(.1,1,5)):
    """Plot training and validation scores against the number of training examples.

    Parameters
    ----------
    estimator : estimator passed to sklearn's ``learning_curve``.
    title : title of the figure.
    x, y : features and target passed to ``learning_curve``.
    ylim : optional (ymin, ymax) pair for the y axis; left automatic when None.
    cv, n_jobs, train_sizes : forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    # Only the axis limits depend on ylim; the curves are drawn in every case.
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    # Scores have one row per training size and one column per CV fold.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    # +/- one standard deviation band around each curve, in the curve's colour.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Validation score')
    plt.legend(loc="best")
    return plt

In [None]:
#plot validation curve
def plot_validation_curve(estimator, title,x,y,param_name,param_range,ylim=None,cv=None,n_jobs=1,train_sizes=np.linspace(.1,1,5)):
    """Plot training and validation scores against the values of one hyper-parameter.

    Parameters
    ----------
    estimator : estimator passed to sklearn's ``validation_curve``.
    title : title of the plot.
    x, y : features and target passed to ``validation_curve``.
    param_name, param_range : hyper-parameter name and the values to evaluate.
    ylim : optional y-axis limits; left automatic when None.
    cv, n_jobs : forwarded to ``validation_curve``.
    train_sizes : accepted for signature compatibility only; not used.

    The x axis is logarithmic. Draws on the current matplotlib axes and returns None.
    """
    # Everything after (estimator, x, y) is passed by keyword so that `cv`
    # cannot end up in another parameter's positional slot.
    train_scores, test_scores = validation_curve(
        estimator, x, y,
        param_name=param_name, param_range=param_range, cv=cv, n_jobs=n_jobs,
    )
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.title(title)
    plt.plot(param_range, train_mean, color='r', marker='o', markersize=5, label='Training Score')
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='r')
    plt.plot(param_range, test_mean, color='g', linestyle='--', marker='s', markersize=5, label='Validation score')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='g')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='best')
    plt.xlabel('Parameter')
    plt.ylabel('Score')
    if ylim is not None:
        plt.ylim(ylim)

In [None]:
# Build the paths of the train/test CSVs inside a 'spaceship titanic' folder
# located in the current working directory.
curr_path = os.getcwd()
dataset_src = os.path.join(curr_path, 'spaceship titanic')
train_path, test_path = (
    os.path.join(dataset_src, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Load the competition data from the Kaggle input directory.
# `df` is the training frame as read; `df_raw` and `train_df` are independent
# copies of it, so later edits to `train_df` leave `df` and `df_raw` untouched.
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
df_raw, train_df = df.copy(), df.copy()
df.head()

Exploratory Data Analysis
Not all features provide useful information for the model, so it is essential to find out which features are important for the analysis.


In [None]:
# Check for missing values: list each column's dtype and non-null count
# (a non-null count below the number of rows means the column has gaps).
train_df.info(show_counts=True)

In [None]:
def plot_missing_data(dataset, title):
    """Draw `dataset` as a heatmap without colour bar on a new 5x5-inch figure.

    The callers in this notebook pass a boolean null mask (``frame.isnull()``),
    which makes the missing cells of each column stand out. Returns None.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.set_title(title)
    sns.heatmap(dataset, cbar=False, ax=ax)

In [None]:
# Heatmap of the null mask of the training frame as loaded (`df`).
plot_missing_data(df.isnull(),"Training Dataset")

In [None]:
# Heatmap of the null mask of the test frame.
plot_missing_data(test_df.isnull(),"Test Dataset")

In [None]:
# Number and fraction of missing values in each column of the training data.
draw_missing_data_table(train_df)

In [None]:
# Preview the training columns stored with object dtype.

train_df.select_dtypes(['object']).head()

In [None]:
# Preview the test columns stored with object dtype.
test_df.select_dtypes(['object']).head()

In [None]:
def filling_HomePlanet(df):
    """Fill missing 'HomePlanet' values with the column's most frequent value.

    `df` is modified in place and also returned.
    """
    planet_counts = df['HomePlanet'].value_counts()
    most_common_planet = planet_counts.index[0]  # value_counts sorts by frequency, highest first
    df['HomePlanet'] = df['HomePlanet'].fillna(most_common_planet)
    return df

In [None]:
def filling_CryoSleep(df):
    """Fill missing 'CryoSleep' values with False and store the column as bool.

    Rationale used by this notebook: a passenger who elected suspended
    animation would rarely have the value missing, so a missing value is
    treated as "not in cryo sleep".

    `df` is modified in place and also returned.
    """
    # The explicit cast guarantees a bool column; without it the dtype of the
    # filled column depends on how the pandas version treats object columns.
    df['CryoSleep'] = df['CryoSleep'].fillna(False).astype(bool)
    return df

###### Cabin

As shown in the report, this feature is categorical. Because it is almost impossible to estimate a passenger's cabin number in the given format, we split the cabin into three separate features describing the deck, the cabin number and the side. This is where feature engineering starts (it is continued in detail later). Next, missing values of the deck feature are replaced with F, the most frequent value. Missing values of the side feature are then filled with the most frequent side among deck-F cabins. Finally, missing cabin numbers are filled with half of the maximum cabin number, because within one deck type the survival rate of a cabin may depend on whether it is one of the first or last cabins.

In [None]:
#Cabin

def split_Cabin(df):
    """Replace 'Cabin' by its three "/"-separated parts.

    The parts are stored, in order, in the new columns 'Deck', 'Number' and
    'Side' (all as text; rows with a missing cabin stay missing), and the
    'Cabin' column is removed. `df` is modified in place and also returned.
    """
    cabin_parts = df['Cabin'].str.split("/", n=2, expand=True)
    for position, column in enumerate(('Deck', 'Number', 'Side')):
        df[column] = cabin_parts[position]
    df.pop('Cabin')
    return df

def filling_cabin(df, default_number=1976/2):
    """Impute the 'Deck', 'Side' and 'Number' columns produced from 'Cabin'.

    * 'Deck'   : missing values become 'F'.
    * 'Side'   : missing values become the most frequent side among deck-F rows.
    * 'Number' : converted to float; missing values become `default_number`.

    Parameters
    ----------
    df : DataFrame with 'Deck', 'Number' and 'Side' columns.
    default_number : value used for missing cabin numbers (default 1976/2 = 988).

    `df` is modified in place and also returned.
    """
    df['Deck'] = df['Deck'].fillna('F')
    side_mode = df.loc[df['Deck'] == 'F', 'Side'].value_counts().index[0]
    # Fill only the gaps: assigning the mode to the whole column would erase
    # every known side and turn the feature into a constant.
    df['Side'] = df['Side'].fillna(side_mode)
    df['Number'] = df['Number'].astype(float).fillna(default_number)
    return df

In [None]:
def filling_destination(df):
    """Fill missing 'Destination' values (the planet the passenger debarks to)
    with the column's most frequent value.

    `df` is modified in place and also returned.
    """
    destinations = df['Destination']
    most_frequent = destinations.value_counts().index[0]
    df['Destination'] = destinations.fillna(most_frequent)
    return df

In [None]:
def filling_vip(df):
    """Fill missing 'VIP' values with False and store the column as bool.

    'VIP' tells whether the passenger paid for the special VIP service during
    the voyage; a missing value is treated as "no VIP service".

    `df` is modified in place and also returned.
    """
    # The explicit cast guarantees a bool column; without it the dtype of the
    # filled column depends on how the pandas version treats object columns.
    df['VIP'] = df['VIP'].fillna(False).astype(bool)
    return df

In [None]:
def filling_name(df):
    """Replace missing 'Name' values (first and last name of the passenger)
    with the placeholder string 'None', since a name cannot be guessed.

    `df` is modified in place and also returned.
    """
    name_missing = df['Name'].isna()
    df.loc[name_missing, 'Name'] = 'None'
    return df

In [None]:
def filling_categorical(df):
    """Run every categorical imputation step on `df`, in a fixed order.

    The cabin is split into Deck / Number / Side before those parts are
    imputed. Returns the frame returned by the last step.
    """
    imputation_steps = (
        filling_HomePlanet,
        filling_CryoSleep,
        split_Cabin,
        filling_cabin,
        filling_destination,
        filling_vip,
        filling_name,
    )
    for step in imputation_steps:
        df = step(df)
    return df

In [None]:
# Check continuous features: preview the float64 columns of the training data.
train_df.select_dtypes(['float64']).head()

In [None]:
# Preview the float64 columns of the test data.
test_df.select_dtypes(['float64']).head()

In [None]:
# Distribution of passenger age in the training data, in 10 bins.
train_df['Age'].hist(bins=10)

In [None]:
# Cross-tab frequency table: number of passengers per (HomePlanet, Transported) pair.
homeplanet_counts = (
    train_df.groupby(['HomePlanet', 'Transported'])['Transported'].count().unstack()
)
display(homeplanet_counts)

# Bar plot of the same counts, drawn on its own axes so that no other plot
# shares them.
fig, ax = plt.subplots()
sns.countplot(data=train_df, x='HomePlanet', hue='Transported', ax=ax)
ax.legend(title='Transported or not', loc='upper left', labels=['No', 'Yes'])

In [None]:
# Bar plot of passenger counts per CryoSleep value, split by Transported.
ax = sns.countplot(x='CryoSleep', hue='Transported', data=train_df)
ax.legend(labels=['No', 'Yes'], loc='upper left', title='Transported or not')

In [None]:
def filling_age(df):
    """Fill missing 'Age' values with the median age of the column.

    `df` is modified in place and also returned.
    """
    # .median() is the '50%' entry of describe(); asking for it by name avoids
    # depending on the position of that entry in the describe() output.
    median_age = df['Age'].median()
    df['Age'] = df['Age'].fillna(median_age)
    return df

In [None]:
def filling_luxury_features(df):
    """Fill missing spending amounts with 0.

    Applies to 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and 'VRDeck'.
    `df` is modified in place and also returned.
    """
    for amenity in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        df[amenity] = df[amenity].fillna(0)
    return df


In [None]:
def filling_numerical(df):
    """Impute all continuous features: age first, then the spending columns."""
    return filling_luxury_features(filling_age(df))

In [None]:
def filling_missing(df):
    """Impute every feature: categorical columns first, then continuous ones."""
    return filling_numerical(filling_categorical(df))

# Impute the training and the test data. The categorical step replaces
# 'Cabin' with the 'Deck', 'Number' and 'Side' columns, so these calls expect
# frames that still contain 'Cabin'.
train_df = filling_missing(train_df)
test_df= filling_missing(test_df)

In [None]:
# Confirm the imputation: number and fraction of missing values left in the test data.
draw_missing_data_table(test_df)

In [None]:
# Convert the imputed float features of the training data to integers.
# Columns are addressed by name through astype: attribute-style assignment to
# a misspelled name would only create a Python attribute and leave the real
# column (e.g. 'ShoppingMall') as float.
int_columns = ['RoomService', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_df = train_df.astype({column: int for column in int_columns})

In [None]:
# Convert the imputed float features of the test data to integers.
# Columns are addressed by name through astype: attribute-style assignment to
# a misspelled name would only create a Python attribute and leave the real
# column (e.g. 'ShoppingMall') as float.
int_columns = ['RoomService', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df = test_df.astype({column: int for column in int_columns})

In [None]:
# Feature engineering
# Base model

def score_dataset(x,y,model=None):
    """Return the mean 5-fold cross-validated accuracy of `model` on (x, y).

    Parameters
    ----------
    x : feature DataFrame. Object and bool columns are label-encoded on a
        copy, so the caller's frame is left unchanged. Label encoding suits
        tree models such as XGBoost and random forests; one-hot encoding would
        be the better choice for linear models.
    y : DataFrame with a 'Transported' column holding the target.
    model : estimator to evaluate. When None (default) a fresh
        ``XGBClassifier()`` is created for this call.

    Returns
    -------
    float : mean accuracy over the 5 folds.
    """
    if model is None:
        # Built per call so that calls never share one estimator instance.
        model = XGBClassifier()

    features = x.copy()
    for colname in features.select_dtypes(["object", "bool"]).columns:
        features[colname] = LabelEncoder().fit_transform(features[colname])
    # 1-D integer target, the shape scikit-learn expects for classification.
    target = LabelEncoder().fit_transform(y['Transported'])

    fold_scores = cross_val_score(
        model, features, target, cv=5, scoring="accuracy", n_jobs=-1
    )
    return fold_scores.mean()

# Baseline: score the imputed training data (rows with a known target only).
labelled_rows = train_df['Transported'].notna()
x = train_df[labelled_rows].copy()
y = x.pop('Transported').to_frame()
baseline_score = score_dataset(x, y)
print(f"Baseline score: {baseline_score:.5f} Accuracy")

In [None]:
# Replace Age by its decile bucket. The bin edges are computed on the training
# data only and reused for the test data, so a given age falls into the same
# bucket in both frames. labels=False stores the bucket number directly.
age_bin_edges = pd.qcut(train_df['Age'], 10, retbins=True)[1]
# Open the outer edges so that test ages outside the training range still get a bucket.
age_bin_edges[0], age_bin_edges[-1] = -np.inf, np.inf
train_df['Age'] = pd.cut(train_df['Age'], bins=age_bin_edges, labels=False)
test_df['Age'] = pd.cut(test_df['Age'], bins=age_bin_edges, labels=False)
test_df.head().style.set_properties(subset=['Age'], **{'background-color':'lightseagreen'})

In [None]:
# Family features: preview the two columns they are derived from
# ('PassengerId' is highlighted).

train_df[['Name','PassengerId']].head().style.set_properties(subset=['PassengerId'],**{'background-color':'lightgreen'})

In [None]:
# Add family features to the training data:
#   FamilyId    - the part of PassengerId before the first "_"
#   Family Name - the second word of Name
#   Family Size - number of passengers sharing both FamilyId and Family Name
train_df['FamilyId'] = train_df['PassengerId'].str.split("_", n=2, expand=True)[0]
train_df['Family Name'] = train_df['Name'].str.split(' ', n=2, expand=True)[1]

# One grouped count gives every row the size of its group. dropna=False keeps
# passengers without a family name grouped within their FamilyId instead of
# leaving their size undefined.
train_df['Family Size'] = (
    train_df.groupby(['FamilyId', 'Family Name'], dropna=False)['PassengerId']
    .transform('count')
)

# Column order: the two family keys first, 'Family Size' last.
family_keys = ['FamilyId', 'Family Name']
other_columns = [c for c in train_df.columns if c not in family_keys + ['Family Size']]
train_df = train_df[family_keys + other_columns + ['Family Size']]

train_df[['FamilyId','PassengerId','Family Name','Name','Family Size']].head().style.set_properties(subset=['FamilyId','Family Name','Family Size'], **{'background-color':'lightseagreen'})

In [None]:
# Add family features to the test data:
#   FamilyId    - the part of PassengerId before the first "_"
#   Family Name - the second word of Name
#   Family Size - number of passengers sharing both FamilyId and Family Name
test_df['FamilyId'] = test_df['PassengerId'].str.split("_", n=2, expand=True)[0]
test_df['Family Name'] = test_df['Name'].str.split(' ', n=2, expand=True)[1]

# One grouped count gives every row the size of its group. dropna=False keeps
# passengers without a family name grouped within their FamilyId instead of
# leaving their size undefined.
test_df['Family Size'] = (
    test_df.groupby(['FamilyId', 'Family Name'], dropna=False)['PassengerId']
    .transform('count')
)

# Column order: the two family keys first, 'Family Size' last.
family_keys = ['FamilyId', 'Family Name']
other_columns = [c for c in test_df.columns if c not in family_keys + ['Family Size']]
test_df = test_df[family_keys + other_columns + ['Family Size']]

test_df[['FamilyId','PassengerId','Family Name','Name','Family Size']].head().style.set_properties(subset=['FamilyId','Family Name','Family Size'], **{'background-color':'lightseagreen'})

In [None]:
#Boolean features encoding
# Bool columns are cast to 0/1 by assigning a converted frame. Calling
# replace(..., inplace=True) on a selected column would act on a temporary
# object, which is not guaranteed to change the frame itself.
boolean_col = train_df.select_dtypes(['bool']).columns
train_df = train_df.astype({column: 'int64' for column in boolean_col})
# The target is converted explicitly as well, whatever dtype it was read with.
train_df['Transported'] = train_df['Transported'].astype('int64')

boolean_col = test_df.select_dtypes(['bool']).columns
test_df = test_df.astype({column: 'int64' for column in boolean_col})

In [None]:
# Preview the test data after the boolean encoding step.
test_df.head()

In [None]:
# Categorical features encoding: label-encode every object/category column of
# the training data except PassengerId, each with its own encoder.
# NOTE(review): the test data is encoded separately with its own encoders, so
# the code assigned to a label is not guaranteed to match between the frames.
categorical_cols = (
    train_df.drop(columns='PassengerId')
    .select_dtypes(include=['object', 'category'])
    .columns
)
for colname in categorical_cols:
    encoder = LabelEncoder()
    train_df[colname] = encoder.fit_transform(train_df[colname])

In [None]:
# Preview the training data after label encoding.
train_df.head()

In [None]:
# Label-encode every object/category column of the test data except
# PassengerId, each with its own encoder.
# NOTE(review): these encoders are fitted on the test data only, so the code
# assigned to a label is not guaranteed to match the training data.
categorical_cols = (
    test_df.drop(columns='PassengerId')
    .select_dtypes(include=['object', 'category'])
    .columns
)
for colname in categorical_cols:
    encoder = LabelEncoder()
    test_df[colname] = encoder.fit_transform(test_df[colname])

In [None]:
# Preview the last rows of the test data after label encoding.
test_df.tail()

In [None]:
# Modelling
x=train_df.drop(columns=['Transported'])
y=train_df["Transported"]
# 80/20 hold-out split. The fixed seed makes the split (and every score
# computed on it) reproducible; stratifying keeps the share of transported
# passengers the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Encode on explicit copies so the assignments below never write through to `x`.
x_train, x_test = x_train.copy(), x_test.copy()

# Label-encode whatever object/bool columns are still present in each part.
for colname in x_train.select_dtypes(['object','bool']).columns:
  x_train[colname]=LabelEncoder().fit_transform(x_train[colname])

for colname in x_test.select_dtypes(['object','bool']).columns:
  x_test[colname]=LabelEncoder().fit_transform(x_test[colname])


In [None]:
# Preview the training part of the split.
x_train.head()

##### PassengerId is not going to be useful. Name and Family Name are redundant because we already have the FamilyId and Family Size features, so we should drop them.


In [None]:
# Remove the identifier and name columns from both parts of the split.
dropped_columns = ["PassengerId","Name","Family Name"]
x_train, x_test = (part.drop(columns=dropped_columns) for part in (x_train, x_test))


In [None]:
# (rows, columns) of the final training matrix.
x_train.shape

In [None]:
# Use a neural network to predict whether a passenger is transported
import tensorflow as tf
# Set random seed
tf.random.set_seed(42)

# Create a model: three ReLU hidden layers (60, 40, 20 units) with dropout
# after the first one, and a single sigmoid output unit.
model_1 = tf.keras.Sequential([
  tf.keras.layers.Dense(60, activation=tf.keras.activations.relu), # hidden layer 1, ReLU activation
  tf.keras.layers.Dropout(0.2),                                     # dropout with rate 0.2
  tf.keras.layers.Dense(40, activation=tf.keras.activations.relu), # hidden layer 2, ReLU activation
  tf.keras.layers.Dense(20, activation=tf.keras.activations.relu), # hidden layer 3, ReLU activation
  tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid) # output layer, sigmoid activation
])

# Compile the model. The optimizer's step size is passed as `learning_rate`,
# the argument name Keras optimizers accept.
model_1.compile(loss=tf.keras.losses.binary_crossentropy,
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                metrics=['accuracy'], run_eagerly=True)

# Fit the model, reporting loss/accuracy on the hold-out part after every epoch
history = model_1.fit(x_train, y_train, epochs=50,validation_data=(x_test,y_test))


In [None]:
# Layer-by-layer overview of the fitted network.
model_1.summary()

In [None]:

# `op` keeps the full test frame (including PassengerId) for building the
# submissions; `test_df` keeps only the columns the models were trained on.
op = test_df.copy()
test_df = op.drop(columns=["PassengerId","Name","Family Name"])


In [None]:
# Preview the copy of the test frame kept for the submissions.
op.head()

In [None]:
# Predict with the neural network on the test features and show the raw
# outputs of its sigmoid output unit.
pred=model_1.predict(test_df)
pred

In [None]:
# Store the network output next to the passenger data. The prediction array
# is flattened first so that a one-dimensional column is assigned.
op['Survived'] = np.ravel(pred)

op.head()

In [None]:
# Turn the predicted probability into the boolean target: True from 0.5 upwards.
op['Transported'] = op['Survived'] >= 0.5
# Independent two-column frame for the submission file.
op1 = op[["PassengerId","Transported"]].copy()
op1.head()

In [None]:
# Write the neural-network submission (PassengerId, Transported) without the index column.
op1.to_csv("Submissionx.csv",index=False)

In [None]:
#Logistic regression
# Fit a logistic regression on the training part of the split, using the
# lbfgs solver with at most 3000 iterations.
logreg = LogisticRegression( solver='lbfgs', max_iter=3000)
logreg.fit(x_train, y_train)

In [None]:
# Logistic-regression predictions for the hold-out part of the split.
pr_lr=logreg.predict(x_test)
pr_lr

In [None]:
# Confusion matrix and accuracy of the logistic regression on the hold-out part
from sklearn.metrics import confusion_matrix

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
TN, FP, FN, TP = confusion_matrix(y_test, pr_lr).ravel()
for label, count in (
    ('True Positive (TP) :', TP),
    ('false positive (FP) :', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(label, count)

correct, total = TP + TN, TP + FP + TN + FN
accuracy = correct / total

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

In [None]:
# Collect a set of binary classifiers to compare on the same split.
# Every estimator that uses randomness gets a fixed random_state so that the
# comparison gives the same numbers on each run.
models = {}
# Logistic regression

from sklearn.linear_model import LogisticRegression
models['Logistic Regression']=LogisticRegression(solver='lbfgs', max_iter=3000)

# Support vector machines
from sklearn.svm import LinearSVC
models['Support Vector Machines']=LinearSVC(max_iter=3000, random_state=42)

#Decision trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees']=DecisionTreeClassifier(random_state=42)

# Random forest 
from sklearn.ensemble import RandomForestClassifier
models['Random Forest']=RandomForestClassifier(random_state=42)

# Naive bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes']= GaussianNB()

#k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
models['k-neares neighbors']=KNeighborsClassifier()

#Extra tree classifier
from sklearn.ensemble import ExtraTreesClassifier
models['Extra tree Classifier'] = ExtraTreesClassifier(random_state=42)
#Light GBM 
from lightgbm import LGBMClassifier
# n_estimators is the scikit-learn style name for the number of boosting rounds
models['Light GBM Classifier'] = LGBMClassifier(objective='binary',
                     learning_rate=0.01,
                     n_estimators=700,
                     max_depth=7,
                     random_state=42)

In [None]:
## Import metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score

# One score per model name for each metric.
accuracy, precision, recall ={}, {},{}

for key, clf in models.items():
  # Fit the model
  clf.fit(x_train, y_train)

  # Prediction 
  prediction=clf.predict(x_test)

  # Calculate metrics. scikit-learn expects (y_true, y_pred): with the
  # arguments swapped, precision and recall would trade places.
  accuracy[key]=accuracy_score(y_test, prediction)
  precision[key]=precision_score(y_test, prediction)
  recall[key]=recall_score(y_test, prediction)

In [None]:

# Comparison table: one row per model, one column per metric.
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
    },
    index=models.keys(),
)

df_model

In [None]:
#Random forest
# max_features="sqrt" considers sqrt(n_features) candidates per split, which
# is what the former "auto" setting meant for classifiers.
rf = RandomForestClassifier( oob_score = True, n_jobs = -1,random_state =50, max_features = "sqrt", min_samples_leaf = 50)
rf.fit(x_train, y_train)
# Prediction on the hold-out part of the split
prediction=rf.predict(x_test)

In [None]:
# Confusion matrix and accuracy of the random forest on the hold-out part
from sklearn.metrics import confusion_matrix

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
TN, FP, FN, TP = confusion_matrix(y_test, prediction).ravel()
for label, count in (
    ('True Positive (TP) :', TP),
    ('false positive (FP) :', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(label, count)

correct, total = TP + TN, TP + FP + TN + FN
accuracy = correct / total

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

In [None]:
# Random-forest predictions for the competition test data.
pred_rf=rf.predict(test_df)


In [None]:
# Submission frame for the random forest: passenger id plus boolean prediction.
submit = op[['PassengerId']].assign(Transported=pred_rf.astype('bool'))
submit.head()

In [None]:
# Write the current submission frame to disk without the index column.
submit.to_csv("Submission5.csv",index=False)

In [None]:
# How many passengers are predicted as transported / not transported.
d = submit['Transported'].value_counts()
d

In [None]:
#Logistic regression
# Fit a logistic regression on the training part of the split, using the
# lbfgs solver with at most 3000 iterations.
logreg = LogisticRegression( solver='lbfgs', max_iter=3000)
logreg.fit(x_train, y_train)

In [None]:
# Logistic-regression predictions for the hold-out part of the split.
pr_lr=logreg.predict(x_test)
pr_lr

In [None]:
# Confusion matrix and accuracy of the logistic regression on the hold-out part

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
TN, FP, FN, TP = confusion_matrix(y_test, pr_lr).ravel()
for label, count in (
    ('True Positive (TP) :', TP),
    ('false positive (FP) :', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(label, count)

correct, total = TP + TN, TP + FP + TN + FN
accuracy = correct / total

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

In [None]:
# Logistic-regression predictions for the competition test data.
pred_lr=logreg.predict(test_df)


In [None]:
# Submission frame for the logistic regression: passenger id plus boolean prediction.
submit = op[['PassengerId']].assign(Transported=pred_lr.astype('bool'))
submit.head()

In [None]:
# Saved under its own file name so that the random-forest submission written
# earlier to "Submission5.csv" is not overwritten.
submit.to_csv("Submission5_logreg.csv",index=False)

In [None]:
# How many passengers are predicted as transported / not transported.
d = submit['Transported'].value_counts()
d

In [None]:
#Light GBM classifier
# 700 boosting rounds with a small learning rate and trees of depth <= 7.
# n_estimators is the scikit-learn style name for the number of rounds.
lg = LGBMClassifier(objective='binary',
                     learning_rate=0.01,
                     n_estimators=700,
                     max_depth=7)
lg.fit(x_train, y_train)
# Prediction on the hold-out part of the split
pred_lg=lg.predict(x_test)

In [None]:
# Confusion matrix and accuracy of the LightGBM classifier on the hold-out part
from sklearn.metrics import confusion_matrix

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
TN, FP, FN, TP = confusion_matrix(y_test, pred_lg).ravel()
for label, count in (
    ('True Positive (TP) :', TP),
    ('false positive (FP) :', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(label, count)

correct, total = TP + TN, TP + FP + TN + FN
accuracy = correct / total

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

In [None]:
# LightGBM predictions for the competition test data.
lgtest=lg.predict(test_df)


In [None]:
# Submission frame for LightGBM: passenger id plus boolean prediction.
submit6 = op[['PassengerId']].assign(Transported=lgtest.astype('bool'))
submit6.head()

In [None]:
# Write the LightGBM submission to disk without the index column.
submit6.to_csv("Submission6.csv",index=False)