# 1. Problem Statement

In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.

<b>File and Data Field Descriptions</b>

<b>train.csv</b> - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

<b>PassengerId</b> - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

<b>HomePlanet</b> - The planet the passenger departed from, typically their planet of permanent residence.

<b>CryoSleep</b> - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

<b>Cabin</b> - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

<b>Destination</b> - The planet the passenger will be debarking to.

<b>Age</b> - The age of the passenger.

<b>VIP</b> - Whether the passenger has paid for special VIP service during the voyage.

<b>RoomService, FoodCourt, ShoppingMall, Spa, VRDeck</b> - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

<b>Name</b> - The first and last names of the passenger.

<b>Transported</b> - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

<b>test.csv</b> - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

<b>sample_submission.csv</b> - A submission file in the correct format.

<b>PassengerId</b> - Id for each passenger in the test set.

<b>Transported</b> - The target. For each passenger, predict either True or False.

# 2. Importing Dataset and Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the competition's training and test splits from the Kaggle input directory.
train_csv_path = "/kaggle/input/spaceship-titanic/train.csv"
test_csv_path = "/kaggle/input/spaceship-titanic/test.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the training data (head() shows 5 rows by default).
df_train.head()

In [None]:
# Preview the first rows of the test data (head() shows 5 rows by default).
df_test.head()

In [None]:
# Disabled: dropping the identifier columns this early. PassengerId and Name are
# still present when the feature-engineering cell further down reads PassengerId.
#df_train=df_train.drop(['PassengerId','Name'],axis=1)
# Print the column dtypes and non-null counts of the training set.
df_train.info()

In [None]:
# Print the column dtypes and non-null counts of the test set.
df_test.info()

# 3. Data Cleaning and preprocessing

In [None]:
# Count the missing (NaN) values in each column of the training dataset.
df_train.isna().sum()

In [None]:
# Plot, for every training column, the proportion of missing vs. present values.
# isna() yields a boolean frame; melt() reshapes it to long form with one
# (variable, missing) pair per cell, which displot then stacks to 100% per column.
#
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: such a call would only leave behind an extra,
# empty figure in the notebook output.
sns.displot(
    data=df_train.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=1.25,
)

In [None]:
# Split the columns into numerical and non-numerical (categorical/boolean) groups.
# The non-numerical list is computed separately for the test set because its
# columns may differ from the training set's (e.g. the target column).
num_features = list(df_train.select_dtypes('number').columns)
categ_features = list(df_train.select_dtypes(exclude=['number']).columns)
test_categ_features = list(df_test.select_dtypes(exclude=['number']).columns)

# Replace missing values in each numerical column with that column's mean
# (each dataset uses its own mean).
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is a chained in-place update,
# which is deprecated and leaves the DataFrame unchanged under pandas
# Copy-on-Write.
for col in num_features:
    df_train[col] = df_train[col].fillna(df_train[col].mean())
    df_test[col] = df_test[col].fillna(df_test[col].mean())

# Replace missing values in each non-numerical column with its most frequent
# value (value_counts() sorts by descending frequency, so index[0] is the mode).
for col in categ_features:
    df_train[col] = df_train[col].fillna(df_train[col].value_counts().index[0])
for col in test_categ_features:
    df_test[col] = df_test[col].fillna(df_test[col].value_counts().index[0])

In [None]:
# Re-count the missing values per training column, now that imputation has run.
df_train.isna().sum()


In [None]:
# Count the missing values per column of df_test after imputation.
df_test.isna().sum()

In [None]:
# Report how many distinct values each non-numerical training column holds.
for feature in categ_features:
    distinct_count = len(df_train[feature].unique())
    print(f"{feature}  :  {distinct_count} ")

In [None]:
# Create the engineered features on both the training and the test dataset.
# Both frames get exactly the same treatment, so the logic is written once.
for frame in (df_train, df_test):
    # Cabin has the form deck/num/side; keep the deck and the side as features.
    cabin_parts = frame["Cabin"].astype(str).str.split("/")
    frame["Deck"] = cabin_parts.str[0]
    frame["side"] = cabin_parts.str[2]

    # PassengerId has the form gggg_pp: gggg identifies the travel group and pp
    # is the passenger's number within that group.
    id_parts = frame["PassengerId"].str.split("_")
    frame["GroupId"] = id_parts.str[0]
    frame["GroupIdNumber"] = id_parts.str[1]

    # InGroup is True when the passenger's GroupId occurs more than once in the
    # same dataset, i.e. the passenger is not travelling alone.
    # duplicated(keep=False) flags every member of a repeated group in a single
    # vectorised pass, instead of scanning an array of ids once per row.
    frame["InGroup"] = frame["GroupId"].duplicated(keep=False)

In [None]:
# Identifier / helper columns that are removed from both datasets.
unused_columns = ['PassengerId', 'Cabin', 'Name', 'GroupId', 'GroupIdNumber']

# Keep the test PassengerIds before the column is dropped; they are needed
# again when the submission file is written.
Id_test_list = df_test["PassengerId"].to_list()

# Remove the columns in place from df_train and df_test.
for frame in (df_train, df_test):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# Inspect the columns and dtypes left in the training set after the drops above.
df_train.info()

In [None]:
# Show the distinct values (and how many there are) of every non-numerical
# column that remains after the unnecessary columns were removed.
non_numeric_columns = df_train.select_dtypes(exclude=['number']).columns
for column in non_numeric_columns:
    distinct_values = df_train[column].unique()
    print(f'{column} : {distinct_values} : {len(distinct_values)}')

In [None]:
# Summary statistics of the training set via describe().
# Disabled alternative: print describe() and the unique-value count per numerical column.
#for col in df_train.select_dtypes(['number']):
#    print(f'{col} : {df_train[col].describe()} : {len(df_train[col].unique())}')
df_train.describe()

In [None]:
# One box plot per plottable column, arranged on a 7x2 grid, to spot outliers.
# The trailing semicolon suppresses the textual repr of the returned axes.
df_train.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20));


In [None]:
# Annotated correlation heatmap of the numerical training columns.
numeric_corr = df_train.select_dtypes('number').corr()

plt.figure(figsize=(10, 5))
heatmap_ax = sns.heatmap(numeric_corr, cmap='BuPu', annot=True)
heatmap_ax.set_title('Correlation', fontsize=20)
plt.show()

In [None]:
# Take a copy of the cleaned training data; the modelling cells build X and y from `df`.
df=df_train.copy()


In [None]:
# Display the cleaned training DataFrame.
df_train

# 4. Split Dataset into Training and Testing Set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# One-hot encode the training features (dropping the first level of every
# categorical column) and separate the target.
X = pd.get_dummies(df.drop(['Transported'], axis=1), drop_first=True)
y = df['Transported']

# Encode the test set and align it to the training feature columns.
# Encoding the two sets independently with drop_first=True is fragile: if a
# category is missing from (or only present in) one set, the resulting columns
# differ in number, order or meaning, and the scaler/models fitted on X would
# fail or silently misread the test data. Encoding every level of the test set
# and then reindexing on X.columns keeps exactly the training columns, in
# training order, and fills any category unseen in the test set with 0.
df_test = pd.get_dummies(df_test).reindex(columns=X.columns, fill_value=0)

# Hold out 22% of the training data for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to the training split, the validation
# split and the competition test set.
scaler = StandardScaler().fit(X_train)
feature_names = X.columns

X_train = pd.DataFrame(scaler.transform(X_train), columns=feature_names)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)
df_test = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)

In [None]:
"""
from sklearn.decomposition import PCA

pca = PCA(n_components=18)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
df_test = pca.fit_transform(df_test)"""

In [None]:
#explained_variance = pca.explained_variance_ratio_
#explained_variance

# 5. Machine Learning Model Pipeline

In [None]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=DeprecationWarning)

# Candidate classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Models to compare, in evaluation order. The XGBoost classifier is the last
# entry (index 6).
model_pipeline = [
    LogisticRegression(solver='liblinear'),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    XGBClassifier(
        objective='binary:logistic',
        booster='gbtree',
        eval_metric='auc',
        tree_method='hist',
        grow_policy='lossguide',
        learning_rate=0.05,
        max_depth=5,
        n_estimators=180,
    ),
]


In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Display names, listed in the same order as the estimators in model_pipeline.
model_list=['Logistic Regression','KNN','Decision Tree','Random Forest','GradientBoost','AdaBoost','XGB']
acc_list=[]
auc_list=[]
cm_list=[]

# Fit every candidate on the training split and score it on the validation split.
for model in model_pipeline:
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    # ROC AUC has to be computed from continuous scores, not from hard class
    # labels: with 0/1 predictions the ROC curve has a single operating point
    # and the "AUC" is not a real area under the curve. Column 1 of
    # predict_proba is the probability of the positive class (Transported).
    y_score=model.predict_proba(X_test)[:,1]
    acc_list.append(metrics.accuracy_score(y_test,y_pred))
    fpr,tpr,_thresholds=metrics.roc_curve(y_test,y_score)
    auc_list.append(round(metrics.auc(fpr,tpr),2))
    cm_list.append(confusion_matrix(y_test,y_pred))

# Summary table, best accuracy first.
result_df=pd.DataFrame({'Model':model_list,'Accuracy':acc_list,'AUC':auc_list})
result_df.sort_values('Accuracy',ascending=False)

%%time

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

my_model=model_pipeline[6]
params = {
 'max_depth': range (2, 10, 1),
 'n_estimators': range(60, 220, 40),
 'learning_rate': [0.01, 0.10, 0.05]
}

grid_search = GridSearchCV(estimator=my_model,
                           param_grid=params,                          
                           n_jobs=10,
                           cv=10,
                           verbose=True)
grid_search.fit(X_train,y_train)

predictions_1 = grid_search.predict(X_test)

score_1 = grid_search.score(X_test, y_test)
mae_1 = mean_squared_error(y_test, predictions_1)
rmse_1 = np.sqrt(mae_1)

print('Model Accuracy : ', score_1)
print('Mean Squared Error : ', mae_1)
print('RMSE : ', rmse_1)

print(grid_search.best_params_)

In [None]:
# Predict the competition test set with the XGBoost model (entry 6 of
# model_pipeline) and convert its 0/1 outputs to the False/True labels used in
# the submission file.
# Alternatives kept for reference:
#pred=pd.Series(model_pipeline[0].predict(df_test))
#pred=pd.Series(grid_search.predict(df_test)).map({0:False, 1:True})
raw_predictions = model_pipeline[6].predict(df_test)
pred = pd.Series(raw_predictions).map({0:False, 1:True})

# Pair each saved test PassengerId with its prediction and write the CSV.
submission = pd.DataFrame({'PassengerId': Id_test_list, 'Transported': pred})
submission.to_csv("submission.csv", index=False)

# Show how many passengers were predicted for each class.
submission["Transported"].value_counts()