 # **INTRODUCTION**
> In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.
*In this notebook I have used Logistic Regression and Random Forest Classification methods to predict which passengers are transported to another world.*

# **IMPORTING DATA**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files: labelled training rows, the rows to predict,
# and the sample submission file.
space_train=pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
space_test=pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
sample=pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")

# **DATA EXPLORATION**

In [None]:
space_train.shape

In [None]:
space_test.shape

In [None]:
space_train.head()

In [None]:
space_test.head()

In [None]:
space_train.info()

In [None]:
space_train.describe()

In [None]:
# Fraction of rows carrying a distinct PassengerId (1.0 means no duplicated ids).
space_train["PassengerId"].nunique() / len(space_train)

In [None]:
space_train.Transported.value_counts()

In [None]:
space_train.isnull().sum()

In [None]:
space_test.isnull().sum()

In [None]:
# Replacing the null values
# Each filled column is assigned back to the frame: `fillna(method=..., inplace=True)`
# on a selected column relies on a deprecated keyword and on chained in-place
# mutation, which does not update the frame under pandas Copy-on-Write.
train_ffill_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP", "Name"]
train_mean_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Categorical / text columns: carry the previous row's value forward. The trailing
# bfill only has an effect when the first rows are missing (ffill cannot fill those).
for col in train_ffill_cols:
    space_train[col] = space_train[col].ffill().bfill()

# Numeric columns: replace gaps with the column mean.
for col in train_mean_cols:
    space_train[col] = space_train[col].fillna(space_train[col].mean())

In [None]:
# Rechecking after eliminating null values
space_train.isnull().sum()

In [None]:
# Replacing the null values
# Each filled column is assigned back to the frame: `fillna(method=..., inplace=True)`
# on a selected column relies on a deprecated keyword and on chained in-place
# mutation, which does not update the frame under pandas Copy-on-Write.
test_ffill_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP", "Name"]
test_mean_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Categorical / text columns: carry the previous row's value forward. The trailing
# bfill only has an effect when the first rows are missing (ffill cannot fill those).
for col in test_ffill_cols:
    space_test[col] = space_test[col].ffill().bfill()

# Numeric columns: replace gaps with the column mean of the test frame itself.
for col in test_mean_cols:
    space_test[col] = space_test[col].fillna(space_test[col].mean())

In [None]:
# Rechecking after eliminating null values
space_test.isnull().sum()

# **DATA VISUALIZATION**

In [None]:
# Pie chart of the target: share of passengers per Transported value.
plt.figure(figsize=(7, 7))
plt.title("Distribution of Transported Passengers")
transported_counts = space_train["Transported"].value_counts()
transported_counts.plot.pie(autopct='%1.2f%%')

There is not much difference between the numbers of transported and non-transported passengers. Hence, the target classes are balanced. 

In [None]:
# Bar chart relating HomePlanet to Transported (seaborn's default bar aggregate is the mean).
plt.figure(figsize=(7, 7))
sns.barplot(
    data=space_train,
    x="Transported",
    y="HomePlanet",
)
plt.title("Transported successfully from Home Planet")

Passengers from planet Europa have the highest transported rate.

In [None]:
# Pie chart of the CryoSleep flag across all training passengers.
plt.figure(figsize=(7, 7))
plt.title(" Passengers Confined to Cabins")
cryosleep_counts = space_train["CryoSleep"].value_counts()
cryosleep_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# Bar chart relating the CryoSleep flag to the Transported target.
plt.figure(figsize=(7, 7))
sns.barplot(
    data=space_train,
    x="Transported",
    y="CryoSleep",
)
plt.title("Confined Passengers Transported")

Most of the confined (CryoSleep) passengers were transported.

In [None]:
# Pie chart of how the training passengers are split across destinations.
plt.figure(figsize=(7, 7))
destination_counts = space_train["Destination"].value_counts()
destination_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# Bar chart relating Destination to Transported (seaborn's default bar aggregate is the mean).
plt.figure(figsize=(7, 7))
sns.barplot(
    data=space_train,
    x="Transported",
    y="Destination",
)

Passengers travelling to 55 Cancri e have the highest transported rate.

In [None]:
# Side is the third "/"-separated field of Cabin: Port (P) or Starboard (S)
cabin_parts = space_train["Cabin"].str.split("/")
space_train["Side"] = cabin_parts.str[2]
plt.figure(figsize=(7, 7))
side_counts = space_train["Side"].value_counts()
side_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(x="Side", y="Transported", kind="bar", palette="mako", height=7, data=space_train)

In [None]:
# Deck is the first "/"-separated field of Cabin; show how passengers spread over decks.
cabin_fields = space_train["Cabin"].str.split("/")
space_train["Deck"] = cabin_fields.str[0]
plt.figure(figsize=(7, 7))
deck_counts = space_train["Deck"].value_counts()
deck_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(x="Transported", y="Deck", kind="bar", palette="ch:.25", height=7, data=space_train)

In [None]:
# Transported rate per home planet, split by destination.
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(
    x="HomePlanet", y="Transported", hue="Destination",
    kind="bar", palette="pastel", height=10, data=space_train,
)

In [None]:
# Age distribution per home planet, split by the Transported target.
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(
    x="HomePlanet", y="Age", hue="Transported",
    kind="box", palette="viridis", height=10, data=space_train,
)

In [None]:
# Pie chart of the VIP flag across all training passengers.
plt.figure(figsize=(7, 7))
vip_counts = space_train["VIP"].value_counts()
vip_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# VIP share per destination, split by the Transported target.
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(
    x="Destination", y="VIP", hue="Transported",
    kind="point", palette="Spectral", height=10, data=space_train,
)

In [None]:
# Total on-board spending: the sum of the five amenity billing columns.
space_train["Expenses"] = (
    space_train["RoomService"]
    + space_train["FoodCourt"]
    + space_train["ShoppingMall"]
    + space_train["Spa"]
    + space_train["VRDeck"]
)
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(
    x="VIP", y="Expenses", hue="Transported",
    kind="bar", palette="icefire", height=10, data=space_train,
)

In [None]:
# Mean expenses per home planet, split by the Transported target.
# catplot is figure-level: it creates its own figure, so the size is passed via
# `height` (a preceding plt.figure() would only emit an extra, empty figure).
sns.catplot(
    x="HomePlanet", y="Expenses", hue="Transported",
    kind="bar", palette="coolwarm", height=10, data=space_train,
)

In [None]:
# Age against total expenses, restricted to the passengers that were transported.
transported_only = space_train[space_train["Transported"] == True]
sns.scatterplot(data=transported_only, x='Age', y='Expenses')

In [None]:
# Mean total expenses for each cabin side.
plt.figure(figsize=(7, 7))
sns.barplot(
    data=space_train,
    x="Side",
    y="Expenses",
    palette="ch:s=-.2,r=.6",
)

# **DATA ENCODING**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every training column that holds exactly two distinct values.
label_encoder = LabelEncoder()
train_binary_cols = [
    col for col in space_train.columns
    if len(space_train[col].unique()) == 2
]
for col in train_binary_cols:
    # Show the raw values before they are replaced by their integer codes.
    print(col, space_train[col].unique())
    space_train[col] = label_encoder.fit_transform(space_train[col])
    

In [None]:
# Remove the text columns and the helper columns created for plotting;
# they are not used as model features.
space_train.drop(
    columns=["HomePlanet", "Cabin", "Destination", "Name", "Side", "Expenses", "Deck"],
    inplace=True,
)

In [None]:
space_train.head()

In [None]:
#Check Variables after Encoding
# List each low-cardinality column (fewer than 10 distinct values) with its values.
train_low_cardinality = [
    col for col in space_train.columns
    if len(space_train[col].unique()) < 10
]
[[col, space_train[col].unique()] for col in train_low_cardinality]

In [None]:
# Remove the text columns from the test frame; they are not used as model features.
space_test.drop(
    columns=["HomePlanet", "Cabin", "Destination", "Name"],
    inplace=True,
)

In [None]:
space_test.head()

In [None]:
# Integer-encode every test column that holds exactly two distinct values.
label_encoder = LabelEncoder()
test_binary_cols = [
    col for col in space_test.columns
    if len(space_test[col].unique()) == 2
]
for col in test_binary_cols:
    # Show the raw values before they are replaced by their integer codes.
    print(col, space_test[col].unique())
    space_test[col] = label_encoder.fit_transform(space_test[col])

In [None]:
#Check Variables after Encoding
# List each low-cardinality column (fewer than 10 distinct values) with its values.
test_low_cardinality = [
    col for col in space_test.columns
    if len(space_test[col].unique()) < 10
]
[[col, space_test[col].unique()] for col in test_low_cardinality]

In [None]:
space_train.head()

# **SPLITTING DATA**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , classification_report,accuracy_score
from sklearn.metrics import roc_curve , auc
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
## Defining variables X,y
# Target: the encoded Transported column of the training frame.
Y_train = space_train["Transported"]
# Features: everything except the target and the passenger identifier.
x_train = space_train.drop(columns=["Transported", "PassengerId"])
x_test = space_test.drop(columns=["PassengerId"])

In [None]:
# Hold out 25% of the labelled training rows for evaluation; random_state is fixed
# so the split is repeatable.
# Note: X_test / y_test are this hold-out split, not the competition test features (x_test).
X_train, X_test, y_train, y_test = train_test_split(x_train, Y_train, test_size = 0.25, random_state = 0)

In [None]:
# Size of the training split: feature matrix shape and number of labels.
print("shape of X_train:", X_train.shape)
print("shape of y_train:", len(y_train))

In [None]:
## Scaling the data
# The scaler is fitted on the training split only and then applied to the hold-out
# split. Each scaled array is wrapped back into a DataFrame using that split's own
# column labels and row index, so the frames stay aligned with y_train / y_test and
# do not depend on the unrelated competition frame x_test.
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

# **LOGISTIC REGRESSION**

In [None]:
## Training the model using Logistic Regression
# LogisticRegression is created with its default hyperparameters and fitted on the
# standardized training split.
model=LogisticRegression()
model.fit(X_train,y_train)

In [None]:
## Predicting data
y_pred=model.predict(X_test)

In [None]:
## classification report
print(classification_report(y_test,y_pred))

In [None]:
## confusion matrix (rows = true class, columns = predicted class)
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion matrix is \n", conf_mat)

## plotting confusion matrix
plt.figure(figsize=(7, 7))
plt.title("Logistic Regression CM")
sns.heatmap(conf_mat, square=True, cmap="BuPu", annot=True, fmt='d')
# The heatmap draws matrix columns along x and matrix rows along y,
# so x carries the predictions and y the true labels.
plt.xlabel('predicted label')
plt.ylabel('true label')

 **CONCLUSION**
   *    correct predictions (TP + TN) : 1698
   *    incorrect predictions (FP + FN): 476

In [None]:
## accuracy score on the hold-out split, computed once and reported twice
accuracy_lr = accuracy_score(y_test, y_pred)
print("accuracy score : ", accuracy_lr)

## percentage accurate (rounded to a whole percent)
print("accuracy:", round(100 * accuracy_lr), "%")

In [None]:
## Prediction using predict_proba 

y_pred_proba = model.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the logistic regression on the hold-out split, with its AUC in the legend.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Axis decoration first, then the curves.
plt.title('Logistic Regression ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# The dashed diagonal is the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend(loc='best')

# **RANDOM FOREST CLASSIFICATION**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Apply RandomForest Algorithm
# Trees are capped at depth 20 and random_state is fixed so repeated runs build the
# same forest. Fitted on the same standardized training split as the logistic regression.
random_classifier= RandomForestClassifier(max_depth=20,random_state=2)
random_classifier.fit(X_train,y_train)

In [None]:
y_pred_rnd= random_classifier.predict(X_test)

In [None]:
#Classification Report
print(classification_report(y_test,y_pred_rnd))

In [None]:
## confusion matrix (rows = true class, columns = predicted class)
conf_mat = confusion_matrix(y_test, y_pred_rnd)
print("Confusion matrix is \n", conf_mat)

## plotting confusion matrix
plt.figure(figsize=(7, 7))
# This matrix belongs to the random forest, so the title names that model.
plt.title("Random Forest CM")
sns.heatmap(conf_mat, square=True, cmap="BuPu", annot=True, fmt='d')
# The heatmap draws matrix columns along x and matrix rows along y,
# so x carries the predictions and y the true labels.
plt.xlabel('predicted label')
plt.ylabel('true label')

 **CONCLUSION**
   *    correct predictions (TP + TN) : 1691
   *    incorrect predictions (FP + FN): 483

In [None]:
## accuracy score on the hold-out split, computed once and reported twice
accuracy_rnd = accuracy_score(y_test, y_pred_rnd)
print("accuracy score : ", accuracy_rnd)

## percentage accurate (rounded to a whole percent)
print("accuracy:", round(100 * accuracy_rnd), "%")

In [None]:
y_pred_proba_rnd=random_classifier.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the random forest on the hold-out split, with its AUC in the legend.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_rnd)
roc_auc = auc(fpr, tpr)

# Axis decoration first, then the curves.
plt.title('Random Forest ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# The dashed diagonal is the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend(loc='best')

# **MODEL SELECTION**
> Both models reach the same accuracy once it is rounded to a whole percent, so either of them could be used.
Here, I am using Logistic Regression because it makes more correct predictions on the hold-out split (1698 vs. 1691) than Random Forest Classification. 

In [None]:
# The model was trained on standardized features, so the competition test features
# must pass through the same fitted scaler before prediction; feeding the raw
# x_test values would put them on a different scale than the training data.
x_test_scaled = pd.DataFrame(sc.transform(x_test), columns=x_test.columns, index=x_test.index)
y_final = model.predict(x_test_scaled)

In [None]:
y_final

# **SUBMISSION**

In [None]:
# Submission frame: passenger ids next to the predictions, cast from 0/1 back to booleans.
sub = pd.DataFrame(
    {
        'PassengerId': space_test["PassengerId"],
        "Transported": y_final.astype(bool),
    }
)

In [None]:
# Write the submission without the index column, then read the file back to display it.
sub.to_csv('submission.csv',index=False)
pd.read_csv('submission.csv')