In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
import missingno as msno

warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

# **Exploratory Data Analysis**

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# (rows, columns) of the training data.
train_df.shape

#### **Dataframe general information**

In [None]:
# Summary statistics of the training data.
train_df.describe()

In [None]:
# Per-column overview of the training data.
train_df.info()

#### **Missing values (nan)**

In [None]:
# Number of missing values in each column of the training data.
print(f'Nan values:\n\n{train_df.isna().sum()}')

In [None]:
# Visualise the missing values with missingno's matrix plot.
msno.matrix(train_df)

#### **Check for duplicate data**

In [None]:
# Number of fully duplicated rows in the training data.
print(f'Total duplicate data: {train_df.duplicated().sum()}')

#### **Correlation between every column**

In [None]:
# Correlate only the numeric/boolean columns: the frame also contains string
# columns (e.g. Name, Cabin), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them.
corr = train_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15,8))
sns.heatmap(corr, annot=True)

### **Takeaways**
1. There is not much missing data, and the missing values are spread throughout the dataset
1. There is no strong correlation between any of the columns

# **Feature Engineering**
* Add new columns (Deck and TotalBill)
* Fill nan values

In [None]:
from sklearn.impute import SimpleImputer

class FE:
    """Feature-engineering helper: fills missing values and adds derived columns.

    The wrapped DataFrame is modified in place by every method, so pass a copy
    if the caller's frame must stay untouched.
    """

    # Spending columns: imputed together and summed into ``TotalBill``.
    SPENDING_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Column selection/order returned by ``run_all`` when all of them exist.
    OUTPUT_COLUMNS = ['PassengerId', 'Name', 'Age', 'HomePlanet', 'Destination', 'CryoSleep',
                      'Cabin', 'Deck', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
                      'Spa', 'VRDeck', 'TotalBill', 'Transported']

    def __init__(self, df):
        """Store the DataFrame that the other methods operate on."""
        self.df = df

    def add_columns(self):
        """Add the ``Deck`` and ``TotalBill`` columns and return the frame."""
        # Deck is the first character of the cabin string.
        self.df['Deck'] = self.df.Cabin.apply(lambda x: str(x)[0])

        # TotalBill covers every spending column (FoodCourt was previously
        # left out). skipna=False keeps the NaN propagation of a plain `+` chain.
        self.df['TotalBill'] = self.df[self.SPENDING_COLUMNS].sum(axis=1, skipna=False)
        return self.df

    def fill_na_object(self):
        """Fill missing values in object columns with each column's most frequent value."""
        for column in self.df.select_dtypes(include='object').columns:
            counts = self.df[column].value_counts()
            if counts.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            # Assign back instead of `fillna(inplace=True)` on a selection,
            # which is chained assignment and does not reliably update the frame.
            self.df[column] = self.df[column].fillna(counts.index[0])
        return self.df

    def fill_na_int(self):
        """Fill missing ``Age`` with its mean and impute the spending columns."""
        self.df['Age'] = self.df['Age'].fillna(self.df['Age'].mean())

        # SimpleImputer with default settings.
        imputer = SimpleImputer()
        self.df[self.SPENDING_COLUMNS] = imputer.fit_transform(self.df[self.SPENDING_COLUMNS])

        return self.df

    def run_all(self):
        """Run every step and return the cleaned frame.

        Returns the ``OUTPUT_COLUMNS`` selection when all of those columns are
        present; otherwise (e.g. a frame without ``Transported``) the whole
        frame is returned unchanged in column order.
        """
        self.fill_na_object()
        self.fill_na_int()
        self.add_columns()
        # Explicit membership check instead of a bare `except:` that would
        # also hide unrelated errors.
        if all(column in self.df.columns for column in self.OUTPUT_COLUMNS):
            return self.df[self.OUTPUT_COLUMNS]
        return self.df
        
# Run the whole feature-engineering pipeline on a copy of the training data.
fe = FE(train_df.copy())
cleaned_df = fe.run_all()

In [None]:
# Re-check the number of missing values per column after cleaning.
print(f'Nan values:\n\n{cleaned_df.isna().sum()}')

# **Visualization**

### **HomePlanet, CryoSleep, Destination, Target Distribution**

In [None]:
def _labels_and_counts(column):
    """Return the (labels, counts) pair of a column's value_counts()."""
    frequencies = column.value_counts()
    return frequencies.index, frequencies.values


# Labels and their frequencies for each categorical column plotted below.
home_planets, home_planets_count = _labels_and_counts(cleaned_df.HomePlanet)

destinations, destinations_count = _labels_and_counts(cleaned_df.Destination)

decks, decks_count = _labels_and_counts(cleaned_df.Deck)

vip, vip_count = _labels_and_counts(cleaned_df.VIP)

cryo_sleep, cryo_sleep_count = _labels_and_counts(cleaned_df.CryoSleep)

transported, transported_count = _labels_and_counts(cleaned_df.Transported)

In [None]:
def _draw_pie(axis, counts, labels, title, colors, explode_index=0, startangle=90):
    """Draw one labelled pie chart on ``axis`` with a single wedge pulled out.

    The explode list is derived from ``len(counts)`` so that a different number
    of categories cannot cause a length-mismatch error in ``Axes.pie``.
    """
    explode = [0.1 if position == explode_index else 0 for position in range(len(counts))]
    axis.pie(counts, labels=labels, startangle=startangle, autopct='%1.1f%%', shadow=True,
             wedgeprops={'edgecolor': 'black'}, explode=explode, colors=colors)
    axis.set_title(title, fontsize=16)


fig, ax = plt.subplots(ncols=3, nrows=2, figsize=(20,10))

two_colors = ['#d8f9d8', '#f88888']
three_colors = ['#d8f9d8', '#f88888', '#aee']
deck_colors = ['#ececff', '#beb', '#d8f9d8', '#f88888', '#ffdddd', '#aee', '#cff4f4']

# Top row: home planet, destination and deck distributions.
_draw_pie(ax[0, 0], home_planets_count, home_planets, 'Home Planets', three_colors, explode_index=0)
_draw_pie(ax[0, 1], destinations_count, destinations, 'Destinations', three_colors, explode_index=0)
_draw_pie(ax[0, 2], decks_count, decks, 'Decks', deck_colors, explode_index=1, startangle=220)

# Bottom row: the three boolean columns, with the second wedge pulled out.
_draw_pie(ax[1, 0], vip_count, vip, 'VIP', two_colors, explode_index=1)
_draw_pie(ax[1, 1], cryo_sleep_count, cryo_sleep, 'CryoSleep', two_colors, explode_index=1)
_draw_pie(ax[1, 2], transported_count, transported, 'Transported', two_colors, explode_index=1)


plt.tight_layout()

### **Age and TotalBill Distributions in Three Different Planets**

In [None]:
# Age histogram (with a marginal box plot), coloured by home planet.
fig = px.histogram(cleaned_df, x='Age', marginal='box', title='Age Distribution', color='HomePlanet')
fig.show()

# TotalBill box plot, one box per home planet.
fig = px.box(cleaned_df, x='TotalBill', title='TotalBill Distribution', color='HomePlanet')
fig.show()

### **Median Spending and Age on Three Different Planets**

In [None]:
# Median TotalBill and Age per home planet, computed with a single groupby.
median_by_planet = cleaned_df.groupby('HomePlanet')[['TotalBill', 'Age']].median()

planets_bills = median_by_planet.index.tolist()
average_bills = median_by_planet['TotalBill'].tolist()

planets_age = median_by_planet.index.tolist()
average_age = median_by_planet['Age'].tolist()


# The plotted statistic is the median, so the subplot titles say so.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Median Spending', 'Median Age'))
fig.add_trace(go.Bar(x=planets_bills, y=average_bills), row=1, col=1)
fig.add_trace(go.Bar(x=planets_age, y=average_age), row=1, col=2)
# Show explicitly, like the other plotly cells, instead of relying on the
# cell's last expression being rendered.
fig.show()

### **Age and TotalBill Distributions Grouped by Transported**

In [None]:
# Age box plot, split by the Transported target.
fig = px.box(cleaned_df, x='Age', color='Transported', title='Age Distribution')
fig.show()

# TotalBill box plot, split by the Transported target.
fig = px.box(cleaned_df, x='TotalBill', color='Transported', title='Total Bill Distribution')
fig.show()

### **Takeaways**
1. More than half of the people on board are from Earth
2. Fewer than three quarters of the people on board are going to TRAPPIST-1e
3. The majority of the people are on the F and G decks
4. Only 2.3% of the people paid for the VIP trip
5. Passengers from Mars are more likely to spend more money on the ship's facilities
6. Passengers who spent little money on the facilities have a higher probability of being transported

# **Preprocessing**

In [None]:
# Columns used as model inputs (X) and the target column (y).
features = ['Age', 'HomePlanet', 'Destination', 'CryoSleep', 'Deck', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalBill']
X = cleaned_df[features]
y = cleaned_df.Transported

#### **Categorical Encoding (One-Hot)**

In [None]:
# from sklearn.preprocessing import LabelEncoder

# cat_columns = ['HomePlanet', 'Destination', 'CryoSleep', 'Deck', 'VIP']
# for col in cat_columns:
#     encoder = LabelEncoder()
#     X[col] = encoder.fit_transform(X[col])

In [None]:
# One-hot encode the categorical features. The encoded frame is rebuilt from
# cleaned_df[features] on every run, so re-executing this cell is safe and no
# exception has to be silenced with a bare `except: pass`.
X = pd.get_dummies(cleaned_df[features], columns=['HomePlanet', 'Destination', 'CryoSleep', 'Deck', 'VIP'])
X.head()

#### **Data Splitting**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=1)

print(f'Train size: {train_x.shape}')
print(f'Test size: {test_x.shape}')

#### **Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the held-out split.
scaler = StandardScaler()
scaled_train_x = scaler.fit_transform(train_x)
scaled_test_x = scaler.transform(test_x)

# **Data Training**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
class Model:
    """Fit a classifier on the scaled training split and record its scores.

    Results are accumulated in the class-level lists below so that all
    evaluated models can be compared in one table afterwards.
    """

    # Parallel lists: entry i of each list belongs to the same evaluated model.
    model_names = list()
    accuracy_scores = list()
    cv_scores = list()

    def __init__(self, model, name):
        """Remember the estimator and the display name used in the score lists."""
        self.model = model
        # The name is recorded together with the scores in predict(), so the
        # three class-level lists can never get out of step.
        self.name = name

    def predict(self):
        """Fit the model, print its held-out metrics and record its scores."""
        self.model.fit(scaled_train_x, train_y)
        prediction = self.model.predict(scaled_test_x)

        # scikit-learn metrics take (y_true, y_pred); the swapped order would
        # exchange precision and recall in the classification report.
        acc = accuracy_score(test_y, prediction)
        cv = cross_val_score(self.model, scaled_train_x, train_y, cv=5)
        self._record(acc, np.mean(cv))

        print(f'Accuracy Score: {acc}')
        print(f'Classification Report: {classification_report(test_y, prediction)}')

    def _record(self, acc, cv_mean):
        """Store this model's scores, replacing an earlier entry with the same name."""
        if self.name in Model.model_names:
            # Re-running a model cell must not leave duplicate rows behind.
            position = Model.model_names.index(self.name)
            del Model.model_names[position]
            del Model.accuracy_scores[position]
            del Model.cv_scores[position]
        Model.model_names.append(self.name)
        Model.accuracy_scores.append(acc)
        Model.cv_scores.append(cv_mean)

#### **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Evaluate LogisticRegression with its default settings.
model = Model(LogisticRegression(), 'Logistic Regression')
model.predict()

#### **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Evaluate KNeighborsClassifier with its default settings.
model = Model(KNeighborsClassifier(), 'K-NN')
model.predict()

#### **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Evaluate a random forest. random_state is fixed (same seed as the
# train/test split) so the reported scores are reproducible across runs.
model = Model(RandomForestClassifier(random_state=1), 'Random Forest')
model.predict()

#### **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Evaluate a decision tree. random_state is fixed (same seed as the
# train/test split) so the reported scores are reproducible across runs.
model = Model(DecisionTreeClassifier(random_state=1), 'Decision Tree')
model.predict()

#### **Support Vector Classifier**

In [None]:
from sklearn.svm import SVC

# Evaluate SVC with its default settings.
model = Model(SVC(), 'SVC')
model.predict()

#### **Naive Bayesian**

In [None]:
from sklearn.naive_bayes import GaussianNB

# Evaluate GaussianNB with its default settings.
model = Model(GaussianNB(), 'GaussianNB')
model.predict()

# **Models Evaluation**

In [None]:
# One row per evaluated model, ordered by held-out accuracy (best first).
scores_df = pd.DataFrame(
    {'Model': Model.model_names, 'Accuracy': Model.accuracy_scores, 'CV': Model.cv_scores}
).sort_values(by='Accuracy', ascending=False)

scores_df

# **Hyperparameter Tuning**
I will use Logistic Regression since it achieves the highest accuracy

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search LogisticRegression over C and the solver (L2 penalty only),
# using 5-fold cross-validation on the scaled training split.
log = LogisticRegression()
params = {'C': [100, 10, 1.0, 0.1, 0.01], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear']}
best_model = GridSearchCV(log, params, cv=5)
best_model.fit(scaled_train_x, train_y)

# Score the search result on the held-out split and show the selected parameters.
print(f'Score After Hyperparameter Tuning: {best_model.score(scaled_test_x, test_y)}')
print(f'Best Parameters: {best_model.best_params_}')

# **Using Test Dataset**
This repeats the same preprocessing steps used for the training dataset, now applied to the test dataset

In [None]:
# Apply the same feature engineering and one-hot encoding to the test data.
fe = FE(test_df.copy())
cleaned_test = fe.run_all()[features]

cleaned_test = pd.get_dummies(cleaned_test, columns=['HomePlanet', 'Destination', 'CryoSleep', 'Deck', 'VIP'])

# Align with the training columns: a category that is missing from (or only
# present in) the test data would otherwise change the number or order of
# features the model receives.
cleaned_test = cleaned_test.reindex(columns=train_x.columns, fill_value=0)

# Reuse the scaler fitted on the training split. Fitting a new scaler on the
# test data would put the test features on a different scale than the one the
# model was trained on.
scaled_test = scaler.transform(cleaned_test)

log = LogisticRegression()
params = {'C': [100], 'penalty': ['l2'], 'solver': ['newton-cg']}
best_model = GridSearchCV(log, params, cv=5)
best_model.fit(scaled_train_x, train_y)

final_result = best_model.predict(scaled_test)

# **Submission**

In [None]:
# Pair each test PassengerId with its predicted Transported value.
output = pd.DataFrame({'PassengerId':test_df.PassengerId, 'Transported':final_result})
output

In [None]:
# Write the predictions to submission.csv without the index column.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

### Thanks for reading my notebook :D
### Any feedback or advice would be invaluable :D
### Good luck :)