In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

import os
# Print the full path of every file found beneath the Kaggle input directory.
# The generator is lazy, so paths are printed while the directory tree is walked.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# What are we doing and what is the point?


With almost 13,000 passengers on board, the interstellar passenger liner Spaceship Titanic set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud. Sadly, it met a similar fate as its namesake from 1000 years before. Though the ship stayed intact, almost half of the passengers were transported to an alternate dimension!


In [None]:
# Read the competition's training split into a DataFrame. Leaving the frame as
# the last expression of the cell renders it below the cell.
raw_data = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/train.csv')
raw_data

# Let's take a look at the data we have and think about how relevant each field might be to the outcome

First off, our training data covers about two-thirds of the passengers (~8700 rows).

'PassengerId' - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.


'HomePlanet' - The planet the passenger departed from, typically their planet of permanent residence.

'CryoSleep' - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.


'Cabin' - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

'Destination' - The planet the passenger will be debarking to.

'Age' - The age of the passenger.

'VIP' - Whether the passenger has paid for special VIP service during the voyage.

'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

'Name' - The first and last names of the passenger.

'Transported' - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.


In [None]:



class ETL():
    '''
    Feature-extraction stage: derives new columns from the composite
    'Cabin' and 'PassengerId' string columns.

    Two frames are kept:
      * raw_data     - the frame handed in by the caller; never written to
                       by the methods of this class
      * working_data - an independent copy that receives the new columns
    '''

    def __init__(self, raw_data):
        '''
        :input raw_data: pandas DataFrame (or None, to be supplied later
                         through set_raw_data / set_working_data)
        '''
        self.raw_data = raw_data
        # Copy so that adding columns to the working frame does not also
        # modify the caller's raw frame (both names used to point at the
        # very same object).
        self.working_data = raw_data.copy() if raw_data is not None else None

    @classmethod
    def load_and_run_ETL(cls, raw_data):
        '''
        convenience constructor: builds an ETL object and runs the whole
        pipeline on it
        :input raw_data: pandas DataFrame
        :return: the ETL instance, with working_data already transformed
        '''
        instance = cls(raw_data)
        instance.run_etl()
        return instance

    def set_raw_data(self, data):
        '''replace the stored raw frame'''
        self.raw_data = data

    def set_working_data(self, data):
        '''replace the stored working frame'''
        self.working_data = data

    def get_raw_data(self):
        '''return the stored raw frame'''
        return self.raw_data

    def get_working_data(self):
        '''return the stored working frame'''
        return self.working_data

    def run_etl(self, use_working=True):
        '''
        function which runs the etl pipeline
        :input use_working: bool, read the source columns from working_data
                            (True) or from raw_data (False)
        '''
        self.extract_cabin_data(use_working)
        self.extract_passengerID(use_working)

    def _source(self, use_working):
        '''pick the frame the source columns are read from'''
        return self.working_data if use_working else self.raw_data

    def extract_cabin_data(self, use_working=True):
        '''
        function which uses working or raw data input to split the
        'Cabin' column ("deck/num/side") into three separate columns:
        'deck', 'cabin_num' and 'cabin_side'. The result is always written
        to working_data.
        :input use_working: bool
        '''
        new_column_names = ['deck', 'cabin_num', 'cabin_side']
        cabin_parts = (
            self._source(use_working)['Cabin']
            .str.split('/', n=2, expand=True)
            # The split only yields as many columns as the longest value
            # has parts; pad to three so the assignment below never fails
            # on sparse or malformed data.
            .reindex(columns=list(range(len(new_column_names))))
        )
        for name, position in zip(new_column_names, cabin_parts.columns):
            self.working_data[name] = cabin_parts[position]

    def extract_passengerID(self, use_working=True):
        '''
        function which uses working or raw data input to take the part of
        'PassengerId' before the first underscore and store it in a new
        'Passenger_Group' column of working_data
        :input use_working: bool
        '''
        passenger_ids = self._source(use_working)['PassengerId']
        self.working_data['Passenger_Group'] = (
            passenger_ids.str.split('_', n=1, expand=False).str[0]
        )

            
class Preprocessing():
    '''
    Encoding stage: turns the boolean columns into nullable integers and
    one-hot encodes the 'HomePlanet' and 'Destination' columns.

    Two frames are kept:
      * raw_data     - the frame handed in by the caller; never written to
                       by the methods of this class
      * working_data - an independent copy that receives every encoding
    '''

    def __init__(self, raw_data):
        '''
        :input raw_data: pandas DataFrame (or None, to be supplied later
                         through set_raw_data / set_working_data)
        '''
        self.raw_data = raw_data
        # Copy so that encoding the working frame does not also modify the
        # caller's frame (both names used to point at the same object).
        self.working_data = raw_data.copy() if raw_data is not None else None

    @classmethod
    def load_and_run_preprocessing(cls, raw_data):
        '''
        convenience constructor: builds a Preprocessing object and runs
        every preprocessing step on it
        :input raw_data: pandas DataFrame
        :return: the Preprocessing instance, with working_data encoded
        '''
        instance = cls(raw_data)
        instance.run_preprocessing()
        return instance

    def run_preprocessing(self, use_working=True):
        '''
        function to run all of the relevant preprocessing functions
        :input use_working: bool, read the source columns from working_data
                            (True) or from raw_data (False)
        '''
        self.binary_encode_transported(use_working)
        self.binary_encode_VIP(use_working)
        self.binary_encode_CryoSleep(use_working)
        self.one_hot_encode_homeplanet(use_working)
        self.one_hot_encode_destination(use_working)

    def set_raw_data(self, data):
        '''replace the stored raw frame'''
        self.raw_data = data

    def set_working_data(self, data):
        '''replace the stored working frame'''
        self.working_data = data

    def get_raw_data(self):
        '''return the stored raw frame'''
        return self.raw_data

    def get_working_data(self):
        '''return the stored working frame'''
        return self.working_data

    def _source(self, use_working):
        '''pick the frame the source columns are read from'''
        return self.working_data if use_working else self.raw_data

    def _binary_encode(self, column, use_working):
        '''
        cast `column` to the nullable 'Int64' dtype and store it in
        working_data under the same name
        '''
        self.working_data[column] = self._source(use_working)[column].astype('Int64')

    def _one_hot_encode(self, column, use_working):
        '''
        replace `column` in working_data by integer indicator columns named
        "<column>_<value>", plus "<column>_nan" for missing values

        Every indicator column that get_dummies produces is cast to int, so
        no particular category has to be present in the data. Only `column`
        itself (and earlier indicator columns of the same name) is replaced;
        all other columns of working_data are kept, whichever frame the
        source values are read from.
        '''
        dummies = pd.get_dummies(
            self._source(use_working)[column], prefix=column, dummy_na=True
        ).astype(int)
        replaced = {column, *dummies.columns}
        stale = [name for name in self.working_data.columns if name in replaced]
        encoded = self.working_data.drop(columns=stale)
        # Appending column by column keeps the layout of
        # pd.get_dummies(frame, columns=[column]): remaining columns first,
        # indicator columns at the end.
        for name in dummies.columns:
            encoded[name] = dummies[name]
        self.working_data = encoded

    def binary_encode_transported(self, use_working=True):
        '''
        function which converts the 'Transported' column to nullable Int64
        :input use_working: bool
        '''
        self._binary_encode('Transported', use_working)

    def binary_encode_VIP(self, use_working=True):
        '''
        function which converts the 'VIP' column to nullable Int64
        :input use_working: bool
        '''
        self._binary_encode('VIP', use_working)

    def binary_encode_CryoSleep(self, use_working=True):
        '''
        function which converts the 'CryoSleep' column to nullable Int64
        :input use_working: bool
        '''
        self._binary_encode('CryoSleep', use_working)

    def one_hot_encode_homeplanet(self, use_working=True):
        '''
        function which one-hot encodes the 'HomePlanet' column into
        'HomePlanet_<planet>' indicator columns (and 'HomePlanet_nan')
        :input use_working: bool
        '''
        self._one_hot_encode('HomePlanet', use_working)

    def one_hot_encode_destination(self, use_working=True):
        '''
        function which one-hot encodes the 'Destination' column into
        'Destination_<planet>' indicator columns (and 'Destination_nan')
        :input use_working: bool
        '''
        self._one_hot_encode('Destination', use_working)
        

In [None]:
# Stage 1: derive the cabin and passenger-group columns from the raw frame.
etl = ETL(raw_data=raw_data)
etl.run_etl(use_working=True)
etl_df = etl.get_working_data()

# Stage 2: encode the boolean and categorical columns of the stage-1 output.
pre = Preprocessing(raw_data=etl_df)
pre.run_preprocessing(use_working=True)
preprocessed_df = pre.get_working_data()


In [None]:
# Column index of the preprocessed frame (a bare expression is only rendered
# when it is the last statement of its cell).
preprocessed_df.columns


# Explicit column subset used for the selected frame; note that the unsplit
# 'Cabin' column is not part of it.
columns_to_select = [
    # identifier and passenger attributes
    'PassengerId', 'CryoSleep', 'Age', 'VIP',
    # amounts billed at the on-board amenities
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Name', 'Transported',
    # columns derived by the ETL step
    'deck', 'cabin_num', 'cabin_side', 'Passenger_Group',
    # one-hot encoded home planet
    'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'HomePlanet_nan',
    # one-hot encoded destination
    'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
    'Destination_TRAPPIST-1e', 'Destination_nan',
]
selected_preprocessed_df = preprocessed_df.loc[:, columns_to_select]


In [None]:
# Count passengers per deck within each (Transported, CryoSleep) combination.
(
    preprocessed_df
    .groupby(['Transported', 'CryoSleep'])['deck']
    .value_counts()
)

# Let's explore the ship in 3D to see whether any patterns stand out visually

In [None]:
def visualize_missing_data(data, figsize=(16, 10), title='Missing values by column'):
    '''
    function which draws a heatmap of data.isnull(): one cell per value of
    the frame, coloured by whether that value is missing
    :input data: pandas DataFrame
    :input figsize: (width, height) of the figure in inches
    :input title: str shown above the heatmap
    '''
    if data.empty:
        # Nothing to draw for a frame without rows or columns.
        print('visualize_missing_data: the frame is empty, nothing to plot')
        return

    # Explicit figure/axes instead of relying on pyplot's "current axes".
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(data.isnull(), yticklabels=False, cmap='viridis', ax=ax)
    ax.set_title(title)
    plt.show()

visualize_missing_data(preprocessed_df)

In [None]:
# Keep only the rows that have no missing value in any column, then plot the
# missing-value heatmap of that subset.
filtered_df = preprocessed_df.loc[preprocessed_df.notna().all(axis=1)]
visualize_missing_data(filtered_df)
# Fraction of rows that remain after the filter.
filtered_df.shape[0] / preprocessed_df.shape[0]