In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# File and Data Field Descriptions

**train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
* PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
* HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
* CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
* Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
* Destination - The planet the passenger will be debarking to.
* Age - The age of the passenger.
* VIP - Whether the passenger has paid for special VIP service during the voyage.
* RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
* Name - The first and last names of the passenger.
* Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

**test.csv** - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

**sample_submission.csv** - A submission file in the correct format.
* PassengerId - Id for each passenger in the test set.
* Transported - The target. For each passenger, predict either True or False.

# Import Data & Check Dataframe

In [None]:
# Import training files
# Path is relative to the notebook's working directory; preview the first rows
train = pd.read_csv('../input/spaceship-titanic/train.csv')
train.head()

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# Column dtypes and non-null counts of the training frame
train.info()

In [None]:
# Import Test Files
# Path is relative to the notebook's working directory; preview the first rows
test = pd.read_csv('../input/spaceship-titanic/test.csv')
test.head()

In [None]:
# (rows, columns) of the test frame
test.shape

In [None]:
# Column dtypes and non-null counts of the test frame
test.info()

In [None]:
# Install fast-ml library
# Fast-ML is a Python package with numerous inbuilt functionalities to make the life of a data scientist much easier
!pip install fast_ml

* df : Dataframe, refers to dataset used for analysis
* variable : str, refers to a single variable. As required in the function it has to be passed ex 'V1'
* variables : list type, refers to list of variables. Must be passed as list ex ['V1', 'V2']. Even a single variable has to be passed in list format. ex ['V1']
* target : str, refers to target variable
* model : str, ML problem type. use 'classification' or 'clf' for classification problems and 'regression' or 'reg' for regression problems
* method : str, refers to various techniques available for Missing Value Imputation, Feature Engineering... as available in each module

In [None]:
# EDA using Fast-ML
from fast_ml import eda

# One of the most useful dataframe summary views.
# Returns a dataframe summarising each variable: datatype, number of unique values,
# a sample of the unique values, missing count and missing percent
eda.df_info(train)

# Create Data Copy

In [None]:
# Create a copy of train
# .copy() gives an independent frame; plain assignment would only create a second
# name for the same object, so edits to train1 could leak back into train
train1 = train.copy()
train1.head()

In [None]:
# Create a copy of test
# .copy() gives an independent frame; plain assignment would only create a second
# name for the same object, so edits to test1 could leak back into test
test1 = test.copy()
test1.head()

# EDA

In [None]:
# Import Library
from pandas_profiling import ProfileReport

# Perform Pandas Profiling
# Build the report for the training frame; leaving the report object as the
# cell's last expression is what displays it
train_profile = ProfileReport(train1, title="EDA-Spaceship Titanic Training Data")
train_profile

In [None]:
# Same profiling report for the test frame, displayed as the cell's last expression
test_profile = ProfileReport(test1, title="EDA-Spaceship Titanic Test Data")
test_profile

In [None]:
# EDA
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Passenger counts per home planet, split by the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='HomePlanet', hue='Transported');

In [None]:
# Passenger counts per CryoSleep value, split by the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='CryoSleep', hue='Transported');

In [None]:
# Passenger counts per destination, split by the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='Destination', hue='Transported');

In [None]:
# Age distribution. sns.distplot is deprecated; histplot on a density scale
# with a KDE overlay gives the equivalent histogram-plus-curve view.
sns.histplot(data=train1, x='Age', kde=True, stat='density');

In [None]:
# Passenger counts per VIP value, split by the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='VIP', hue='Transported');

In [None]:
# Class counts of the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='Transported');

# Data Treatment

In [None]:
# PassengerId and Name are dropped: they are not used as model features
id_columns = ['PassengerId', 'Name']
train1 = train1.drop(columns=id_columns)
train1.head()

In [None]:
# Drop the same non-feature columns from the test dataframe
id_columns = ['PassengerId', 'Name']
test1 = test1.drop(columns=id_columns)
test1.head()

In [None]:
# Collapse the five amenity bills into one AmountSpent column, then drop the
# originals. Plain `+` propagates NaN, so a passenger with any missing bill gets
# a missing total.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train1 = (
    train1
    .assign(
        AmountSpent=lambda df: df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    )
    .drop(columns=spend_cols)
)
train1.head()

In [None]:
# AmountSpent distribution. sns.distplot is deprecated; histplot on a density
# scale with a KDE overlay gives the equivalent histogram-plus-curve view.
sns.histplot(data=train1, x='AmountSpent', kde=True, stat='density');

In [None]:
# Collapse the five amenity bills into one AmountSpent column, then drop the
# originals. Plain `+` propagates NaN, so a passenger with any missing bill gets
# a missing total.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test1 = (
    test1
    .assign(
        AmountSpent=lambda df: df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    )
    .drop(columns=spend_cols)
)
test1.head()

In [None]:
# How many unique cabin entries are there
# nunique() returns the count of distinct values (missing values are not counted);
# the bare attribute `unique` without parentheses only displayed the bound method
train1['Cabin'].nunique()

In [None]:
# Split Cabin column into respective constituents in the training data
# expand=True returns one column per '/'-separated part, labelled 0, 1, 2;
# columns 0 and 2 are used later as Deck and Side
cabin_details = train1['Cabin'].str.split('/',expand=True)
cabin_details

In [None]:
# Split Cabin column into respective constituents in the test data
# Read from test1 (the working copy), mirroring the training cell, instead of
# reaching back to the raw `test` frame
cabin_details_test = test1['Cabin'].str.split('/',expand=True)
cabin_details_test

In [None]:
# Keep the first and third cabin parts as Deck and Side (P = Port, S = Starboard);
# the middle part (the number) is not used for modelling. Cabin itself is
# dropped once its parts are extracted.
train1 = (
    train1
    .assign(Deck=cabin_details[0], Side=cabin_details[2])
    .drop(columns=['Cabin'])
)
train1

In [None]:
# Passenger counts per deck, split by the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='Deck', hue='Transported');

In [None]:
# Passenger counts per ship side, split by the Transported target.
# Keyword arguments are required: seaborn >= 0.12 reads the first positional
# argument as `data`, not as the x vector.
sns.countplot(data=train1, x='Side', hue='Transported');

In [None]:
# Keep the first and third cabin parts as Deck and Side (P = Port, S = Starboard);
# the middle part (the number) is not used for modelling. Cabin itself is
# dropped once its parts are extracted.
test1 = (
    test1
    .assign(Deck=cabin_details_test[0], Side=cabin_details_test[2])
    .drop(columns=['Cabin'])
)
test1

# Missing Data Treatment

In [None]:
# Summarise train1 to see which columns still have missing values before imputing
eda.df_info(train1)

In [None]:
# Median-impute the numeric columns (Age, AmountSpent), then label the
# remaining missing values as their own 'Unknown' class
# Import Library
from sklearn.impute import SimpleImputer

# One imputer object, refitted per column on the training data only; the test
# column is filled with the training median to prevent leakage
median_impute = SimpleImputer(missing_values=np.nan, strategy='median')

for numeric_col in ('Age', 'AmountSpent'):
    # SimpleImputer expects 2-D input, hence the reshape to a column vector
    train_values = train1[numeric_col].to_numpy().reshape(-1, 1)
    test_values = test1[numeric_col].to_numpy().reshape(-1, 1)
    train1[numeric_col] = median_impute.fit_transform(train_values)
    test1[numeric_col] = median_impute.transform(test_values)

# For categorical columns a separate class 'Unknown' stands in for missing values
# Source https://jamesrledoux.com/code/imputation
train1, test1 = train1.fillna("Unknown"), test1.fillna("Unknown")

# Check if imputation worked in training data
eda.df_info(train1)

In [None]:
# Check if imputation worked in test data: the missing counts in the summary
# should now be zero
eda.df_info(test1)

# Encode Categorical Variables to Numerical

In [None]:
# Encode training data with dummies
# drop_first=True omits the first level of each encoded column, so k categories
# become k-1 indicator columns
train2=pd.get_dummies(data=train1,drop_first=True)
train2

In [None]:
# Encode test data with dummies
test2=pd.get_dummies(data=test1,drop_first=True)
# Encoding each split on its own can give different columns when a category
# appears in only one of them. Align the test frame to the training feature
# columns (everything except the target): indicators missing from test are
# added as all-zero, extras are dropped, and the order matches training.
feature_cols = train2.columns.drop('Transported', errors='ignore')
test2 = test2.reindex(columns=feature_cols, fill_value=0)
test2

# Standardize Numerical Variables

In [None]:
# Segregate numerical variables
num_col = ['Age','AmountSpent']

# Standardize Training Data

# Import Library
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training columns only; the fitted object is reused
# for the test data in the next cell
scaler = StandardScaler()
train_scaled = pd.DataFrame(
    scaler.fit_transform(train2[num_col]),
    index=train2.index,
    columns=num_col,
)

# Put the scaled columns in front of the remaining (unscaled) ones
train2 = pd.concat([train_scaled, train2.drop(columns=num_col)], axis=1)
train2.head()

In [None]:
# Standardize the test columns with the scaler already fitted on training data
# (transform only, no refit)
test_scaled = pd.DataFrame(
    scaler.transform(test2[num_col]),
    index=test2.index,
    columns=num_col,
)

# Put the scaled columns in front of the remaining (unscaled) test columns
test2 = pd.concat([test_scaled, test2.drop(columns=num_col)], axis=1)
test2.head()

# Split Data into Train Test

In [None]:
# Separate the target (dependent variable) from the predictors
y = train2['Transported']
X = train2.drop(columns='Transported')

# Preview the feature matrix
X.head()

In [None]:
# Splitting the data for training and testing out model
from sklearn.model_selection import train_test_split 

# Hold out 30% of the labelled rows for evaluation; random_state=1 fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Check if target variable is balanced: number of rows per class of y
y.value_counts()

Target variable seems pretty balanced so no up/down sampling is necessary

# Modelling

In [None]:
import time

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

#from sklearn.metrics import f1_score

# Define Function to Test Multiple Classification Models and Output Performance Metrics
def fit_n_print(model, X_train, X_test, y_train, y_test, n_splits=10):
    """Fit ``model`` and collect score, cross-validation and timing metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    X_train, y_train
        Data the model is fitted on and cross-validated over.
    X_test, y_test
        Hold-out data used for the predictions and the test score.
    n_splits : int, default 10
        Number of folds passed to ``KFold`` for the cross-validation.

    Returns
    -------
    tuple
        ``(train_acc, mean_train_acc, std_train_acc, test_acc, duration, pred)``
        where ``train_acc`` / ``test_acc`` are ``model.score`` on the train and
        test data, ``mean_train_acc`` / ``std_train_acc`` summarise the
        cross-validation scores on the training data, ``duration`` is the
        elapsed seconds for fit plus predict, and ``pred`` holds the
        predictions for ``X_test``.
    """
    # perf_counter is a monotonic clock meant for measuring elapsed time
    start = time.perf_counter()

    model.fit(X_train, y_train)       # fit the model using the train data
    pred = model.predict(X_test)      # model predictions on the test data

    duration = time.perf_counter() - start  # covers both fit and predict

    train_acc = model.score(X_train, y_train)  # score on the training data
    test_acc = model.score(X_test, y_test)     # score on the hold-out data

    # Cross-validate on the training data only
    kfold = KFold(n_splits=n_splits)
    results = cross_val_score(model, X_train, y_train, cv=kfold)
    mean_train_acc = np.mean(abs(results))
    std_train_acc = results.std()

    # return all the metrics along with predictions
    return train_acc, mean_train_acc, std_train_acc, test_acc, duration, pred
print("Function defined successfully!")

In [None]:
# Define Function to Create Model Objects from Algorithms and Output Final Results
def run_model(X_train, X_test, y_train, y_test):
    """Fit a set of default-configured classifiers and tabulate their metrics.

    Each model is passed to ``fit_n_print`` with the given train/test data.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target.

    Returns
    -------
    pandas.DataFrame
        One row per model (index named ``'Model'``) with the columns
        'Training Accuracy', 'CV-Mean Training Accuracy',
        'CV-Sigma Training Accuracy', 'Test Accuracy' and 'Duration'.
    """
    # Import Model Libraries
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    import xgboost as xgb
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import BaggingClassifier

    # Display name -> model object; insertion order fixes the row order of the result
    models = {
        'Logistic Regression': LogisticRegression(),
        'Gaussian Naive Bayes': GaussianNB(),
        'Stochastic Gradient Descent': SGDClassifier(),
        'K-Nearest Neighbours': KNeighborsClassifier(),
        'Decision Tree Classifier': DecisionTreeClassifier(),
        'Random Forest Classifier': RandomForestClassifier(),
        'Support Vector Classifier': SVC(),
        'XGBoost Classifier': xgb.XGBClassifier(),
        'AdaBoost Classifier': AdaBoostClassifier(),
        'Gradient Boosting Classifier': GradientBoostingClassifier(),
        'Bagging Classifier': BaggingClassifier(),
    }
    metric_names = ['Training Accuracy', 'CV-Mean Training Accuracy',
                    'CV-Sigma Training Accuracy', 'Test Accuracy', 'Duration']

    result = {}
    for name, model in models.items():
        # The last element returned is the prediction array. It is left out of
        # the table: mixing it with the scalar metrics makes the rows ragged,
        # and NumPy >= 1.24 refuses to build an array from ragged rows.
        *metrics, _pred = fit_n_print(model, X_train, X_test, y_train, y_test)
        result[name] = metrics

    # One row per model, one column per scalar metric
    result1 = pd.DataFrame.from_dict(result, orient='index', columns=metric_names)
    result1.index.name = 'Model'   # name the index of the result1 dataframe as 'Model'
    return result1

base_result = run_model(X_train, X_test, y_train, y_test)
base_result