In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# Use the ggplot look for every figure drawn in this notebook
plt.style.use('ggplot')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the train and test splits from the competition's input directory
input_dir = "/kaggle/input/spaceship-titanic"
train, test = (pd.read_csv(f"{input_dir}/{split}.csv") for split in ("train", "test"))

# Exploration below works on a copy so the loaded training frame is left untouched
train_copy = train.copy()

# Hypotheses
We have the following passenger information:
- **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
- **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- **Destination** - The planet the passenger will be debarking to.
- **Age** - The age of the passenger.
- **VIP** - Whether the passenger has paid for special VIP service during the voyage.
- **RoomService**, **FoodCourt**, **ShoppingMall**, **Spa**, **VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- **Name** - The first and last names of the passenger.
- **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.


*What are some speculative reasons for getting transported to another dimension?*


1. Location on the ship when the collision occurred. Is there a common **deck**, num or side among passengers who were transported? Of those in the common location, was it primarily people who were in **cryosleep**?
2. Was age a factor? Did younger people tend to get transported because they were up late at the time of the collision? Were older people transported because there was an event for older guests on at the time and location of the collision?
3. If one member of a group (gggg part of the PassengerId) was transported, did this mean that others in the same group were too?

# Data

In [None]:
# Report the (rows, columns) shape of each split.
# The labels previously read "Train test shape" / "Test test shape".
print('Train set shape:', train_copy.shape)
print('Test set shape:', test.shape)

# Preview the first five rows of the training copy
train_copy.head(5)

## Data Types

In [None]:
# Show the pandas dtype of every column in the training copy
train_copy.dtypes

## Missing Values

In [None]:
# Per-column count of missing values for each split, printed one item per line
print('TRAINING SET MISSING VALUES:',
      train_copy.isna().sum(),
      '\n',
      'TEST SET MISSING VALUES:',
      test.isna().sum(),
      sep = '\n')

# EDA

**Target distribution**

In [None]:
# Pie chart of the target: share of passengers per value of Transported
pie_fig, pie_ax = plt.subplots(figsize = (10, 7))
target_counts = train_copy['Transported'].value_counts()
target_counts.plot.pie(explode = [0.1, 0.1], autopct = '%1.1f%%', ax = pie_ax)

Because our target is evenly distributed, we don't have to worry about resampling to correct for class imbalance.

## Numerical Data:

**Age**

In [None]:
# Age histogram (one-year bins) split by the target, with a KDE curve per class
age_fig, age_ax = plt.subplots(figsize = (10, 7))
sn.histplot(data = train_copy, x = 'Age', hue = 'Transported',
            binwidth = 1, kde = True, ax = age_ax)
age_ax.set_title('Age Distribution (shows two histograms overlayed on each other)')

*Notes*:
- 0-18 year olds were more likely to be transported compared to other age groups.
- 18-25 year olds less likely to be transported than not.
- Over-25s are roughly equally likely to be transported or not.

*Insight*:
- It would make sense to create a new feature which categorises people into either child, adolescent or adult.

**Spending on amenities**

In [None]:
# Amenity spending columns; this list is reused later to compute total expenditure
amenities = ['RoomService', 'Spa', 'FoodCourt', 'VRDeck', 'ShoppingMall']

# One row per amenity: full histogram on the left, zoomed-in view with KDE on the right
fig = plt.figure(figsize = (30, 20))

for counter, amenity in enumerate(amenities):
    # Left panel: raw histogram over the full count range.
    # seaborn's keyword for the target axes is `ax` (not `axes`), so pass it explicitly.
    ax = fig.add_subplot(len(amenities), 2, 2*counter + 1)
    sn.histplot(data = train_copy, x = amenity, ax = ax, bins = 30, kde = False, hue = 'Transported')

    # Right panel: same histogram with a KDE, y-axis clipped to 0-100 so the
    # low-count bins are visible
    ax = fig.add_subplot(len(amenities), 2, 2*counter + 2)
    sn.histplot(data = train_copy, x = amenity, ax = ax, bins = 30, kde = True, hue = 'Transported')
    ax.set_ylim([0,100])

*Notes*:
- Across all amenities, those who spent less than roughly £200 were more likely to be transported. Those who spent more than this amount were more likely not to be transported.

*Insight*:
- Create two new categories: small spender (< £200) vs big spender for each amenity.

## Categorical Data

In [None]:
# Categorical columns to compare against the target
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# One count plot per feature, stacked vertically
figure = plt.figure(figsize = (20,20))

for counter, feat in enumerate(cat_features):
    ax = figure.add_subplot(len(cat_features), 1, counter + 1)
    sn.countplot(data = train_copy, x = feat, hue = 'Transported', ax = ax)

# Tighten the layout of THIS figure (the previous code called tight_layout on
# `fig`, the figure created by the amenity-spending cell)
figure.tight_layout()

*Notes*:

HomePlanet: 
- Those coming from Europa had a higher chance of being transported.
- Those coming from Earth were more likely to not be transported.
- People from Mars had roughly a 50% chance of being transported.

CryoSleep:
- Those who were in CryoSleep were much more likely to be transported.
- Those who weren't were more likely not to be transported.

Destination and VIP categories were roughly equally likely to be transported or not.

*Insights*:

- We should probably consider dropping the VIP feature as it does not look insightful.


## Qualitative features

In [None]:
# Free-text / identifier columns that need parsing before they can be used
qual_features = ['PassengerId', 'Cabin', 'Name']

# Preview the first rows of just those columns
train_copy.loc[:, qual_features].head()

*Notes*

- PassengerId takes the form gggg_pp where gggg indicates the group and pp indicates their number within the group.
- Cabin takes the form deck/num/side. It could be interesting to see whether certain decks or sides are more prone to transportation.

*Insights*

- We can extract the group and group size from the PassengerId feature.
- We can extract the deck, number and side from the cabin feature.
- We could extract surnames from the name feature to identify families.

# Feature engineering

**Age status**

Let's group ages and hence convert this numerical feature into a categorical one.

In [None]:
train_age = train.copy()

# Bucket Age into four labelled groups.
# include_lowest = True closes the first interval on the left, so a passenger
# with Age == 0 lands in 'age_0-18' instead of becoming NaN. This matches the
# binning used by NumericalAttributesAdder further down.
train_age['age_group'] = pd.cut(x = train_age['Age'],
                                bins = [0, 18, 25, 40, 100],
                                labels = ('age_0-18', 'age_18-25', 'age_25-40', 'age_over40'),
                                include_lowest = True)

# Plot new feature distribution
figure = plt.figure(figsize = (10,8))
sn.histplot(data = train_age, x = 'age_group', hue = 'Transported')
plt.title('Age group distribution')

**Expenditure**

Calculate total expenditure and identify passengers with no expenditure.

In [None]:
# Total billed across all amenities per passenger (missing amounts are skipped)
spend_total = train_age[amenities].sum(axis = 1, skipna = True)

# Add both new features to a fresh copy of the age-enriched frame:
#   total_expenditure - the summed spend
#   zero_expenditure  - 1 when the passenger spent nothing at all, else 0
train_exp = train_age.assign(total_expenditure = spend_total,
                             zero_expenditure = spend_total.eq(0).astype(int))

# Distributions: spend histogram on top, zero-spend counts underneath
figure, panels = plt.subplots(nrows = 2, ncols = 1, figsize = (20, 12))
ax1, ax2 = panels
sn.histplot(data = train_exp, x = 'total_expenditure', hue = 'Transported',
            binwidth = 200, kde = True, ax = ax1)
sn.countplot(data = train_exp, x = "zero_expenditure", hue = 'Transported', ax = ax2)
ax2.set_xlabel('zero_expenditure (1 = True, 0 = False)')

**Passenger group**

Extract passenger group and group size from PassengerId

In [None]:
train_pass = train_exp.copy()

# New feature - group: the gggg prefix of PassengerId (kept as a string)
train_pass['group'] = train_pass['PassengerId'].str.split('_').str[0]

# New feature - group size: number of passengers sharing the same group id.
# Mapping the counts back by group value avoids a merge + rename that depends on
# how pandas names the value_counts() column (it changed in pandas 2.0).
group_size = train_pass['group'].value_counts()
train_pass['group_size'] = train_pass['group'].map(group_size)

# Visualise new features
figure, (ax1, ax2) = plt.subplots(1, 2, figsize = (20,4))
sn.histplot(data = train_pass,
            x = 'group_size',
            hue = 'Transported', ax = ax1)
# Plot the group id as a number so that binwidth = 5 refers to a range of group ids
sn.histplot(data = train_pass,
            x = train_pass['group'].astype(int),
            hue = 'Transported',
            ax = ax2,
            binwidth = 5)

Notes:
- We can see that those travelling on their own are slightly more likely to be transported.

Insight:
- Create another feature which identifies whether someone is travelling on their own or not.

In [None]:
# New feature - solo travelling: 1 when the passenger's group has a single member
train_solo = train_pass.assign(solo = train_pass['group_size'].eq(1).astype(int))

# Visualise new feature: solo vs. non-solo counts split by the target
solo_fig, solo_ax = plt.subplots(figsize = (15, 4))
sn.countplot(data = train_solo, x = 'solo', hue = 'Transported', ax = solo_ax)

**Features for future work:**

- Cabin location
- Last name, family size 

# Missing values

In [None]:
train_missing = train_solo.copy()

# Number of missing values per column
na_count = train_missing.isna().sum()

# Percentage of ALL rows that are missing in each column.
# (Dividing by .count() would use the non-missing rows as the denominator and
# overstate the percentage.)
na_pct = train_missing.isna().mean() * 100

# Side-by-side summary table, one row per column of the data
na_df = pd.concat([na_count, na_pct], axis = 'columns', keys = ['count', 'pct'])
na_df

**Features going to be used in the model:**

- Categorical features: HomePlanet, CryoSleep, Destination, age_group, zero_expenditure, group_size, solo
- Numerical features: 

**Plan for missing values:**

Impute missing values with mode for categorical features.

# Model

The features we will use in this model are:

1. Age group
2. Zero-expenditure
3. Group-size
4. HomePlanet (one-hot encoded)
5. CryoSleep (one-hot encoded)

**Transformation Function**

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class NumericalAttributesAdder(BaseEstimator, TransformerMixin):
    """Derive the age-group dummies and the zero-expenditure flag.

    The input is addressed by position: column 0 is Age and columns 1-5 are the
    five amenity spend columns (the order produced by selecting the float64
    columns of the training frame).

    The transformer is stateless. The age groups are one-hot encoded against a
    fixed list of labels rather than by fitting an encoder on the data being
    transformed, so the output always has the same five columns in the same
    order, whichever rows are passed in.
    """

    # Positional layout of the incoming array
    _AGE_IX = 0
    _AMENITY_IXS = [1, 2, 3, 4, 5]

    # Age binning; must stay in sync with the labels below
    _AGE_BINS = [0, 18, 25, 40, 100]
    _AGE_LABELS = ['age_0-18', 'age_18-25', 'age_25-40', 'age_over40']

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a DataFrame with four age-group dummies and `zero_expenditure`.

        Parameters
        ----------
        X : array-like of shape (n_samples, 6)
            Numeric data laid out as described in the class docstring.

        Returns
        -------
        pandas.DataFrame of shape (n_samples, 5)
            Columns: one 0/1 column per age label, followed by
            `zero_expenditure` (1 when the amenity spend sums to 0).
        """
        values = np.asarray(X, dtype = float)

        # Create new feature - age_group (first interval closed on the left so Age == 0 is kept)
        age_group = pd.cut(x = values[:, self._AGE_IX],
                           bins = self._AGE_BINS,
                           labels = self._AGE_LABELS,
                           include_lowest = True)

        # One-hot encode against the fixed label order. Ages that are missing or
        # outside the bins have code -1 and therefore get an all-zero row.
        codes = np.asarray(age_group.codes)
        age_group_encoded = (codes[:, None] == np.arange(len(self._AGE_LABELS))).astype(float)

        # Total expenditure on amenities, ignoring missing amounts
        total_expenditure = np.nansum(values[:, self._AMENITY_IXS], axis = 1)

        # Identify passengers with no expenditure
        zero_expenditure = (total_expenditure == 0).astype(int)

        # Keep only the columns we want
        X_enriched = pd.DataFrame(age_group_encoded, columns = list(self._AGE_LABELS))
        X_enriched['zero_expenditure'] = zero_expenditure

        return X_enriched
    

In [None]:
class CategoricalAttributesAdder(BaseEstimator, TransformerMixin):
    """Derive group size from PassengerId and one-hot encode HomePlanet / CryoSleep.

    The input is addressed by position: column 0 is PassengerId, column 1 is
    HomePlanet and column 2 is CryoSleep.

    The one-hot encoder is learned in `fit` and reused in `transform`, so the
    encoded columns are the same for every data set transformed afterwards.
    Group size is computed within whatever data is passed to `transform`.
    """

    # Positional layout of the incoming array
    _PASSENGERID_IX = 0
    _HOMEPLANET_IX = 1
    _CRYOSLEEP_IX = 2

    def __init__(self):
        pass

    def _categorical_columns(self, X):
        """Return the HomePlanet and CryoSleep columns as a 2-D object array."""
        values = np.asarray(X, dtype = object)
        return values[:, [self._HOMEPLANET_IX, self._CRYOSLEEP_IX]]

    def fit(self, X, y = None):
        """Learn the HomePlanet / CryoSleep categories from X."""
        from sklearn.preprocessing import OneHotEncoder

        # Categories unseen during fit are encoded as all zeros instead of raising
        self.encoder_ = OneHotEncoder(handle_unknown = 'ignore')
        self.encoder_.fit(self._categorical_columns(X))

        return self

    def transform(self, X):
        """Return a float array: group size followed by the one-hot columns.

        Parameters
        ----------
        X : array-like of shape (n_samples, 3)
            Data laid out as described in the class docstring, with no
            missing values (PassengerId must be a string of the form gggg_pp).

        Returns
        -------
        numpy.ndarray of shape (n_samples, 1 + n_encoded_columns)
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'encoder_')

        values = np.asarray(X, dtype = object)

        # New feature - group: the part of PassengerId before the underscore
        group = pd.Series(values[:, self._PASSENGERID_IX]).str.split('_').str[0]

        # New feature - group size: how many rows share each group id.
        # Mapping the counts avoids relying on the column name that
        # value_counts() produces, which differs between pandas versions.
        group_size = group.map(group.value_counts())

        # One Hot Encode: HomePlanet, CryoSleep (encoder output is sparse by default)
        cat_encoded = self.encoder_.transform(self._categorical_columns(values)).toarray()

        # Keep only the columns we want
        return np.column_stack([group_size.to_numpy(dtype = float), cat_encoded])

**Testing**

In [None]:
# Numerical function testing: run the numeric transformer by hand on imputed data
from sklearn.impute import SimpleImputer

train2 = train.copy()

# Every float64 column of the training frame
num_cols = train2.select_dtypes(include = 'float64').columns.tolist()

# Fill missing numeric values with the column mean
num_imputer = SimpleImputer(strategy = 'mean')
train_num = num_imputer.fit_transform(train2[num_cols])

# Derive the numeric features and show them
num_class = NumericalAttributesAdder()
num_enriched = pd.DataFrame(num_class.fit_transform(train_num))
num_enriched

In [None]:
# Categorical function testing: run the categorical transformer by hand

# Columns the categorical transformer expects, in this order
cat_cols = ['PassengerId', 'HomePlanet', 'CryoSleep']

# Independent copy of just those columns
train3 = train.loc[:, cat_cols].copy()

# Fill missing values with each column's most frequent value
cat_imputer = SimpleImputer(strategy = 'most_frequent')
train_cat = cat_imputer.fit_transform(train3)

# Derive the categorical features and show them
cat_class = CategoricalAttributesAdder()
cat_enriched = cat_class.fit_transform(train_cat)
cat_enriched

**Data Pre-Processing Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns routed to each branch of the pre-processor
num_cols = train.select_dtypes(include = 'float64').columns.tolist()
cat_cols = ['PassengerId', 'HomePlanet', 'CryoSleep']

# Numerical branch: mean-impute, then derive the numeric features
num_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('attributes_adder', NumericalAttributesAdder()),
])

# Categorical branch: mode-impute, then derive the categorical features
cat_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('attributes_adder', CategoricalAttributesAdder()),
])

# Combine both branches; their outputs are placed side by side
pre_processing = ColumnTransformer(transformers = [
    ('num', num_pipeline, num_cols),
    ('cat1', cat_pipeline, cat_cols),
])

# Fit the pre-processor on the training data and show the resulting feature matrix
train_prepared = pd.DataFrame(pre_processing.fit_transform(train))
train_prepared

**Estimator (Logistic Regression)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Separate the predictors from the target (target cast to 0/1)
X_train = train.drop(columns = 'Transported')
y_train = train['Transported'].astype('int')

# Build the feature matrix with the pre-processing pipeline
X_train_prepared = pre_processing.fit_transform(X_train)

# Model to tune
logit_reg = LogisticRegression()

# Solver / penalty combinations to try; one dict per solver
l2_or_none = ['l2', 'none']
my_params = [
    {'solver': ['newton-cg'], 'penalty': list(l2_or_none)},
    {'solver': ['lbfgs'], 'penalty': list(l2_or_none)},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'solver': ['sag'], 'penalty': list(l2_or_none), 'max_iter': [500, 1000]},
]

# 4-fold cross-validated grid search scored on accuracy
logit_grid = GridSearchCV(estimator = logit_reg,
                          param_grid = my_params,
                          scoring = 'accuracy',
                          cv = 4)
logit_grid.fit(X_train_prepared, y_train)

# Full table of cross-validation results
pd.DataFrame(logit_grid.cv_results_)

In [None]:
# Best logistic regression found by the grid search, with its mean CV accuracy
logit_estimator = logit_grid.best_estimator_
print(f"{logit_estimator} {logit_grid.best_score_}")

**Estimator (Support Vector Classifier)**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Separate the predictors from the target (target cast to 0/1)
X_train = train.drop(columns = 'Transported')
y_train = train['Transported'].astype('int')

# Build the feature matrix with the pre-processing pipeline
X_train_prepared = pre_processing.fit_transform(X_train)

# Model to tune
svc = SVC()

# Regularisation strength, kernel and gamma settings to try
my_params = {
    'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}

# 4-fold cross-validated grid search scored on accuracy
svc_grid = GridSearchCV(estimator = svc,
                        param_grid = my_params,
                        scoring = 'accuracy',
                        cv = 4)
svc_grid.fit(X_train_prepared, y_train)

# Full table of cross-validation results
pd.DataFrame(svc_grid.cv_results_)

In [None]:
# Best support vector classifier found by the grid search, with its mean CV accuracy
svc_estimator = svc_grid.best_estimator_
print(f"{svc_estimator} {svc_grid.best_score_}")

**Complete Pipeline**

In [None]:
# Complete pipeline: pre-processing followed by the tuned support vector classifier
full_pipeline = Pipeline(steps = [
    ('pre_processing', pre_processing),
    ('estimator', svc_estimator),
])

# Fit the transformers and the estimator on the training data, then predict
# on that same training data
pred = full_pipeline.fit(X_train, y_train).predict(X_train)

**Evaluation**

In [None]:
from sklearn.metrics import classification_report

# Precision / recall / F1 per class as a dictionary, comparing the training
# labels with the predictions held in `pred`
report = classification_report(y_true = y_train, y_pred = pred, output_dict = True)
report

**Predicting test set**

In [None]:
# Template that defines the expected submission layout.
# The absolute path matches the one used to load train.csv / test.csv, so the
# cell does not depend on the current working directory.
sample_submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

submission = sample_submission.copy()

# Make transformations and predictions on the test set
test_pred = full_pipeline.predict(test)

# Convert 1 & 0s to True/False
test_pred_tf = (test_pred == 1)

# Attach each prediction to its PassengerId and look it up per submission row,
# so the result is correct even if the template is ordered differently from `test`
pred_by_id = pd.Series(test_pred_tf, index = test['PassengerId'])
submission['Transported'] = submission['PassengerId'].map(pred_by_id)
assert submission['Transported'].notna().all(), 'some PassengerIds in the template have no prediction'

# Output to csv
submission.to_csv('submission.csv', index = False)
submission