In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import missingno

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, plot_roc_curve

# Spaceship Titanic

In [None]:
# Load the raw competition files; `train` and `test` are kept untouched and
# all later processing happens on derived frames.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Report the number of rows (samples) and columns (features) of each raw frame.
print(f'training data:\nsamples: {train.shape[0]}\nfeatures: {train.shape[1]}')
print('='*50)
print(f'testing data:\nsamples: {test.shape[0]}\nfeatures: {test.shape[1]}')

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Preview the first rows of the raw test data.
test.head()

Since 'Name' and 'PassengerId' are unique for each passenger, we drop them

In [None]:
# Working copies of both datasets without the per-passenger identifier columns.
cols_to_exclude = ['Name', 'PassengerId']
df_train, df_test = (
    frame.drop(columns=cols_to_exclude) for frame in (train, test)
)

# Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Column dtypes and non-null counts of the test frame.
df_test.info()

In [None]:
# Summary statistics of the test frame.
df_test.describe()

In [None]:
# Correlation heatmap of the training data.
# Only numeric and boolean columns can be correlated, so they are selected
# explicitly: newer pandas releases no longer drop object columns silently in
# DataFrame.corr() and raise instead.
numeric_train = df_train.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(18,12))
plt.title('Correlation Matrix')

sns.heatmap(numeric_train.corr(), cmap='Blues', annot=True)
plt.show()

**Separating the Features based on data types**

In [None]:
# Group the training columns by dtype: object columns are treated as
# categorical, float64 columns as continuous.
FEATURES = list(df_train.columns)
column_dtypes = df_train.dtypes

categorical = [name for name in FEATURES if column_dtypes[name] == 'object']
continuous = [name for name in FEATURES if column_dtypes[name] == 'float64']

In [None]:
## Count Plot for Categorical variables
def cat_plot(col):
    """Draw a count plot of training column `col`, split by 'Transported'.

    Reads the module-level `df_train` frame and shows the figure immediately.
    """
    fig, ax = plt.subplots(figsize=(14,10))
    ax.set_title(f'Transported based on {col}')

    sns.countplot(data=df_train,
                  x=col,
                  hue='Transported',
                  palette='Set1',
                  ax=ax)
    plt.show()

In [None]:
# Count plot for every categorical feature except 'Cabin', each followed by a
# separator line.
for feature in categorical:
    if feature != 'Cabin':
        cat_plot(feature)
        print('='*125+'\n')

## Inferences:
* Most people are from Earth, but it is also the only planet where fewer people were transported than not
* Fewer people chose to go into CryoSleep, but most of those who did completed the journey
* The highest number of people are travelling to TRAPPIST-1e, but the proportion of transported passengers remains consistent across all destinations
* There are very few people classified as VIP 

In [None]:
# Pie chart of the target distribution.
# value_counts() orders its result by frequency, not by value, so the slice
# labels are derived from the index instead of being hard-coded by position;
# otherwise the labels would silently swap if the class balance flipped.
target_counts = df_train['Transported'].value_counts()
slice_labels = ['Transported' if flag else 'Not Transported'
                for flag in target_counts.index]

plt.figure(figsize=(14,10))
plt.pie(target_counts,
        labels=slice_labels,
        autopct='%1.1f%%', explode=[0, 0.2],
        shadow=True, colors=['#0c06c7', '#05daed'], textprops={'fontsize':15})
plt.show()

In [None]:
# Absolute class counts of the target column.
df_train['Transported'].value_counts()

## Inference
**The training dataset is balanced in terms of the Target variable**

In [None]:
## Distribution plot and Histogram 
def continuous_plot(col):
    """Show two side-by-side distribution plots for the continuous column `col`.

    Left panel: training values split by the 'Transported' flag (with
    histogram). Right panel: training vs. testing distribution (density only).
    Reads the module-level `df_train` and `df_test` frames.
    """
    figure, (by_target_ax, by_dataset_ax) = plt.subplots(nrows=1, ncols=2, figsize=(18,10))

    not_transported = df_train.loc[df_train.Transported==False, col]
    transported = df_train.loc[df_train.Transported==True, col]

    # Left panel: distribution per target class.
    sns.distplot(not_transported, label='Not Transported', hist=True, color='#fc0328', ax=by_target_ax)
    sns.distplot(transported, label='Transported', hist=True, color='#0c06c7', ax=by_target_ax)

    # Right panel: training vs. testing distribution.
    sns.distplot(df_train[col], label='Training dataset', hist=False, color='#fc0328', ax=by_dataset_ax)
    sns.distplot(df_test[col], label='Testing dataset', hist=False, color='#0c06c7', ax=by_dataset_ax)

    for panel in (by_target_ax, by_dataset_ax):
        panel.legend(loc='upper right', prop={'size': 12})

    plt.show()

In [None]:
# Distribution plots for every continuous feature, each followed by a
# separator line.
for i in continuous:
    continuous_plot(i)
    print('='*160+'\n')

## Inferences
* Distribution of Age is consistent and is closely similar to a normal distribution
* The rest of the continuous features are highly positively skewed

The one feature that is left, is 'Cabin'. It is a little tricky, since it is not unique like 'PassengerId', neither does it represent a factor numerically nor is it feasibly categorical. Therefore, we pay special attention to this feature.

In [None]:
# Unique, non-missing cabin identifiers in order of first appearance.
# dropna() replaces list.remove(np.nan), which raises ValueError when the
# column has no missing value and otherwise relies on NaN object identity.
cabin = df_train['Cabin'].dropna().unique().tolist()
len(cabin)

'Cabin' has **6560** unique values! Therefore, we cannot use it directly for our model. <br>
That being said, we can apply some logic to this feature upon close observation. <br>
A Cabin datapoint is represented as - deck/num/side. We can separate these as three different features!

In [None]:
# Show how one cabin identifier splits on '/' into its parts.
cab_str = cabin[0].split('/')
print(f'{cabin[0]}: {cab_str}')

In [None]:
# Split 'Cabin' on '/' into three new columns in both datasets.
for frame in (df_train, df_test):
    cabin_parts = frame['Cabin'].str.split(pat='/', expand=True)
    frame[['deck', 'num', 'side']] = cabin_parts

In [None]:
# Frequency of each 'num' value, ordered by the value itself.
df_train['num'].value_counts().sort_index()

'num' is still not feasible as it has too many categories. So we will keep only 'deck' and 'side'

In [None]:
# Remove the original 'Cabin' column and the 'num' part from both datasets,
# in place.
cols_to_remove = ['Cabin', 'num']
for frame in (df_train, df_test):
    frame.drop(columns=cols_to_remove, inplace=True)

In [None]:
# Count plots for the two features derived from 'Cabin'.
cat_plot('deck')
print('='*125+'\n')
cat_plot('side')

That's it for the EDA part!

# Feature Engineering

1. Dealing with missing values: <br>
NA values lead to inconsistent data, therefore we fill these cells with an appropriate value
2. Encoding: <br>
Machine learning algorithms operate purely on numbers, so we encode the categorical variables as numeric values
3. Standardization:<br>
Linear models such as logistic regression are fitted with iterative, gradient-based optimization, so the scale of the features strongly affects how the algorithm behaves. Hence, standardization is required.

In [None]:
# Preview the training frame after the 'Cabin' split.
df_train.head()

## Dealing with missing values

In [None]:
# Visualise where values are missing in the training frame.
missingno.matrix(df_train)

In [None]:
# Visualise where values are missing in the test frame.
missingno.matrix(df_test)

The matrix shows that a lot of datapoints are missing. We first fill the Categorical NANs with mode, and the numerical features with their median.

In [None]:
# 'Cabin' no longer exists as a column. Filtering (instead of list.remove)
# keeps this cell safe to re-run: remove() raises ValueError the second time.
categorical = [name for name in categorical if name != 'Cabin']

In [None]:
# Register the features derived from 'Cabin' as categorical. The membership
# check keeps the list free of duplicates when this cell is re-run.
for derived_feature in ('deck', 'side'):
    if derived_feature not in categorical:
        categorical.append(derived_feature)

In [None]:
def fill_categorical_na(col):
    """Fill missing values of categorical column `col` with the training mode.

    The mode is computed on the non-missing training values only and is then
    used for both the module-level `df_train` and `df_test`, so the two
    datasets are imputed with the same value.
    """
    mode_of_col = df_train[col].dropna().mode()[0]

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the in-place call works on an intermediate object and
    # does not reliably update the parent frame (it is a no-op under pandas
    # Copy-on-Write).
    df_train[col] = df_train[col].fillna(mode_of_col)
    df_test[col] = df_test[col].fillna(mode_of_col)

In [None]:
# Impute every categorical feature in both datasets.
for i in categorical:
    fill_categorical_na(i)

In [None]:
# Impute the continuous features with their training medians.
# - The medians are computed on the continuous columns only: calling
#   DataFrame.median() on a frame that still holds string columns raises on
#   newer pandas releases.
# - The same training medians are used for the test set, consistent with the
#   categorical imputation, which also uses training statistics for both.
train_medians = df_train[continuous].median()
df_train = df_train.fillna(train_medians)
df_test = df_test.fillna(train_medians)

In [None]:
# Remaining missing values per column in the training frame, largest first.
df_train.isna().sum().sort_values(ascending=False)

In [None]:
# Remaining missing values per column in the test frame, largest first.
df_test.isna().sum().sort_values(ascending=False)

**The missing values have been dealt with.** <br>
The process for Categorical variables is longer, because we can only use mode as measure of central tendency. And it is sometimes possible that the mode for a feature turns out to be NAN itself! Therefore, we find mode without NANs, and then fill out the datasets.

## Encoding of Categorical features

In [None]:
ohe = OneHotEncoder()
def encode(col):
    """One-hot encode column `col` in the module-level `df_train` and `df_test`.

    The shared encoder is fitted on the training column only and applied to
    both datasets. One indicator column per category is added, named after the
    category value, and the original column is dropped from both frames.
    """
    ohe.fit(df_train[[col]])

    transformed = ohe.transform(df_train[[col]]).toarray()
    test_transformed = ohe.transform(df_test[[col]]).toarray()

    # The indicator columns come out in the encoder's own category order
    # (ohe.categories_), which is not the first-appearance order returned by
    # Series.unique(). Naming the columns from categories_ keeps each name
    # attached to the indicator it actually describes.
    category_names = ohe.categories_[0].tolist()
    df_train[category_names] = transformed
    df_test[category_names] = test_transformed

    df_train.drop([col], axis=1, inplace=True)
    df_test.drop([col], axis=1, inplace=True)

In [None]:
# One-hot encode every categorical feature except the ones that are
# integer-encoded separately.
integer_encoded = ('deck', 'CryoSleep', 'VIP')
for feature in categorical:
    if feature not in integer_encoded:
        encode(feature)

In [None]:
le = LabelEncoder()
def int_encode(col):
    """Replace column `col` with integer labels in `df_train` and `df_test`.

    The shared encoder is fitted on the training column and the same fitted
    mapping is applied to the test column. Re-fitting on the test column
    would assign different integers to the same category whenever the two
    datasets do not contain exactly the same set of values.

    NOTE(review): a test value never seen in training now makes transform()
    fail instead of being silently mis-encoded — confirm this is acceptable.
    """
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])

In [None]:
# Integer-encode the features that were skipped by the one-hot step.
for i in ['deck', 'CryoSleep', 'VIP']:
    int_encode(i)

All the features are now encoded. Some based on OneHotEncoding, while others on standard encoding. <br>
Generally, if a feature has too many categories (e.g. 'deck' has 7) or if the feature is of bool dtype (e.g. 'VIP'), we prefer standard (integer label) encoding. <br>
Otherwise, OneHotEncoding is a better way to encode features.

**Last step is to encode the target variable**

In [None]:
# Encode the target as integer labels; re-fits the shared `le` encoder, so its
# state now reflects the target rather than the last feature it encoded.
df_train['Transported'] = le.fit_transform(df_train['Transported'])

This completes our encoding

## Standardization

In [None]:
# Feature matrix and target vector.
# drop() returns an independent frame, so the column-wise assignments done
# during standardization modify X itself rather than a selection of df_train
# (which triggers pandas' chained-assignment warning).
X = df_train.drop(columns=['Transported'])
y = df_train['Transported']

In [None]:
ss = StandardScaler()
def standardize(col):
    """Standardize column `col` of the module-level `X` and `df_test`.

    The shared scaler is fitted on the training column only; the fitted
    parameters are then applied to both the training and the test column.
    """
    train_values = X[col].values.reshape(-1,1)
    test_values = df_test[col].values.reshape(-1,1)

    ss.fit(train_values)
    X[col] = ss.transform(train_values)
    df_test[col] = ss.transform(test_values)

In [None]:
# Standardize every feature column, one at a time.
for i in X.columns.tolist():
    standardize(i)

In [None]:
# Preview the standardized training features.
X.head()

In [None]:
# Preview the standardized test features.
df_test.head()

All the features in both the datasets are now standardized. With this, the feature engineering part is complete. <br>
Our data is now ready for the Machine Learning Model!

# Machine Learning - Logistic Regression

First, we split the dataset into training and validation sets

In [None]:
# Hold out 20% of the training data for validation; random_state fixes the
# split so results are reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=1)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

In [None]:
# Fit a logistic regression classifier on the training split, allowing up to
# 1000 solver iterations.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [None]:
# Confusion matrix of the fitted classifier on the validation split.
# NOTE(review): plot_confusion_matrix may be unavailable in newer scikit-learn
# releases — confirm against the installed version.
plot_confusion_matrix(clf, X_valid, y_valid)

In [None]:
# Predicted labels for the validation split, used by the metrics below.
y_pred = clf.predict(X_valid)

In [None]:
# Validation metrics collected into a small two-column table.
# Precision and recall are macro-averaged over both classes.
scores = {
    'type': ['Accuracy score', 'Precision score', 'Recall score'],
    'score': [
        accuracy_score(y_valid, y_pred),
        precision_score(y_valid, y_pred, average='macro'),
        recall_score(y_valid, y_pred, average='macro'),
    ],
}

scores_df = pd.DataFrame(scores)
scores_df

In [None]:
# Precision-recall curve of the classifier on the validation split.
# NOTE(review): plot_precision_recall_curve may be unavailable in newer
# scikit-learn releases — confirm against the installed version.
plot_precision_recall_curve(clf, X_valid, y_valid)

In [None]:
# ROC curve of the classifier on the validation split.
# NOTE(review): plot_roc_curve may be unavailable in newer scikit-learn
# releases — confirm against the installed version.
plot_roc_curve(clf, X_valid, y_valid)

# Submission

In [None]:
# Predict on the test set. The columns are selected in exactly the order the
# model was trained on, so a missing or reordered feature fails loudly here
# instead of silently feeding values into the wrong coefficient.
predictions = clf.predict(df_test[X_train.columns])

In [None]:
# Re-read the raw test file to recover the passenger ids that were dropped
# before modelling. The column is looked up by name rather than by position,
# so the result does not depend on the column order of the file.
dummy_df = pd.read_csv('../input/spaceship-titanic/test.csv')
pass_ids = dummy_df['PassengerId']
pass_ids

In [None]:
# The temporary frame is no longer needed once the ids are extracted.
del dummy_df

In [None]:
# Assemble the submission frame: passenger ids plus the predicted labels
# converted from 1/0 back to True/False.
mapping = {1:True, 0:False}
submission = pd.DataFrame({'PassengerId':pass_ids, 'Transported':predictions})
submission = submission.assign(Transported=submission['Transported'].map(mapping))
submission.head()

In [None]:
# Write the submission file without the frame's index column.
submission.to_csv('submission.csv', index=False)