In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## Loading the Data

In [None]:
# Read the labelled and unlabelled splits; paths are relative to the notebook's working directory.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Keep the test ids aside: PassengerId is dropped from the feature frame later,
# but it is needed again when the submission file is written.
test_id = test['PassengerId']

In [None]:
# transforming True/False to 1/0 format

def f0t1(data, cols):
    """Convert truthy/falsy columns to integer 1/0 flags, in place.

    Missing values (NaN/None) are mapped to 0. NaN is truthy in Python, so a
    plain truthiness test would silently turn every missing entry into 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    cols : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the call can be used in an assignment.
    """
    for col in cols:
        # pd.notna guards the truthiness test so that NaN/None end up as 0.
        data[col] = data[col].apply(lambda x: 1 if (pd.notna(x) and x) else 0)
    return data

# Transported is the target column, so it is only converted on train;
# test gets just the two feature flags.
train = f0t1(train, ['CryoSleep', 'VIP', 'Transported'])
test = f0t1(test, ['CryoSleep', 'VIP'])

In [None]:
# Stack train and test into one frame so imputation and feature engineering are
# applied to both at once; the target column is removed from the combined frame.
data = pd.concat([train, test], ignore_index=True).drop(columns=['Transported'])

In [None]:
data.head()

### Feature descriptions:

* **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
* **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
* **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
* **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
* **Destination** - The planet the passenger will be debarking to.
* **Age** - The age of the passenger.
* **VIP** - Whether the passenger has paid for special VIP service during the voyage.
* **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
* **Name** - The first and last names of the passenger.
* **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## EDA (Exploratory Data Analysis)

In [None]:
# Count of passengers per HomePlanet, with one series per Transported value.
sns.histplot(data=train, x='HomePlanet', hue='Transported').set(title='HomePlanet vs Transported')
plt.show()

In [None]:
# CryoSleep is already a 0/1 flag at this point (converted by f0t1 above).
sns.histplot(data=train, x='CryoSleep', hue='Transported').set(title='CryoSleep vs Transported')
plt.show()

In [None]:
# Count of passengers per Destination, with one series per Transported value.
sns.histplot(data=train, x='Destination', hue='Transported').set(title='Destination vs Transported')
plt.show()

In [None]:
# kde=True overlays a density curve for each Transported value on the Age histogram.
sns.histplot(data=train, x='Age', hue='Transported', kde=True).set(title='Age vs Transported')
plt.show()
# Skewness is computed over all train rows, not per Transported value.
print('Skewness:',train.Age.skew())

The **KDE lines** of the plot suggest that minors (passengers under 18 years of age) have a higher chance of being transported than adults.

In [None]:
# VIP is already a 0/1 flag at this point (converted by f0t1 above).
sns.histplot(data=train, x='VIP', hue='Transported').set(title='VIP vs Transported')
plt.show()

Since the plot shows no clear relationship between VIP status and the target that might help the prediction model, the **VIP column might be dropped**.

## Imputations

In [None]:
# Show only the columns that still contain missing values.
missing_per_column = data.isna().sum()
missing_per_column[missing_per_column != 0]

In [None]:
def imput(data):
    """Fill missing values in the raw feature columns, in place.

    Categorical columns (HomePlanet, Destination) get their most frequent
    value, numeric columns (Age and the five spending columns) get their
    median, and Cabin/Name get fixed placeholders that still match the
    'deck/num/side' and 'first last' formats the later parsers split on.
    All statistics are computed from the frame that is passed in.
    CryoSleep and VIP are not handled here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed above; it is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the listed columns filled.
    """
    fill_values = {
        'HomePlanet': data['HomePlanet'].mode()[0],
        'Cabin': 'None/-1/None',
        'Destination': data['Destination'].mode()[0],
        'Age': data['Age'].median(),
        'RoomService': data['RoomService'].median(),
        'FoodCourt': data['FoodCourt'].median(),
        'ShoppingMall': data['ShoppingMall'].median(),
        'Spa': data['Spa'].median(),
        'VRDeck': data['VRDeck'].median(),
        'Name': 'Anonymous N/A',
    }

    # Assign each filled column back explicitly. Calling
    # `data.col.fillna(..., inplace=True)` operates on an intermediate Series
    # and does not reliably update the frame under pandas Copy-on-Write.
    for col, value in fill_values.items():
        data[col] = data[col].fillna(value)

    return data

data = imput(data)

# Rebuild train from the imputed rows (helps with plotting) and re-attach the
# target, which is no longer part of `data`.
transported = train['Transported']
train = data.iloc[:len(train)].copy()
train['Transported'] = transported

In [None]:
data.isna().sum()

## Feature Engineering

### PassengerId - GroupSize/Solo Extraction (Group & Batch rejected for final model)

In [None]:
def group(data):
    """Add Group, GroupSize, Solo and Batch columns derived from PassengerId.

    The frame is modified in place and also returned. GroupSize is counted
    within the frame that is passed in, so it only reflects rows present in
    that frame.
    """
    # The part of PassengerId before the underscore is the numeric group id.
    data['Group'] = data['PassengerId'].map(lambda pid: int(pid.split('_')[0]))

    # GroupSize: how many rows of this frame share the passenger's group id.
    members_per_group = data['Group'].value_counts()
    data['GroupSize'] = data['Group'].map(members_per_group)

    # Solo flags passengers who are the only member of their group.
    data['Solo'] = data['GroupSize'].eq(1).astype('int64')

    # Bucket the group ids into 19 batches of width 500 covering 0..9500;
    # ids outside that range get no batch (NaN).
    batch_names = ['Batch ' + str(number) for number in range(19)]
    batch_edges = range(0, 10000, 500)
    data['Batch'] = pd.cut(data['Group'], bins=batch_edges, labels=batch_names)

    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
# GroupSize/Solo on train are counted from train rows only, so they can differ
# from the values in `data` for any group that also has rows in test.
data = group(data)
train = group(train)

In [None]:
# Distribution of the numeric group id, with one series per Transported value.
sns.histplot(data=train, x='Group', hue='Transported', kde=True).set(title='Group vs Transported')
plt.show()
print('Skewness:',train.Group.skew())

In [None]:
# GroupSize here is the train-only count produced by group(train).
sns.histplot(data=train, x='GroupSize', hue='Transported', kde=True).set(title='GroupSize vs Transported')
plt.show()
print('Skewness:',train.GroupSize.skew())

In [None]:
# Solo is 1 when the passenger's GroupSize is exactly 1, otherwise 0.
sns.histplot(data=train, x='Solo', hue='Transported').set(title='Solo vs Transported')
plt.show()

In [None]:
# Tall figure with Batch on the y-axis — presumably so the 19 batch labels stay readable.
plt.figure(figsize=(5,8))
sns.histplot(data=train, y='Batch', hue='Transported', kde=True).set(title='Batch vs Transported')
plt.show()

### Age - Adult Extraction

In [None]:
def adult(data):
    """Add an Adult flag: 0 when Age is below 18, 1 otherwise.

    The frame is modified in place and also returned. A missing Age is not
    below 18, so it is flagged as 1.
    """
    data['Adult'] = np.where(data['Age'] < 18, 0, 1)
    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
data = adult(data)
train = adult(train)

In [None]:
# Adult is 0 for Age below 18 and 1 otherwise.
sns.histplot(data=train, x='Adult', hue='Transported').set(title='Adult vs Transported')
plt.show()

### Cabin - Deck/Num/Side Extraction

In [None]:
def cabin(data):
    """Split Cabin ('deck/num/side') into Deck, Num and Side columns.

    The frame is modified in place and also returned. Deck and Side stay as
    strings; Num is converted to int. Every Cabin value must contain three
    '/'-separated parts with an integer in the middle.
    """
    # Split every cabin string once and reuse the pieces for all three columns.
    pieces = [str(value).split('/') for value in data['Cabin']]
    data['Deck'] = [parts[0] for parts in pieces]
    data['Num'] = [int(parts[1]) for parts in pieces]
    data['Side'] = [parts[2] for parts in pieces]
    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
data = cabin(data)
train = cabin(train)

In [None]:
# The 'None' deck comes from the 'None/-1/None' placeholder used for missing cabins.
sns.histplot(data=train, x='Deck', hue='Transported').set(title='Deck vs Transported')
plt.show()

In [None]:
# plt.figure(figsize=(15,250))
# sns.histplot(data=train, x='Num', hue='Transported')
# plt.show()

In [None]:
# The 'None' side comes from the 'None/-1/None' placeholder used for missing cabins.
sns.histplot(data=train, x='Side', hue='Transported').set(title='Side vs Transported')
plt.show()

### RoomService/FoodCourt/ShoppingMall/Spa/VRDeck - Expenditure Extraction

In [None]:
def expenditure(data):
    """Add Expenditure: the row-wise total of the five amenity spending columns.

    The frame is modified in place and also returned. A missing value in any
    of the five columns makes that row's total missing as well.
    """
    total = data['RoomService']
    for amenity in ('FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        total = total + data[amenity]
    data['Expenditure'] = total
    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
data = expenditure(data)
train = expenditure(train)

In [None]:
# Raw (untransformed) total spend; the printed skewness is over all train rows.
sns.histplot(data=train, x='Expenditure', hue='Transported', kde=True).set(title='Expenditure vs Transported')
plt.show()
print('Skewness:',train.Expenditure.skew())

### Expenditure - MoneySpent Extraction

In [None]:
def moneyspent(data):
    """Add a MoneySpent flag: 0 when Expenditure equals 0, 1 otherwise.

    The frame is modified in place and also returned. A missing Expenditure
    is not equal to 0, so it is flagged as 1.
    """
    data['MoneySpent'] = data['Expenditure'].ne(0).astype('int64')
    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
# Must run while Expenditure is still the raw total: the flag tests for exactly 0.
data = moneyspent(data)
train = moneyspent(train)

In [None]:
# MoneySpent is 0 when total Expenditure is 0 and 1 otherwise.
sns.histplot(data=train, x='MoneySpent', hue='Transported').set(title='MoneySpent vs Transported')
plt.show()

### Name - FamilyName Extraction

In [None]:
def name(data):
    """Add FamilyName: the second whitespace-separated token of Name.

    The frame is modified in place and also returned. Every Name must contain
    at least two tokens; the 'Anonymous N/A' placeholder yields 'N/A'.
    """
    data['FamilyName'] = [full_name.split()[1] for full_name in data['Name']]
    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
data = name(data)
train = name(train)

In [None]:
# plt.figure(figsize=(15,250))
# sns.histplot(data=train, y='FamilyName', hue='Transported')
# plt.show()

### Removing unwanted features

In [None]:
# Note: some of these columns are dropped because later trial-and-error showed
# that they hurt the prediction model.
data = data.drop(columns=['PassengerId','Cabin','VIP','Name','Group','Batch'])

In [None]:
data.head()

## Numerical Data Handling

In [None]:
def expenditure(data):
    """Replace Expenditure with log(1 + Expenditure), in place.

    The frame is modified and also returned. The transform is not idempotent:
    calling it twice on the same frame applies the logarithm twice.

    Note: this definition reuses the name of the earlier `expenditure` helper
    that builds the column from the five spending columns, so once this cell
    has run the earlier helper is shadowed.
    """
    shifted = data['Expenditure'] + 1
    data['Expenditure'] = np.log(shifted)
    return data

# Applied to the combined frame (used for modelling) and to train (used for plots).
# Run this cell once per kernel session: each call applies the log again.
data = expenditure(data)
train = expenditure(train)

In [None]:
# Same plot and skewness check as before, now on the log-transformed Expenditure.
sns.histplot(data=train, x='Expenditure', hue='Transported', kde=True)
plt.show()
print('Skewness:',train.Expenditure.skew())

## Categorical Data Handling

In [None]:
from sklearn.preprocessing import LabelEncoder

def lenc(data, cols):
    """Replace each listed column with integer codes from a LabelEncoder.

    The frame is modified in place and also returned. The encoder is refitted
    for every column, so codes are assigned independently per column.
    """
    encoder = LabelEncoder()
    for column in cols:
        encoded = encoder.fit_transform(data[column])
        data[column] = encoded
    return data

# FamilyName is turned into integer codes here, so it is already numeric when
# the remaining columns are passed to get_dummies further down.
data = lenc(data, ['FamilyName'])

In [None]:
data.FamilyName.value_counts().sort_values()

In [None]:
# One-hot encode the remaining non-numeric columns — presumably HomePlanet,
# Destination, Deck and Side; confirm against data.dtypes.
data = pd.get_dummies(data)

In [None]:
data.head()

In [None]:
# print(data.columns.to_list())

### Data Breakdown

In [None]:
# The first len(train) rows of the combined frame are the train rows (train was
# concatenated first); re-attach the target that was saved before imputation.
train = data[:len(train)].copy()
train['Transported'] = transported

In [None]:
# Everything after the train rows is the test split. len(train) is unchanged by
# the rebuild above, so the cut point still matches the original concat order.
test = data[len(train):].copy()

## Model Analysis

In [None]:
from sklearn.model_selection import train_test_split

# Features are every engineered column except the target.
X = train.drop(columns=['Transported'])
y = train.Transported

# Hold out 20% of the labelled rows for validation. Note that X_test/y_test are
# this validation slice, not the competition test set held in `test`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees and a fixed seed; `pred` holds its predictions
# for the validation slice.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy of whichever model last wrote `pred` (the random forest when run in order).
accuracy_score(y_test, pred)

In [None]:
from xgboost import XGBClassifier

# XGBoost with a fixed seed; `pred` is overwritten with its predictions for the
# validation slice.
xgb = XGBClassifier(learning_rate=0.2, random_state=42)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

In [None]:
# Validation accuracy of whichever model last wrote `pred` (XGBoost when run in order).
accuracy_score(y_test, pred)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with a fixed seed; `pred` is overwritten with its predictions for the
# validation slice.
lgb = LGBMClassifier(learning_rate=0.1, random_state=42)
lgb.fit(X_train, y_train)
pred = lgb.predict(X_test)

In [None]:
# Validation accuracy of whichever model last wrote `pred` (LightGBM when run in order).
accuracy_score(y_test, pred)

## Making Predictions

In [None]:
# Rebuild X and y from all labelled rows (same expressions as in the validation
# cell) so the final model is trained on the full train set.
X = train.drop(columns=['Transported'])
y = train.Transported

In [None]:
# Refit the LightGBM model on every labelled row, then predict the competition
# test rows. `pred` now refers to test predictions, not validation predictions.
lgb.fit(X, y)
pred = lgb.predict(test)

In [None]:
# Assemble the submission from the ids saved before PassengerId was dropped and
# the predictions rendered as the strings 'True'/'False'.
submit = pd.DataFrame({'PassengerId': test_id, 'Transported': pred})
submit['Transported'] = submit['Transported'].map(lambda label: 'True' if label == 1 else 'False')

submit.to_csv('submit.csv', index=False)
print('Predictions saved to submit.csv')

In [None]:
# Sanity check: how many test passengers were predicted 'True' versus 'False'.
sns.countplot(data=submit, x='Transported')