In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

In [None]:
root="/kaggle/input/spaceship-titanic/"

### Loading

In [None]:
train=pd.read_csv(root+"train.csv")
test=pd.read_csv(root+"test.csv")
sample_submission=pd.read_csv(root+"sample_submission.csv")

### Column Descriptions  :


- `PassengerId` - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- `HomePlanet` - The planet the passenger departed from, typically their planet of permanent residence.
- `CryoSleep` - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- `Cabin` - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- `Destination` - The planet the passenger will be debarking to.
- `Age` - The age of the passenger.
- `VIP` - Whether the passenger has paid for special VIP service during the voyage.
- `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- `Name` - The first and last names of the passenger.
- `Transported` - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
train.info()

In [None]:
test.info()

In [None]:
sample_submission.info()

In [None]:
categorical_features = train.select_dtypes(exclude=['float64'])
numerical_features = train.select_dtypes(exclude=['object'])

In [None]:
# Identifier and target columns are never used as predictors.
non_feature_columns = {"PassengerId", "Transported"}

# Build both lists straight from `train` rather than from the previous values
# of `categorical_features` / `numerical_features`: those names switch from
# DataFrame to list here, so reading `.columns` from them made this cell fail
# with an AttributeError as soon as it was executed a second time.
categorical_features = [col for col in train.select_dtypes(exclude=['float64']).columns
                        if col not in non_feature_columns]
numerical_features = [col for col in train.select_dtypes(exclude=['object']).columns
                      if col not in non_feature_columns]

In [None]:
display(categorical_features)
display(numerical_features)

### Exploring dataset

In [None]:
train.head(3)

In [None]:
test.head(3)

Some values are missing :

In [None]:
train.isna().sum().sort_values(ascending = False)

In [None]:
train.isna().sum().sort_values(ascending = False).sum()

In [None]:
test.isna().sum().sort_values(ascending = False)

In [None]:
test.isna().sum().sort_values(ascending = False).sum()

In [None]:
train_missing_val_freq = train.isna().sum().sort_values(ascending = False)/train.shape[0]

In [None]:
test_missing_val_freq = test.isna().sum().sort_values(ascending = False)/test.shape[0]

In [None]:
display(train_missing_val_freq*100)

In [None]:
display(test_missing_val_freq*100)

In [None]:
display((train_missing_val_freq-test_missing_val_freq)*100)

The missing-value frequencies are very close between the train and test datasets.

In [None]:
train.describe()

In [None]:
test.describe()

The distributions of the numerical features are very close between the train and test datasets.

In [None]:
train[categorical_features].describe()

In [None]:
# Show the category counts of each low-cardinality feature in the train set.
l = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
for feature in l:
    display(feature, train[feature].value_counts())

In [None]:
test[categorical_features].describe()

In [None]:
# Show the category counts of each low-cardinality feature in the test set.
l = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
for feature in l:
    display(feature, test[feature].value_counts())

Let's compare the relative category frequencies between train and test:

In [None]:
# For every categorical feature, display the difference between the share of
# each category in train and in test (shares are relative to the full row
# count, so missing values are part of the denominator).
l = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
n_train_rows, n_test_rows = train.shape[0], test.shape[0]
for feature in l:
    train_share = train[feature].value_counts().div(n_train_rows)
    test_share = test[feature].value_counts().div(n_test_rows)
    display(feature, (train_share - test_share).sort_values(ascending=False))

It's very close again.

## Imputation

Now we will impute the missing values. To make the Name feature usable as a category, we keep the family name instead of the full name. Moreover, we will impute most of the missing values using the group id part of the PassengerId feature, assuming that it is a good criterion.

In [None]:
train["Group"]=train["PassengerId"].str.split('_')

In [None]:
# Keep only the first piece of the split PassengerId (the "gggg" group id).
train["Group"] = train["Group"].str[0]

In [None]:
train["Family_name"]=train["Name"].str.split(' ')

In [None]:
# Keep the last token of each split name; rows whose Name is missing stay NaN.
train["Family_name"] = train["Family_name"].str[-1]

In [None]:
train[["Name", "Family_name"]].head(10)

In [None]:
train["Name"]=train["Family_name"]
train.drop("Family_name", axis=1, inplace=True)

In [None]:
# Peek at a few multi-passenger groups: take the group of every 10th row
# between index labels 400 and 590 and show it when it has several members.
preview_columns = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP", "Name"]
for row_label in range(400, 600, 10):
    df = train[train["Group"] == train.loc[row_label, "Group"]]
    if len(df) > 1:
        display(df[preview_columns])

### `Name`

We impute the missing values using the Group feature:

In [None]:
to_complete=train[train["Name"].isna()]

In [None]:
to_complete.shape

In [None]:
# Fill every missing family name with the most common name of the passenger's
# travel group, or "Solo" when no group member has a known name.
for idx, passenger in to_complete.iterrows():
    group_names = train.loc[train["Group"] == passenger["Group"], "Name"]
    # count() skips NaN, so one known name in the group is already enough to
    # borrow it. The previous `> 1` test required two named group mates and
    # tagged passengers travelling with a single named companion as "Solo".
    if group_names.count() > 0:
        name = group_names.value_counts().index[0]
    else:
        name = "Solo"
    train.loc[idx, "Name"] = name

In [None]:
train[train["Name"].isna()].shape

In [None]:
train[train["Name"]=="Solo"].shape

### `Cabin`

We impute the missing values using the Group feature:

In [None]:
to_complete=train[train["Cabin"].isna()]

In [None]:
to_complete.shape

In [None]:
# Fill every missing cabin with the most common cabin of the passenger's
# travel group, or the placeholder "Corridor" when the group has no known cabin.
for idx, passenger in to_complete.iterrows():
    group_cabins = train.loc[train["Group"] == passenger["Group"], "Cabin"]
    # count() skips NaN, so a single known cabin in the group is enough; the
    # previous `> 1` test sent such passengers to "Corridor".
    if group_cabins.count() > 0:
        # value_counts() sorts by descending frequency, so index[0] is the
        # cabin shared by most group members (ascending=True picked the
        # rarest one instead).
        cabin = group_cabins.value_counts().index[0]
    else:
        cabin = "Corridor"
    train.loc[idx, "Cabin"] = cabin

In [None]:
train[train["Cabin"].isna()].shape

In [None]:
train[train["Cabin"]=="Corridor"].shape

### `CryoSleep`

In [None]:
money_var=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [None]:
train["Money"]=train[money_var].sum(axis=1)

In [None]:
train[train["CryoSleep"]==True]["Money"].describe()

In [None]:
train[
    (train["CryoSleep"]==False) &
    (train["Money"]==0)
].shape

In [None]:
train[
    (train["CryoSleep"]==False) &
    (train["Money"]>0)
].shape

In [None]:
to_complete=train[train["CryoSleep"].isna()]

In [None]:
to_complete.shape

If money features are all zero we set CryoSleep True, else we set False :

In [None]:
# A passenger with no spending at all is marked as being in cryosleep,
# any other passenger as awake.
for idx, money_spent in to_complete["Money"].items():
    train.loc[idx, "CryoSleep"] = bool(money_spent == 0)

In [None]:
train[train["CryoSleep"].isna()].shape

### `VIP`

In [None]:
pd.crosstab(index=train["VIP"], columns=train["CryoSleep"])

In [None]:
train[train["VIP"]==True]["Money"].describe()

In [None]:
train[train["VIP"]==False]["Money"].describe()

In [None]:
to_complete=train[train["VIP"].isna()]

In [None]:
to_complete.shape

If the sum of the money features is at least 1300 we set VIP to True, otherwise we set it to False:

In [None]:
# Passengers whose total spending is below 1300 are marked as non-VIP,
# every other passenger as VIP.
for idx, money_spent in to_complete["Money"].items():
    train.loc[idx, "VIP"] = not (money_spent < 1300)

In [None]:
train[train["VIP"].isna()].shape

### `HomePlanet`

We impute HomePlanet from the passenger's group when another member's planet is known. Otherwise we take the most common HomePlanet among the passengers sharing the same CryoSleep, Destination and VIP values (Destination is ignored when it is missing):

In [None]:
to_complete=train[train["HomePlanet"].isna()]

In [None]:
to_complete.shape

In [None]:
var=['CryoSleep', 'Destination', 'VIP']

# `train` is read again on every iteration, so values filled for earlier rows
# are visible when later rows are processed.
for idx, passenger in to_complete.iterrows():
    candidates = train.loc[train["Group"] == passenger["Group"], "HomePlanet"]
    if candidates.count() == 0:
        # Nobody in the group has a known planet: fall back on the passengers
        # with the same profile, leaving Destination out when it is missing.
        if pd.isna(passenger['Destination']):
            profile_keys = [var[0], var[2]]
        else:
            profile_keys = var
        same_profile = pd.Series(True, index=train.index)
        for key in profile_keys:
            same_profile &= train[key] == passenger[key]
        candidates = train.loc[same_profile, "HomePlanet"]
    # Most frequent planet among the retained candidates.
    train.loc[idx, "HomePlanet"] = candidates.value_counts().index[0]

### `Destination`

We impute Destination with the most common value among the passengers sharing the same HomePlanet, CryoSleep and VIP values:

In [None]:
to_complete=train[train["Destination"].isna()]

In [None]:
to_complete.shape

In [None]:
var=['HomePlanet', 'CryoSleep', 'VIP']
# For each passenger without a destination, use the most frequent destination
# of the passengers matching on all three profile columns.
for idx, passenger in to_complete.iterrows():
    same_profile = pd.Series(True, index=train.index)
    for key in var:
        same_profile &= train[key] == passenger[key]
    destinations = train.loc[same_profile, "Destination"]
    train.loc[idx, "Destination"] = destinations.value_counts().index[0]

In [None]:
train[train["Destination"].isna()].shape

In [None]:
train.isna().sum().sort_values(ascending = False)

### `Age`

We set Age at the median value :

In [None]:
med=train["Age"].median()
display(med)

In [None]:
def age_input(x):
    """Return ``x`` unchanged, or the notebook-level ``med`` when ``x`` is NaN.

    NaN is the only value that differs from itself, hence the ``x != x`` test.
    """
    return med if x != x else x

In [None]:
train[train["Age"].isna()].shape

In [None]:
train["Age"]=train["Age"].apply(age_input)

In [None]:
train[train["Age"].isna()].shape

### Money features

If the CryoSleep feature is True, we set all the missing money features to 0.
If the CryoSleep feature is False, we use KNN to impute the money features, handling two cases separately: VIP set to True and VIP set to False.

In [None]:
money_var=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [None]:
train[train[money_var].count(axis=1)==0].shape

In [None]:
def input_var_1(df, var):
    """Set missing values of column ``var`` to 0 for passengers in cryosleep.

    The shape of the rows missing ``var`` is displayed before and after the
    fill. ``df`` is modified in place and also returned.
    """
    missing = df[var].isna()
    display(df[missing].shape)
    asleep = df["CryoSleep"] == True
    df.loc[missing & asleep, var] = 0
    display(df[df[var].isna()].shape)
    return df

In [None]:
for col in money_var :
    train=input_var_1(train, col)

In [None]:
standardiser=train[money_var].std()
train[money_var]=train[money_var]/standardiser

In [None]:
from sklearn.neighbors import KNeighborsRegressor

def input_var_2(df, target=None):
    """Impute the missing values of ``df`` with a distance-weighted KNN regression.

    Rows of ``df`` without any missing value form the training set. For each
    incomplete row, the columns that are present are used as predictors and
    the missing columns as regression targets. ``df`` itself is not modified:
    predictions are written into ``target`` at the same index labels and
    column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to impute.
    target : pandas.DataFrame, optional
        Frame receiving the predictions. Defaults to the notebook-level
        ``train`` frame, which is where results were always written.

    Returns
    -------
    None
    """
    if target is None:
        # Keep the historical side effect for existing `input_var_2(t_1)` calls.
        target = train

    to_complete = df[df.isna().any(axis=1)]
    full_completed = df.dropna()

    display(to_complete.shape)

    # Nothing to impute, or nothing to learn from.
    if to_complete.empty or full_completed.empty:
        return

    # Asking for more neighbours than there are training rows makes predict() fail.
    n_neighbors = min(3, len(full_completed))

    # Rows sharing the same set of missing columns need the same model, so
    # collect their labels and fit once per pattern instead of once per row.
    rows_by_pattern = {}
    for label, flags in to_complete.isna().iterrows():
        rows_by_pattern.setdefault(tuple(flags), []).append(label)

    for flags, labels in rows_by_pattern.items():
        missing = np.array(flags, dtype=bool)
        col_target = df.columns[missing]
        col_known = df.columns[~missing]
        if len(col_known) == 0:
            # Every column is missing: there is no predictor to measure a
            # distance with, so the row is left untouched.
            continue
        knn = KNeighborsRegressor(n_neighbors, weights='distance')
        knn.fit(full_completed[col_known], full_completed[col_target])
        target.loc[labels, col_target] = knn.predict(to_complete.loc[labels, col_known])

In [None]:
t_1=train[train["VIP"]==True][money_var]
display(t_1.shape)
t_2=train[train["VIP"]==False][money_var]
display(t_2.shape)
to_complete=t_1[t_1.isna().any(axis=1)]
display(to_complete.shape)
to_complete=t_2[t_2.isna().any(axis=1)]
display(to_complete.shape)

In [None]:
input_var_2(t_1)

In [None]:
input_var_2(t_2)

In [None]:
t_1=train[train["VIP"]==True][money_var]
display(t_1.shape)
t_2=train[train["VIP"]==False][money_var]
display(t_2.shape)
to_complete=t_1[t_1.isna().any(axis=1)]
display(to_complete.shape)
to_complete=t_2[t_2.isna().any(axis=1)]
display(to_complete.shape)

In [None]:
train[money_var]=train[money_var]*standardiser

In [None]:
train.isna().sum().sort_values(ascending = False)

In [None]:
train.drop("Money", axis=1, inplace=True)

In [None]:
train.info()

In [None]:
# train.to_csv(root+"train_cleaned.csv", index=False)

Now we can rerun the same chain to impute the missing values of the test dataframe.