# <span style='color:#A80808'>Objective</span>

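This notebook engineers a set of basic features from the Spaceship Titanic train and test data, saves the processed tables as CSV files, and ends with a short exploratory look at the numeric columns.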

In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

# <span style='color:#A80808'>Feature engineering</span>

In [None]:
# Helper functions
def num_features_fillna(df, num_features):
    """Fill missing values in each listed column with that column's mean.

    The columns are overwritten on ``df`` itself, and the same frame is
    returned for convenience.

    Args:
        df: DataFrame holding the columns to impute.
        num_features: Iterable of column names to process.

    Returns:
        ``df``, with the listed columns imputed.
    """
    for column in num_features:
        values = df[column]
        df[column] = values.fillna(values.mean())
    return df

def num_features_to_category(df, num_features, q=3):
    """Replace each listed column with the index of the bin it falls into.

    Binning uses ``pd.cut``, so ``q`` is the number of equal-width bins
    (not quantiles, despite the parameter name). The integer bin index is
    stored with ``category`` dtype. Columns are overwritten on ``df``.

    Args:
        df: DataFrame holding the columns to discretise.
        num_features: Iterable of column names to process.
        q: Bin specification forwarded to ``pd.cut`` as ``bins``.

    Returns:
        ``df``, with the listed columns replaced by categorical bin codes.
    """
    for column in num_features:
        bin_codes = pd.cut(df[column], bins=q, labels=False)
        df[column] = bin_codes.astype('category')
    return df

def num_features_to_log_scale(df, num_features):
    """Apply a natural-log transform to each listed column.

    A small offset is added before taking the log so that zero values map
    to a finite number instead of ``-inf``. Columns are overwritten on
    ``df``.

    Args:
        df: DataFrame holding the columns to transform.
        num_features: Iterable of column names to process.

    Returns:
        ``df``, with the listed columns log-transformed.
    """
    offset = 1e-6  # keeps log() finite for zero entries
    for column in num_features:
        shifted = df[column] + offset
        df[column] = np.log(shifted)
    return df

def num_features_scale(df, num_features):
    """Divide each listed column by the row-wise sum of all listed columns.

    The row totals are computed once, from the original values, before any
    column is overwritten. A small offset is added to the totals so rows
    that sum to zero do not cause a division by zero. Columns are
    overwritten on ``df``.

    Args:
        df: DataFrame holding the columns to rescale.
        num_features: List of column names that are summed and rescaled.

    Returns:
        ``df``, with the listed columns expressed as shares of the row total.
    """
    row_totals = df[num_features].sum(axis=1) + 1e-6  # offset avoids x / 0
    for column in num_features:
        share = df[column] / row_totals
        df[column] = share
    return df

In [None]:
def feature_engineering(df):
    """Turn a raw passenger table into a numeric, one-hot encoded feature frame.

    Steps, in order:

    1. Mean-impute the numeric columns.
    2. Split ``Cabin`` on ``/`` into ``Cabin_1`` (category), ``Cabin_2``
       (int32) and ``Cabin_3`` (category); a missing cabin is treated as
       ``'None/-1/None'``.
    3. Replace ``Name`` with an integer label of its second
       whitespace-separated token; a missing name is treated as
       ``'None None'``.
    4. Impute missing ``CryoSleep``/``VIP`` flags as ``False`` and missing
       ``HomePlanet``/``Destination`` as the category ``'None'``.
    5. Split ``PassengerId`` on ``_`` into two integer parts.
    6. One-hot encode every remaining categorical column with
       ``pd.get_dummies``.

    Args:
        df: Raw DataFrame containing at least ``PassengerId``, ``Name``,
            ``Cabin``, ``CryoSleep``, ``VIP``, ``HomePlanet``,
            ``Destination`` and the six numeric columns listed below.

    Returns:
        A new DataFrame with the engineered features. The input frame is
        not modified.

    Notes:
        The label encoder and the dummy columns are derived from the frame
        passed in, so two frames processed by separate calls can end up
        with different ``Name`` codes and different one-hot columns.
    """
    num_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    passthrough_features = ['Name', 'Cabin_2', 'PassengerNum']

    # Work on a copy so the caller's frame is never left half-transformed.
    df = df.copy()

    df = num_features_fillna(df, num_features)
    # Optional transforms, currently disabled:
    #df = num_features_to_log_scale(df, num_features[1:])
    #df = num_features_scale(df, num_features[1:])
    #df = num_features_to_category(df, num_features, q=3)

    # Cabin is a three-part "a/b/c" string; the middle part is numeric.
    cabin_parts = df['Cabin'].fillna('None/-1/None').str.split('/', expand=True)
    df['Cabin_1'] = cabin_parts[0].astype('category')
    df['Cabin_2'] = cabin_parts[1].astype('int32')
    df['Cabin_3'] = cabin_parts[2].astype('category')
    df = df.drop('Cabin', axis=1)

    # Keep only the second token of the name and label-encode it.
    second_tokens = df['Name'].fillna('None None').apply(lambda full_name: full_name.split()[1])
    df['Name'] = LabelEncoder().fit_transform(second_tokens.astype('string')).astype('int32')

    # Missing flags are imputed as False. Filling them with the string
    # 'None' and then casting to bool would turn every missing value into
    # True, because any non-empty string is truthy.
    for flag in ('CryoSleep', 'VIP'):
        df[flag] = df[flag].astype('boolean').fillna(False).astype(bool)

    # Missing planets/destinations become their own 'None' category.
    for column in ('HomePlanet', 'Destination'):
        df[column] = df[column].fillna('None').astype('category')

    # NOTE(review): the competition's data description gives PassengerId as
    # "gggg_pp" (group, then number within group). If so, the two names
    # below are swapped relative to their contents -- confirm. They are
    # kept as-is so the output column names do not change.
    id_parts = df['PassengerId'].str.split('_', expand=True)
    df['PassengerGroup'] = id_parts[1].astype('int64').astype('category')
    df['PassengerNum'] = id_parts[0].astype('int32')
    df = df.drop('PassengerId', axis=1)

    # get_dummies expands only object/category columns; numeric and boolean
    # columns in cat_features pass through unchanged.
    cat_features = [feature for feature in df.columns if feature not in passthrough_features]
    df = pd.concat([pd.get_dummies(df[cat_features]), df[passthrough_features]], axis=1)

    return df

In [None]:
# Load the raw train and test tables from the Kaggle input directory and
# preview the first training row.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')
train.head(1)

In [None]:
# Engineer features for each split independently, then persist the results.
# NOTE(review): because each split is processed by its own call, the encoded
# columns of the two outputs are not guaranteed to line up -- verify before
# feeding both to the same model.
train = feature_engineering(train)
test = feature_engineering(test)

for output_name, frame in (('train.csv', train), ('test.csv', test)):
    frame.to_csv(output_name, index=False)

# <span style='color:#A80808'>Short EDA</span>

In [None]:
# Summarise the engineered training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Preview the first five rows of the engineered training frame.
train.head(5)

In [None]:
# Preview the first five rows of the engineered test frame.
test.head(5)

In [None]:
# Histogram of every numeric column: train in red, then test in blue.
shared_hist_options = dict(marginal=None, nbins=200, template="plotly_white")
for feature in ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    fig = px.histogram(
        train[feature],
        color_discrete_sequence=['red'],
        title=f'Train {feature}',
        **shared_hist_options,
    )
    fig.show()
    fig = px.histogram(
        test[feature],
        color_discrete_sequence=['blue'],
        title=f'Test {feature}',
        **shared_hist_options,
    )
    fig.show()