The features for this notebook have been taken from here: https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-cv



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler



In [None]:
def label_encoder(c):
    """Return integer class codes for the values in ``c``.

    A new ``LabelEncoder`` is fitted on ``c`` on every call and the result of
    its ``fit_transform`` is returned; the fitted encoder itself is discarded,
    so the mapping cannot be inverted afterwards.
    """
    return LabelEncoder().fit_transform(c)


In [None]:
# Read the three competition files in a fixed order: train, test, submission.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

# Stack train on top of test so the same preprocessing is applied to both.
# Each frame keeps its own row labels (ignore_index is left at its default),
# so the combined frame is later split back by position, not by label.
all_df = pd.concat([train_df, test_df], axis=0)

In [None]:
def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of a multi-token ticket.

    Tickets consisting of a single token (or none) are mapped to 'X'.
    """
    tokens = str(ticket).split()
    return tokens[0] if len(tokens) > 1 else 'X'


# Age: fill missing values with the mean age of the row's Pclass
mean_age_by_class = all_df.groupby('Pclass')['Age'].mean()
all_df['Age'] = all_df['Age'].fillna(all_df['Pclass'].map(mean_age_by_class))

# Cabin: missing -> 'X', then keep only the first character
all_df['Cabin'] = all_df['Cabin'].fillna('X').map(lambda cabin: cabin[0].strip())

# Ticket: missing -> 'X'; keep the leading token of multi-token tickets,
# and collapse single-token tickets to 'X'
all_df['Ticket'] = all_df['Ticket'].fillna('X').map(_ticket_prefix)

# Fare: fill missing values with the overall mean fare
all_df['Fare'] = all_df['Fare'].fillna(all_df['Fare'].mean())

# Embarked: missing -> 'X'
all_df['Embarked'] = all_df['Embarked'].fillna('X')

# Name: keep only the text before the first comma (the surname part)
all_df['Name'] = all_df['Name'].map(lambda full_name: full_name.partition(',')[0])

In [None]:
# Columns turned into integer codes via label_encoder.
label_cols = [
    'Name',
    'Ticket',
]

# Columns handed to pd.get_dummies.
onehot_cols = [
    'Pclass',
    'Sex',
    'Cabin',
    'Embarked',
]

# Columns passed through unchanged; 'Survived' is the target and is carried
# along here so it stays attached to the rows until the train/test re-split.
numerical_cols = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Survived',
]

In [None]:
# One-hot encode every column named in onehot_cols.
# `columns=` is passed explicitly because, without it, get_dummies only
# expands object/string/category columns and passes numeric columns through
# untouched -- so a numeric Pclass (assumed integer-coded in the CSV; confirm)
# would silently stay a single column instead of becoming indicators.
# `dtype` is pinned so the indicators are 0/1 integers on every pandas
# version (the default changed from uint8 to bool in pandas 2.0, which would
# otherwise write True/False into the exported CSVs).
onehot_encoded_df = pd.get_dummies(
    all_df[onehot_cols],
    columns=onehot_cols,
    dtype=np.uint8,
)

# Integer-encode each label column independently (one encoder per column).
label_encoded_df = all_df[label_cols].apply(label_encoder)

# Numerical columns (including the Survived target) are used as-is.
numerical_df = all_df[numerical_cols]

# All three pieces derive from all_df and share its row index, so a
# column-wise concat lines them up row for row.
all_df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df], axis=1)

In [None]:
# Split the combined frame back into train and test by position: the first
# len(train_df) rows are the training rows, everything after is test.
n_train = train_df.shape[0]
train = all_df.iloc[:n_train]

# Test rows carry no usable target, so the Survived column is removed.
test = all_df.iloc[n_train:].drop(columns='Survived')

In [None]:
# Preview the first rows of the processed training frame (notebook display).
train.head()

In [None]:
# Preview the first rows of the processed test frame (notebook display).
test.head()

In [None]:
# Write the processed frames to the working directory. The row index is
# written too (as the leading column), matching the to_csv default.
train.to_csv('train.csv', index=True)
test.to_csv('test.csv', index=True)