# Preprocessing the data

## Importing the packages

In [9]:
import sys
import os
%matplotlib inline
# Add the src directory to the Python path
sys.path.append(os.path.abspath('../src'))
from data_prepocessing import  split_data, preprocess_data, normalize_data
from utils import save_dataframe_as_csv, set_pandas_display_options, load_data

## Loading the dataset

In [10]:
#loading the dataset
dataset_path = '../datasets/raw/titanic.csv'
df = load_data(dataset_path)
df.columns

Index(['passenger_id', 'survived', 'p_class', 'name', 'sex', 'age', 'sib_sp', 'parch', 'ticket', 'fare', 'cabin', 'embarked'], dtype='object')

## Initializing the variables

In [11]:
test_size = 0.2
val_size = 0.2
features = ['survived', 'p_class', 'sex', 'age', 'sib_sp', 'parch', 'fare']
normaliztion_method =  'z_score'
normalization_columns = ['p_class','age', 'fare']

In [12]:
#setting the maximum number of printing columns 
set_pandas_display_options()

In [13]:
#preprocessing the data that includes removing duplicate and null values
print(df.shape)
df = df[features]
df = preprocess_data(df)
# print(df.shape)
df.head

(891, 12)


<bound method NDFrame.head of      survived  p_class     sex   age  sib_sp  parch     fare
0           0        3    male  22.0       1      0   7.2500
1           1        1  female  38.0       1      0  71.2833
2           1        3  female  26.0       0      0   7.9250
3           1        1  female  35.0       1      0  53.1000
4           0        3    male  35.0       0      0   8.0500
..        ...      ...     ...   ...     ...    ...      ...
885         0        3  female  39.0       0      5  29.1250
886         0        2    male  27.0       0      0  13.0000
887         1        1  female  19.0       0      0  30.0000
889         1        1    male  26.0       0      0  30.0000
890         0        3    male  32.0       0      0   7.7500

[714 rows x 7 columns]>

In [14]:
normalize_data(df, normaliztion_method, normalization_columns)

Unnamed: 0,survived,p_class,sex,age,sib_sp,parch,fare
0,0,0.910594,male,-0.530005,1,0,-0.518614
1,1,-1.475329,female,0.571430,1,0,0.691412
2,1,0.910594,female,-0.254646,0,0,-0.505859
3,1,-1.475329,female,0.364911,1,0,0.347805
4,0,0.910594,male,0.364911,0,0,-0.503497
...,...,...,...,...,...,...,...
885,0,0.910594,female,0.640270,0,5,-0.105246
886,0,-0.282368,male,-0.185807,0,0,-0.409958
887,1,-1.475329,female,-0.736524,0,0,-0.088711
889,1,-1.475329,male,-0.254646,0,0,-0.088711


In [15]:
# One-hot encode 'Sex' column
import pandas as pd
df = pd.get_dummies(df, columns=['sex'], drop_first=False)
df.columns

Index(['survived', 'p_class', 'age', 'sib_sp', 'parch', 'fare', 'sex_female', 'sex_male'], dtype='object')

In [16]:
#selecting the feature columns 
feature_columns = ['survived', 'p_class', 'sex_male','sex_female', 'age', 'sib_sp', 'parch', 'fare']

#selecting the label column
label_columns = 'survived'

#shuffleing the data before splitting
df = df.sample(frac=1).reset_index(drop=True)

#spiliting the data into x_train, y_train, x_test, y_test
x_train, x_test, x_val, y_train, y_val, y_test = split_data(df, feature_columns, label_columns, test_size=test_size, val_size=val_size)


## Saving the x_train, y_train, x_val, y_val, x_test, y_test

In [17]:

save_dataframe_as_csv(x_train, "../datasets/ready/train/x_train.csv")
save_dataframe_as_csv(y_train, "../datasets/ready/train/y_train.csv")
save_dataframe_as_csv(x_val, "../datasets/ready/val/x_val.csv")
save_dataframe_as_csv(y_val, "../datasets/ready/val/y_val.csv")
save_dataframe_as_csv(x_test, "../datasets/ready/test/x_test.csv")
save_dataframe_as_csv(y_test, "../datasets/ready/test/y_test.csv")
