# Preprocessing the data

## Importing the packages

In [1]:
import os
import sys

import pandas as pd

# Add the src directory to the Python path so the project modules below resolve
sys.path.append(os.path.abspath('../src'))

from data_prepocessing import split_data, preprocess_data, normalize_data
from utils import save_dataframe_as_csv, set_pandas_display_options, load_data

2024-08-10 14:24:45.681982: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Loading the dataset

In [2]:
# Load the raw Titanic CSV through the project's load_data helper.
# `df` is the working frame that later cells read and reassign.
dataset_path = '../datasets/raw/titanic.csv'
df = load_data(dataset_path)


In [3]:
# Columns kept for this notebook: the 'survived' column plus the passenger
# attributes listed below. Every other column of the loaded frame is dropped.
selected_feature_columns = [
    'survived',
    'passenger_class',
    'sex',
    'sibling_spouse',
    'parent_children',
    'fare',
    'cabin_number',
    'embarked_port',
]

# Narrow the working frame to those columns and show what remains.
df = df[selected_feature_columns]
df.columns

Index(['survived', 'passenger_class', 'sex', 'sibling_spouse',
       'parent_children', 'fare', 'cabin_number', 'embarked_port'],
      dtype='object')

## Initializing the variables

In [4]:
# Split fractions handed to split_data.
# NOTE(review): whether val_size is taken from the full dataset or from the
# remaining training portion depends on split_data -- confirm.
test_size = 0.2
val_size = 0.2

# Name of the scaling strategy passed to normalize_data.
normaliztion_method = 'robust'

# Columns to scale. This cell used to assign
# ['fare', 'sibling_spouse', 'parent_children'] first and then immediately
# overwrite it; only the effective value is kept. Add 'fare' and
# 'parent_children' here if they should be scaled as well.
normalization_columns = ['sibling_spouse']

In [5]:
# Apply the project's pandas display settings (helper imported from utils).
# Per the original note, this sets the maximum number of printed columns.
set_pandas_display_options()

In [6]:
# Inspect the frame before any cleaning: print its shape, then display the
# number of missing values per column. Nothing is removed in this cell.
print(df.shape)
null_counts = df.isna().sum()
null_counts


(891, 8)


survived             0
passenger_class      0
sex                  0
sibling_spouse       0
parent_children      0
fare                 0
cabin_number       687
embarked_port        2
dtype: int64

In [7]:
# Scale the configured columns. The returned frame is assigned back to `df`
# so the scaled values are used downstream whether or not normalize_data also
# modifies its argument in place (its implementation is not visible here).
# NOTE(review): this runs before the train/val/test split; if the scaler is
# fit on the whole frame, test rows influence the scaling statistics --
# confirm that is acceptable.
df = normalize_data(df, normaliztion_method, normalization_columns)
df

Unnamed: 0,survived,passenger_class,sex,sibling_spouse,parent_children,fare,cabin_number,embarked_port
0,0,3,male,1.0,0,7.2500,,S
1,1,1,female,1.0,0,71.2833,C85,C
2,1,3,female,0.0,0,7.9250,,S
3,1,1,female,1.0,0,53.1000,C123,S
4,0,3,male,0.0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,0.0,0,13.0000,,S
887,1,1,female,0.0,0,30.0000,B42,S
888,0,3,female,1.0,2,23.4500,,S
889,1,1,male,0.0,0,30.0000,C148,C


In [8]:
# One-hot encode the 'sex' column into 'sex_female' and 'sex_male'.
# drop_first=False keeps both indicator columns.
df = pd.get_dummies(df, columns=['sex'], drop_first=False)
df.columns

Index(['survived', 'passenger_class', 'sibling_spouse', 'parent_children', 'fare', 'cabin_number', 'embarked_port', 'sex_female', 'sex_male'], dtype='object')

In [9]:
# Model inputs: four passenger attributes plus the two one-hot 'sex' indicators.
feature_columns = ['passenger_class', 'sibling_spouse', 'parent_children', 'fare', 'sex_male', 'sex_female']

# Prediction target.
label_columns = 'survived'

# Shuffle the rows before splitting. A fixed seed makes this shuffle
# reproducible when the notebook is re-run from a fresh kernel.
shuffle_seed = 42
df = df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

# Split into train / validation / test features and labels.
# NOTE(review): the x_* names are unpacked as (train, test, val) but the y_*
# names as (train, val, test). Unless split_data returns exactly this order,
# the features or the labels of the validation and test sets are swapped --
# confirm against split_data's return statement.
x_train, x_test, x_val, y_train, y_val, y_test = split_data(
    df, feature_columns, label_columns, test_size=test_size, val_size=val_size
)




## Saving the train, validation, and test splits (x_train, y_train, x_val, y_val, x_test, y_test)

In [10]:

# Write each split to its own CSV under ../datasets/ready/<split>/.
# The pairs are saved in the order listed: train, then val, then test.
split_outputs = [
    (x_train, "../datasets/ready/train/x_train.csv"),
    (y_train, "../datasets/ready/train/y_train.csv"),
    (x_val, "../datasets/ready/val/x_val.csv"),
    (y_val, "../datasets/ready/val/y_val.csv"),
    (x_test, "../datasets/ready/test/x_test.csv"),
    (y_test, "../datasets/ready/test/y_test.csv"),
]
for split_frame, csv_path in split_outputs:
    save_dataframe_as_csv(split_frame, csv_path)