In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [5]:
# Read the cleaned Titanic CSV into a DataFrame for ad-hoc inspection.
# NOTE(review): no later cell visible here reads `df` -- DataPreprocessor
# loads the same file on its own; confirm whether this cell is still needed.
df = pd.read_csv(filepath_or_buffer="titanic_cleaned.csv")

# Create the DataPreprocessor class

In [7]:
class DataPreprocessor:
    """Load a CSV file and prepare it for modelling.

    Intended call order: ``load_data`` -> ``handle_missing_values`` ->
    ``encode_categorical`` -> ``scale_features`` -> ``split_data`` /
    ``save_processed_data``. Every step rewrites ``self.df``.

    Args:
        path: Location of the CSV file read by ``load_data``.
        target: Name of the label column. It is excluded from scaling and
            returned as ``y`` by ``split_data``. Defaults to ``'Survived'``.

    Attributes:
        df: The working DataFrame, ``None`` until ``load_data`` is called.
        scaler: The fitted ``StandardScaler``, ``None`` until
            ``scale_features`` is called.
    """

    def __init__(self, path, target='Survived'):
        self.path = path
        self.target = target
        self.df = None
        self.scaler = None

    def _require_data(self):
        """Return ``self.df``, raising a clear error if nothing is loaded."""
        if self.df is None:
            raise RuntimeError(
                "No data loaded: call load_data() before running this step."
            )
        return self.df

    def load_data(self):
        """Read the CSV at ``self.path`` into ``self.df``."""
        self.df = pd.read_csv(self.path)

    def handle_missing_values(self):
        """Fill missing values: median for numeric columns, mode for object columns.

        Object columns that contain no non-missing value are left untouched,
        because they have no mode to fill with.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        self._require_data()

        # NOTE(review): a numeric target column is imputed here together with
        # the features -- confirm that filling labels is intended.
        num_cols = self.df.select_dtypes(include='number').columns
        self.df[num_cols] = self.df[num_cols].fillna(self.df[num_cols].median())

        cat_cols = self.df.select_dtypes(include='object').columns
        for col in cat_cols:
            modes = self.df[col].mode()
            if modes.empty:
                # mode() drops missing values, so an all-missing column yields
                # an empty result and indexing it would raise.
                continue
            self.df[col] = self.df[col].fillna(modes.iloc[0])

    def encode_categorical(self):
        """One-hot encode categorical columns, dropping the first level of each.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        self.df = pd.get_dummies(self._require_data(), drop_first=True)

    def scale_features(self):
        """Standardize every column except the target with ``StandardScaler``.

        The fitted scaler is kept on ``self.scaler`` so it can be reused.

        NOTE(review): the scaler is fitted on all rows of ``self.df``. When
        ``split_data`` is called afterwards, the test rows have already
        contributed to the scaling statistics -- confirm this is acceptable.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        self._require_data()
        scaler = StandardScaler()
        feature_cols = self.df.drop(columns=[self.target]).columns
        self.df[feature_cols] = scaler.fit_transform(self.df[feature_cols])
        self.scaler = scaler

    def split_data(self, test_size=0.2, random_state=42):
        """Split ``self.df`` into train and test sets.

        Args:
            test_size: Passed to ``train_test_split``. Defaults to 0.2.
            random_state: Seed passed to ``train_test_split``. Defaults to 42.

        Returns:
            The result of ``train_test_split(X, y, ...)``, unpacked by callers
            as ``X_train, X_test, y_train, y_test``.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        self._require_data()
        X = self.df.drop(self.target, axis=1)
        y = self.df[self.target]
        return train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    def save_processed_data(self, filename):
        """Write ``self.df`` to ``filename`` as CSV without the index column.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        self._require_data().to_csv(filename, index=False)


In [8]:
# Build the preprocessor for the cleaned Titanic file and run the in-place
# steps in order: load -> impute -> one-hot encode -> scale.
processor = DataPreprocessor(path="titanic_cleaned.csv")
for pipeline_step in (
    processor.load_data,
    processor.handle_missing_values,
    processor.encode_categorical,
    processor.scale_features,
):
    pipeline_step()

# Hold out a test set, then persist the fully processed frame to disk.
(X_train, X_test, y_train, y_test) = processor.split_data()
processor.save_processed_data(filename="titanic_processed.csv")
