<a href="https://colab.research.google.com/github/sm183/CCE_Assignment/blob/main/Src/Preprocessing/Data_Pre_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def _fill_numeric_gaps(series, skew_threshold=0.5):
    """Return *series* with NaNs replaced by its median (if skewed) or mean."""
    if not series.isnull().any():
        return series
    # The median is robust to long tails, so prefer it once the distribution
    # is noticeably asymmetric. A NaN skew (too few observations to compute
    # one) fails the comparison and falls through to the mean.
    if abs(series.skew()) >= skew_threshold:
        fill_value = series.median()
    else:
        fill_value = series.mean()
    return series.fillna(fill_value)


def _fill_categorical_gaps(series):
    """Return *series* with NaNs replaced by its most frequent value."""
    modes = series.mode()
    # An all-missing column has no mode; leave it untouched instead of failing.
    if modes.empty:
        return series
    return series.fillna(modes.iloc[0])


def load_and_preprocess_data(file_path, save_folder, save_filename="preprocessed_data.csv"):
    """
    Load a CSV dataset, clean it, and save the result.

    Steps, in order:
    - Read the CSV, skipping malformed lines.
    - Replace spaces in column names with underscores.
    - If a 'Gender' column exists, fill its missing values with the most
      frequent value and then label-encode it. Imputing first keeps missing
      entries from being encoded as a category of their own.
    - If an 'Age' column exists, replace its values with their absolute values.
    - Fill remaining missing values column by column:
      - numeric columns: median when |skewness| >= 0.5, otherwise mean;
      - all other columns: the most frequent value.
    - Write the result to ``save_folder/save_filename`` (the folder is created
      if needed) without the index.

    Args:
        file_path: Path of the CSV file to read.
        save_folder: Directory the preprocessed CSV is written to.
        save_filename: File name of the preprocessed CSV.

    Returns:
        The preprocessed pandas DataFrame.
    """
    # Load the data
    data = pd.read_csv(file_path, on_bad_lines='skip')

    # Replace spaces in column names with underscores
    data.columns = data.columns.str.replace(' ', '_')

    # Encode 'Gender' only after its gaps are filled. LabelEncoder numbers the
    # labels in sorted order (e.g. 'Female' -> 0, 'Male' -> 1 when those are
    # the only labels present).
    if 'Gender' in data.columns:
        gender = _fill_categorical_gaps(data['Gender'])
        data['Gender'] = LabelEncoder().fit_transform(gender)

    # Ages must be non-negative; taking absolute values first also guarantees
    # that the mean/median used for imputation below is non-negative.
    if 'Age' in data.columns:
        data['Age'] = data['Age'].abs()

    # Impute every column according to its kind. Columns without missing
    # values come back unchanged.
    for column in data.columns:
        if pd.api.types.is_numeric_dtype(data[column]):
            data[column] = _fill_numeric_gaps(data[column])
        else:
            data[column] = _fill_categorical_gaps(data[column])

    # Ensure the save folder exists
    os.makedirs(save_folder, exist_ok=True)

    # Save the preprocessed data to the specified folder
    save_path = os.path.join(save_folder, save_filename)
    data.to_csv(save_path, index=False)

    print(f"Preprocessed data saved to: {save_path}")
    return data


In [None]:
# Input CSV location and the destination directory for the cleaned output
file_path, save_folder = 'advertising_ef.csv', 'Data/Processed_data'

# Run the preprocessing pipeline, then display the resulting DataFrame
preprocessed_data = load_and_preprocess_data(file_path=file_path, save_folder=save_folder)
print(preprocessed_data)