# In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from google.colab import drive

# Mount Google Drive at /content/drive so the project files below are reachable.
# force_remount=True is passed so the mount is redone even if one already exists.
drive.mount("/content/drive", force_remount=True)

# Input path: raw CSV that is passed to load_clean_data as its data source.
data_url = "/content/drive/MyDrive/project/1. PakEcomDataSet20k.csv"

# Output path: where load_clean_data writes the cleaned CSV.
output_path = "/content/drive/MyDrive/project/cleaned_data.csv"

def load_clean_data(data_url, output_path):
    """Read a CSV, impute, label-encode and standardize it, then save the result.

    Processing steps, in order:

    1. Numeric columns: missing values are replaced by the column mean.
    2. Object (text) columns: missing values are replaced by the most
       frequent value of the column.
    3. Object columns: values are replaced by integer codes produced by
       ``LabelEncoder``.
    4. Numeric columns (as detected right after loading): standardized with
       ``StandardScaler``. The label-encoded columns are not scaled.

    Columns that contain no observed value at all are left untouched, and a
    step is skipped entirely when there is no column for it to work on.

    Args:
        data_url: Path or URL accepted by ``pandas.read_csv``.
        output_path: Destination handed to ``DataFrame.to_csv``; the file is
            written without the index column.

    Returns:
        None. The cleaned table is only written to ``output_path``.
    """
    df = pd.read_csv(data_url)

    def _columns_with_data(candidates):
        # SimpleImputer ("mean" / "most_frequent") discards columns that have
        # no observed value, which would make the assignment back into the
        # frame fail on a shape mismatch. Keep only columns it can fill.
        return [col for col in candidates if df[col].notna().any()]

    # Column groups are fixed here, before encoding, so that the integer
    # codes created for text columns are never picked up by the scaler.
    numerical_columns = _columns_with_data(
        df.select_dtypes(include=[np.number]).columns
    )
    categorical_columns = _columns_with_data(
        df.select_dtypes(include=["object"]).columns
    )

    # The scikit-learn transformers raise on input with zero features, so
    # every step is guarded by a non-empty check.
    if numerical_columns:
        mean_imputer = SimpleImputer(strategy="mean")
        df[numerical_columns] = mean_imputer.fit_transform(df[numerical_columns])

    if categorical_columns:
        mode_imputer = SimpleImputer(strategy="most_frequent")
        df[categorical_columns] = mode_imputer.fit_transform(
            df[categorical_columns]
        )

    # LabelEncoder is fitted per column; each column gets its own code space.
    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    if numerical_columns:
        scaler = StandardScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # Persist the imputed, encoded and scaled frame.
    df.to_csv(output_path, index=False)


if __name__ == "__main__":
    # Run the cleaning pipeline on the configured paths, then report where
    # the cleaned CSV was written.
    load_clean_data(data_url=data_url, output_path=output_path)
    print(f"Cleaned data saved to: {output_path}")