In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

def preprocess_data(file_path, output_path, max_categories=10, scale_encoded=True):
    """Clean a CSV dataset and write the result to a new CSV file.

    Steps, in order:

    1. Load ``file_path`` with ``pd.read_csv``.
    2. Replace missing values in ``float64``/``int64`` columns with the
       column mean.
    3. Label-encode every ``object`` column that has fewer than
       ``max_categories`` distinct values. Missing entries in those columns
       are first replaced with the column's most frequent value.
    4. Standardize the numeric columns with ``StandardScaler``.
    5. Drop duplicate rows.
    6. Write the frame to ``output_path`` without the index.

    Progress messages are printed at each step.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    output_path : str or path-like
        Destination CSV file for the cleaned data.
    max_categories : int, default 10
        An ``object`` column is label-encoded only when its number of
        distinct non-null values is strictly below this threshold.
    scale_encoded : bool, default True
        When True (the historical behaviour), columns produced by the label
        encoder are standardized together with the original numeric columns.
        Pass False to keep the integer category codes unscaled.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame, or ``None`` if any step raised an exception (the
        error message is printed instead of being propagated).
    """
    numeric_dtypes = ['float64', 'int64']

    try:
        # Load the dataset
        data = pd.read_csv(file_path)
        print("Loaded data successfully.")

        # Handling missing values
        print("Handling missing values...")
        # A column with no observed value has no mean; SimpleImputer would
        # drop it and the result could not be assigned back, so such columns
        # are left untouched.
        impute_columns = [
            column
            for column in data.select_dtypes(include=numeric_dtypes).columns
            if data[column].notna().any()
        ]
        if impute_columns:
            imputer = SimpleImputer(strategy='mean')
            data[impute_columns] = imputer.fit_transform(data[impute_columns])

        # Encoding categorical variables
        print("Encoding categorical variables...")
        encoded_columns = []
        for column in data.select_dtypes(include=['object']).columns:
            if data[column].nunique() < max_categories:  # Only encode columns with few categories
                missing = data[column].isna()
                if missing.any():
                    # Fill gaps before encoding so the encoder never sees a
                    # mix of strings and NaN.
                    most_frequent = data[column].mode()
                    if not most_frequent.empty:
                        data[column] = data[column].mask(missing, most_frequent.iloc[0])
                encoder = LabelEncoder()
                data[column] = encoder.fit_transform(data[column])
                encoded_columns.append(column)

        # Scaling numeric variables
        print("Scaling numeric features...")
        # Selected after encoding on purpose: encoded columns are numeric by
        # now and are included unless the caller opted out.
        scale_columns = [
            column
            for column in data.select_dtypes(include=numeric_dtypes).columns
            if data[column].notna().any()
            and (scale_encoded or column not in encoded_columns)
        ]
        if scale_columns:  # StandardScaler rejects a frame with zero columns
            scaler = StandardScaler()
            data[scale_columns] = scaler.fit_transform(data[scale_columns])

        # Removing duplicates
        print("Removing duplicates...")
        data = data.drop_duplicates()

        # Save the cleaned dataset
        print(f"Saving cleaned data to {output_path}...")
        data.to_csv(output_path, index=False)

        print("Data preprocessing and cleaning complete.")
        return data

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with
        # None rather than raising into the caller.
        print(f"Error during preprocessing: {e}")
        return None

In [12]:
# Example usage: clean the shopping trends CSV and save the result to
# cleaned_output.csv; the returned DataFrame is the cell's displayed value.
preprocess_data(
    file_path='/content/shopping_trends.csv',
    output_path='cleaned_output.csv',
)

Loaded data successfully.
Handling missing values...
Encoding categorical variables...
Scaling numeric features...
Removing duplicates...
Saving cleaned data to cleaned_output.csv...
Data preprocessing and cleaning complete.


Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases
0,-1.731607,0.718913,0.685994,Blouse,-0.002002,-0.285629,Kentucky,-1.203814,Gray,1.349198,-0.907584,1.644294,-0.295206,-0.892178,1.151339,1.151339,-0.785831,1.471636,0.012575
1,-1.730719,-1.648629,0.685994,Sweater,-0.002002,0.178852,Maine,-1.203814,Maroon,1.349198,-0.907584,1.644294,-1.472409,-0.892178,1.151339,1.151339,-1.616552,-0.894631,0.012575
2,-1.729830,0.390088,0.685994,Jeans,-0.002002,0.558882,Massachusetts,0.945854,Maroon,-0.441163,-0.907584,1.644294,-0.883807,-0.303032,1.151339,1.151339,-0.162789,-0.303064,1.513849
3,-1.728942,-1.517099,0.685994,Sandals,1.113356,1.276716,Rhode Island,-0.128980,Maroon,-0.441163,-0.349027,1.644294,0.881996,0.286113,1.151339,1.151339,1.637107,0.880069,1.513849
4,-1.728054,0.061263,0.685994,Blouse,-0.002002,-0.454531,Oregon,-0.128980,Turquoise,-0.441163,-1.466141,1.644294,-0.883807,-0.303032,1.151339,1.151339,0.391025,0.880069,-1.488699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,1.728054,-0.267563,-1.457738,Hoodie,-0.002002,-1.341267,Virginia,-1.203814,Turquoise,0.454017,0.628448,-0.608164,-0.883807,-1.481324,-0.868554,-0.868554,0.460252,1.471636,1.513849
3896,1.728942,0.521618,-1.457738,Backpack,-1.117360,-0.454531,Iowa,-1.203814,White,-0.441163,1.047366,-0.608164,0.881996,1.464405,-0.868554,-0.868554,1.083293,-1.486198,-0.988274
3897,1.729830,0.127028,-1.457738,Belt,-1.117360,-1.130139,New Jersey,-1.203814,Green,-0.441163,-1.186862,-0.608164,-0.295206,0.875259,-0.868554,-0.868554,-0.093563,1.471636,1.013424
3898,1.730719,-0.004502,-1.457738,Shoes,1.113356,0.727784,Minnesota,0.945854,Brown,0.454017,0.069891,-0.608164,0.881996,-0.892178,-0.868554,-0.868554,-0.093563,1.471636,1.513849
