In [1]:
# DATA PREPROCESSING PIPELINE

import pandas as pd
import numpy as np
import pickle
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def load_cleaned_data(file_path):
    """Read the cleaned CSV at ``file_path`` and return it as a DataFrame.

    A progress message is printed before the read, and a second message
    reporting the shape of the loaded frame is printed afterwards.
    """
    print("üìÇ Loading cleaned data...")
    frame = pd.read_csv(file_path)
    loaded_shape = frame.shape
    print(f"‚úÖ Data loaded! Shape: {loaded_shape}")
    return frame


def prepare_features(df):
    """Choose which columns of ``df`` will be encoded and which scaled.

    The candidate names are fixed inside the function. Any candidate that
    is not a column of ``df`` is dropped, so both returned lists only name
    columns that exist in the frame (and keep the candidates' order).

    Returns:
        A ``(categorical_features, numerical_features)`` tuple of lists.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("üîç PREPARING FEATURES")
    print(rule)

    available = df.columns

    def _present(candidates):
        # Keep only the candidate names that are real columns of the frame.
        kept = []
        for name in candidates:
            if name in available:
                kept.append(name)
        return kept

    categorical_features = _present(('city', 'cuisine'))
    numerical_features = _present(('rating', 'rating_count', 'cost'))

    print(f"‚úÖ Categorical features: {categorical_features}")
    print(f"‚úÖ Numerical features: {numerical_features}")

    return categorical_features, numerical_features


def encode_categorical_features(df, categorical_features):
    """One-hot encode the selected categorical columns of ``df``.

    Missing values in the selected columns are replaced with the literal
    category ``"Unknown"`` before encoding. The filled values live in a
    separate frame and are not written back, so the caller's ``df`` is
    left unmodified.

    Args:
        df: DataFrame containing every column named in
            ``categorical_features``.
        categorical_features: List of column names to encode.

    Returns:
        ``(encoded_df, encoder)`` -- the encoded columns as a dense
        DataFrame sharing ``df``'s index, and the fitted ``OneHotEncoder``.
    """
    print("\n" + "=" * 50)
    print("üîÑ ONE-HOT ENCODING")
    print("=" * 50)

    # fillna() returns a new frame; keeping it in a local instead of
    # assigning it into ``df`` avoids silently changing the caller's data
    # (and pandas' chained-assignment warnings when ``df`` is a slice).
    categorical = df[categorical_features].fillna("Unknown")

    encoder = OneHotEncoder(
        sparse_output=False,
        handle_unknown="ignore"
    )

    encoded_array = encoder.fit_transform(categorical)
    feature_names = encoder.get_feature_names_out(categorical_features)

    encoded_df = pd.DataFrame(
        encoded_array,
        columns=feature_names,
        index=df.index
    )

    print(f"‚úÖ Encoded columns created: {encoded_df.shape[1]}")
    return encoded_df, encoder


def normalize_numerical_features(df, numerical_features):
    """Scale the selected numerical columns with a freshly fitted StandardScaler.

    Args:
        df: DataFrame containing every column named in
            ``numerical_features``.
        numerical_features: List of column names to scale.

    Returns:
        ``(scaled_df, scaler)`` -- a DataFrame on ``df``'s index whose
        columns are the input names suffixed with ``_scaled``, and the
        fitted scaler.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("üìä NORMALIZING NUMERICAL FEATURES")
    print(divider)

    scaler = StandardScaler()
    numeric_block = df[numerical_features]
    scaled_values = scaler.fit_transform(numeric_block)

    # Output columns mirror the inputs, tagged so they cannot be confused
    # with the raw values.
    scaled_names = []
    for col in numerical_features:
        scaled_names.append("{}_scaled".format(col))

    scaled_df = pd.DataFrame(scaled_values, index=df.index, columns=scaled_names)

    print(f"‚úÖ Normalized features: {numerical_features}")
    return scaled_df, scaler


def combine_features(encoded_df, scaled_df):
    """Join the encoded and scaled frames side by side.

    The two frames are concatenated column-wise (encoded columns first),
    aligned on their indexes, and the combined frame is returned after its
    shape is printed.
    """
    rule = "=" * 50
    print(f"\n{rule}")
    print("üîó COMBINING FEATURES")
    print(rule)

    parts = [encoded_df, scaled_df]
    final_df = pd.concat(parts, axis="columns")

    combined_shape = final_df.shape
    print(f"‚úÖ Final dataset shape: {combined_shape}")
    return final_df


def save_encoder(encoder, scaler, output_path):
    """Pickle the encoder and scaler together into one file.

    The objects are stored as a dict with the keys ``"encoder"`` and
    ``"scaler"``. The parent directory of ``output_path`` is created if it
    does not exist yet.

    Args:
        encoder: Fitted encoder object to persist.
        scaler: Fitted scaler object to persist.
        output_path: Destination file path; may be a bare filename, in
            which case the file is written to the current directory.
    """
    print("\n" + "=" * 50)
    print("üíæ SAVING ENCODER & SCALER")
    print("=" * 50)

    # A bare filename has an empty directory part, and os.makedirs("")
    # raises FileNotFoundError -- only create the parent when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "wb") as f:
        pickle.dump(
            {"encoder": encoder, "scaler": scaler},
            f
        )

    print(f"‚úÖ Saved preprocessing objects to: {output_path}")


def save_encoded_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV without the index column.

    A banner is printed before the write; the destination path and the
    frame's shape are printed after it.
    """
    separator = "=" * 50
    for banner_line in ("\n" + separator, "üíæ SAVING ENCODED DATA", separator):
        print(banner_line)

    df.to_csv(output_path, index=False)

    saved_shape = df.shape
    print(f"‚úÖ Encoded data saved at: {output_path}")
    print(f"üìê Shape: {saved_shape}")


def verify_rows(cleaned_path, encoded_path):
    """Check that the cleaned and encoded CSV files have the same row count.

    Both files are read with ``pd.read_csv`` and their lengths compared.
    The outcome is printed and also returned, so callers can react to a
    mismatch instead of having to read the console output.

    Args:
        cleaned_path: Path to the cleaned CSV file.
        encoded_path: Path to the encoded CSV file.

    Returns:
        ``True`` when the row counts are equal, ``False`` otherwise.
    """
    print("\n" + "=" * 50)
    print("üîç VERIFYING ROW COUNTS")
    print("=" * 50)

    # Only the lengths are needed, so neither frame is kept alive after
    # its rows have been counted.
    cleaned_rows = len(pd.read_csv(cleaned_path))
    encoded_rows = len(pd.read_csv(encoded_path))

    matched = cleaned_rows == encoded_rows
    if matched:
        print(f"‚úÖ Row count matched: {cleaned_rows}")
    else:
        print("‚ùå Row mismatch!")
        print(f"Cleaned: {cleaned_rows}, Encoded: {encoded_rows}")
    return matched


def main(cleaned_file=None, encoded_file=None, encoder_file=None):
    """Run the preprocessing pipeline end to end.

    Steps: load the cleaned CSV, select features, one-hot encode the
    categorical columns, scale the numerical columns, combine both results,
    save the fitted encoder/scaler and the encoded dataset, then verify the
    row counts and print a summary.

    Any exception raised by a step is caught here; a failure banner, the
    error message and the traceback are printed, and the function returns
    normally.

    Args:
        cleaned_file: Path of the cleaned input CSV. Defaults to the
            project's original location when omitted.
        encoded_file: Path the encoded CSV is written to. Defaults to the
            project's original location when omitted.
        encoder_file: Path the pickled encoder/scaler is written to.
            Defaults to the project's original location when omitted.
    """
    import traceback

    print("\n" + "üéØ DATA PREPROCESSING PIPELINE üéØ".center(60))

    # Fall back to the original project paths so a plain ``main()`` call
    # keeps working; pass explicit paths to run on another machine or dataset.
    if cleaned_file is None:
        cleaned_file = r"D:\Py_start\Python\project_SN\Project4\1\swiggy_cleaned_data.csv"
    if encoded_file is None:
        encoded_file = r"D:\Py_start\Python\project_SN\Project4\1\swiggy_encoded_data.csv"
    if encoder_file is None:
        encoder_file = r"D:\Py_start\Python\project_SN\Project4\models\preprocessing.pkl"

    try:
        df = load_cleaned_data(cleaned_file)

        categorical_features, numerical_features = prepare_features(df)

        encoded_df, encoder = encode_categorical_features(df, categorical_features)
        scaled_df, scaler = normalize_numerical_features(df, numerical_features)

        final_df = combine_features(encoded_df, scaled_df)

        save_encoder(encoder, scaler, encoder_file)
        save_encoded_data(final_df, encoded_file)

        verify_rows(cleaned_file, encoded_file)

        print("\n" + "=" * 50)
        print("üéâ PREPROCESSING COMPLETED SUCCESSFULLY üéâ")
        print("=" * 50)

        print(f"""
üìä SUMMARY
---------
Total Restaurants : {final_df.shape[0]}
Final Features    : {final_df.shape[1]}
Categorical Used  : {categorical_features}
Numerical Used    : {numerical_features}
""")

    except Exception as e:
        # Top-level boundary: report the failure without crashing the
        # notebook/script, but keep the traceback so the failing step can
        # be located.
        print("‚ùå PREPROCESSING FAILED")
        print("Error:", e)
        traceback.print_exc()


# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()



              üéØ DATA PREPROCESSING PIPELINE üéØ               
üìÇ Loading cleaned data...
‚úÖ Data loaded! Shape: (148442, 11)

üîç PREPARING FEATURES
‚úÖ Categorical features: ['city', 'cuisine']
‚úÖ Numerical features: ['rating', 'rating_count', 'cost']

üîÑ ONE-HOT ENCODING
‚úÖ Encoded columns created: 2953

üìä NORMALIZING NUMERICAL FEATURES
‚úÖ Normalized features: ['rating', 'rating_count', 'cost']

üîó COMBINING FEATURES
‚úÖ Final dataset shape: (148442, 2956)

üíæ SAVING ENCODER & SCALER
‚úÖ Saved preprocessing objects to: D:\Py_start\Python\project_SN\Project4\models\preprocessing.pkl

üíæ SAVING ENCODED DATA
‚úÖ Encoded data saved at: D:\Py_start\Python\project_SN\Project4\1\swiggy_encoded_data.csv
üìê Shape: (148442, 2956)

üîç VERIFYING ROW COUNTS
‚úÖ Row count matched: 148442

üéâ PREPROCESSING COMPLETED SUCCESSFULLY üéâ

üìä SUMMARY
---------
Total Restaurants : 148442
Final Features    : 2956
Categorical Used  : ['city', 'cuisine']
Numerical Used    : [