In [None]:

#Step 1: Data Cleaning


import pandas as pd
import numpy as np


def load_data(file_path):
    """Read the raw CSV at ``file_path`` into a DataFrame.

    Prints a progress line before reading and the resulting shape afterwards,
    then returns the loaded frame unchanged.
    """
    print("üìÇ Loading data...")
    raw_frame = pd.read_csv(file_path)
    loaded_shape = raw_frame.shape
    print(f"‚úÖ Data loaded successfully! Shape: {loaded_shape}")
    return raw_frame


def explore_data(df):
    """Print a quick overview of ``df``.

    Shows, in order: the first five rows, the ``info()`` summary, descriptive
    statistics for every column, per-column null counts and the number of
    fully duplicated rows. Nothing is returned and ``df`` is not modified.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("üìä DATA EXPLORATION")
    print(rule)

    print("\nüîç First 5 rows:")
    preview = df.head()
    print(preview)

    print("\nüìã Dataset Info:")
    # info() writes straight to stdout and returns None, so it is not printed.
    df.info()

    print("\nüìà Statistical Summary:")
    summary = df.describe(include="all")
    print(summary)

    print("\n‚ùì Missing Values:")
    null_counts = df.isnull().sum()
    print(null_counts)

    print("\nüîÅ Duplicate Rows:")
    duplicate_total = df.duplicated().sum()
    print(f"Number of duplicates: {duplicate_total}")


def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows.

    Prints the row count before and after, plus how many rows were dropped.
    The input frame is left untouched; the original index labels of the
    surviving rows are kept.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("üßπ REMOVING DUPLICATES")
    print(rule)

    rows_before = df.shape[0]
    deduplicated = df.drop_duplicates()
    rows_after = deduplicated.shape[0]
    dropped = rows_before - rows_after

    print(f"Initial rows: {rows_before}")
    print(f"Final rows: {rows_after}")
    print(f"Removed duplicates: {dropped}")

    return deduplicated


def handle_missing_values(df):
    """Fill or drop missing values and return the cleaned copy.

    Steps, applied to a copy of ``df``:
      1. Drop rows lacking any of 'name', 'city' or 'cuisine' (only the
         columns that actually exist are checked).
      2. Fill 'rating' and 'cost' with their column median and
         'rating_count' with 0, each only if the column exists.
      3. Drop every row that still contains a missing value in any column.

    Null counts are printed before and after. The median calls assume the
    column is numeric.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("üîß HANDLING MISSING VALUES")
    print(rule)

    print("\n‚ùì Missing values before handling:")
    print(df.isnull().sum())

    cleaned = df.copy()

    # Rows without these identifying fields are unusable; skip absent columns.
    required = [name for name in ('name', 'city', 'cuisine') if name in cleaned.columns]
    if required:
        cleaned = cleaned.dropna(subset=required)

    # (column, fill-with-median?) in the same order the messages are reported.
    fill_plan = (('rating', True), ('rating_count', False), ('cost', True))
    for column, use_median in fill_plan:
        if column not in cleaned.columns:
            continue
        if use_median:
            filler = cleaned[column].median()
            cleaned[column] = cleaned[column].fillna(filler)
            print(f"‚úÖ Filled '{column}' with median: {filler}")
        else:
            cleaned[column] = cleaned[column].fillna(0)
            print(f"‚úÖ Filled '{column}' with 0")

    # Anything still missing (in any other column) removes the whole row.
    cleaned = cleaned.dropna()

    print("\n‚ùì Missing values after handling:")
    print(cleaned.isnull().sum())

    return cleaned


def save_cleaned_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV (without the index column).

    Prints a banner, the destination and the shape of the saved frame.
    Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("üíæ SAVING CLEANED DATA")
    print(rule)

    df.to_csv(output_path, index=False)
    saved_shape = df.shape
    print(f"‚úÖ Cleaned data saved at: {output_path}")
    print(f"Final dataset shape: {saved_shape}")


def main(input_file=None, output_file=None):
    """Run the cleaning pipeline end to end.

    The stages are: load the CSV, print an exploration report, remove
    duplicate rows, handle missing values, and save the cleaned CSV.

    Args:
        input_file: Path of the raw CSV. When omitted, the original
            hard-coded location is used, so ``main()`` behaves as before.
        output_file: Destination for the cleaned CSV. When omitted, the
            original hard-coded location is used.

    Errors are reported with ``print`` instead of being raised: a missing
    input file gets a dedicated hint, any other exception prints its message.
    """
    print("\n" + "üéØ RESTAURANT DATA CLEANING PIPELINE üéØ".center(60))

    # Machine-specific defaults kept for backward compatibility; pass explicit
    # paths to run the pipeline anywhere else.
    if input_file is None:
        input_file = r"D:\Py_start\Python\project_SN\Project4\1\swiggy.csv"
    if output_file is None:
        output_file = r"D:\Py_start\Python\project_SN\Project4\1\swiggy_cleaned_data.csv"

    try:
        df = load_data(input_file)
        explore_data(df)
        df = remove_duplicates(df)
        df = handle_missing_values(df)
        save_cleaned_data(df, output_file)

        print("\n" + "=" * 50)
        print("üéâ DATA CLEANING COMPLETED SUCCESSFULLY üéâ")
        print("=" * 50)

    except FileNotFoundError:
        print("‚ùå ERROR: CSV file not found.")
        print("Check the input file path carefully.")

    except Exception as e:
        # Deliberate best-effort: keep the notebook cell from crashing, but
        # surface the message so the failure is visible.
        print(f"‚ùå UNEXPECTED ERROR: {e}")

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



           üéØ RESTAURANT DATA CLEANING PIPELINE üéØ            
üìÇ Loading data...
‚úÖ Data loaded successfully! Shape: (148541, 11)

üìä DATA EXPLORATION

üîç First 5 rows:
       id               name    city rating     rating_count   cost  \
0  567335     AB FOODS POINT  Abohar     --  Too Few Ratings  ‚Çπ 200   
1  531342  Janta Sweet House  Abohar    4.4      50+ ratings  ‚Çπ 200   
2  158203  theka coffee desi  Abohar    3.8     100+ ratings  ‚Çπ 100   
3  187912          Singh Hut  Abohar    3.7      20+ ratings  ‚Çπ 250   
4  543530      GRILL MASTERS  Abohar     --  Too Few Ratings  ‚Çπ 250   

                      cuisine          lic_no  \
0            Beverages,Pizzas  22122652000138   
1               Sweets,Bakery  12117201000112   
2                   Beverages  22121652000190   
3            Fast Food,Indian  22119652000167   
4  Italian-American,Fast Food  12122201000053   

                                                link  \
0  https://www.swiggy.com/rest

In [5]:
#Step 1: Data Cleaning


import pandas as pd
import numpy as np
import re


def load_data(file_path):
    """Load the CSV found at ``file_path`` and return it as a DataFrame.

    A progress message is printed before the read and the shape after it.
    """
    print("üìÇ Loading data...")
    loaded = pd.read_csv(file_path)
    shape_loaded = loaded.shape
    print(f"‚úÖ Data loaded successfully! Shape: {shape_loaded}")
    return loaded


def explore_data(df):
    """Print a short report on ``df``: first rows, ``info()`` and null counts.

    Purely informational; returns nothing and leaves ``df`` unchanged.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("üìä DATA EXPLORATION")
    print(divider)

    print("\nüîç First 5 rows:")
    first_rows = df.head()
    print(first_rows)

    print("\nüìã Dataset Info:")
    # info() prints by itself and returns None.
    df.info()

    print("\n‚ùì Missing Values:")
    missing_per_column = df.isnull().sum()
    print(missing_per_column)


def preprocess_columns(df):
    """Convert the text columns 'rating', 'rating_count' and 'cost' to numbers.

    Works on a copy of ``df``; each column is processed only if present.

    * 'rating': parsed with ``pd.to_numeric``; unparseable text (e.g. "--")
      becomes NaN.
    * 'rating_count': the first run of digits is taken as the count, so
      "50+ ratings" -> 50.0. A ``K``/``k`` directly after the digits is
      treated as a thousands multiplier, so "1K+ ratings" -> 1000.0 rather
      than 1.0. Text without digits becomes NaN.
    * 'cost': the currency marker and surrounding whitespace are removed,
      commas are treated as thousands separators, and the remainder is
      parsed with ``pd.to_numeric``; anything unparseable becomes NaN.

    Returns:
        The converted copy of ``df``.
    """
    print("\n" + "=" * 50)
    print("üß™ PREPROCESSING COLUMNS")
    print("=" * 50)

    df = df.copy()

    # ---- Rating ----
    if 'rating' in df.columns:
        df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

    # ---- Rating Count ----
    if 'rating_count' in df.columns:
        # Group 0: the digits. Group 1: an optional K suffix, accepted only
        # when it is not the first letter of a longer word.
        parts = (
            df['rating_count']
            .astype(str)
            .str.extract(r'(\d+)([Kk](?![A-Za-z]))?', expand=True)
        )
        base_count = parts[0].astype(float)
        # Keep the plain value where no suffix matched, otherwise scale it.
        df['rating_count'] = base_count.where(parts[1].isna(), base_count * 1000)

    # ---- Cost ----
    if 'cost' in df.columns:
        cost_text = (
            df['cost']
            .astype(str)
            .str.replace('‚Çπ', '', regex=False)
            .str.replace(',', '', regex=False)  # "1,200" -> "1200"
            .str.strip()
        )
        df['cost'] = pd.to_numeric(cost_text, errors='coerce')

    print("‚úÖ Column preprocessing completed")
    return df


def remove_duplicates(df):
    """Drop fully duplicated rows from ``df`` and report how many were removed.

    Returns the de-duplicated frame; the input is not modified.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("üßπ REMOVING DUPLICATES")
    print(divider)

    unique_rows = df.drop_duplicates()
    removed = df.shape[0] - unique_rows.shape[0]

    print(f"Removed {removed} duplicate rows")
    return unique_rows


def handle_missing_values(df):
    """Drop rows missing essential fields and fill numeric gaps.

    Works on a copy of ``df``:
      * rows lacking 'name', 'city' or 'cuisine' are dropped;
      * 'rating' and 'cost' gaps are filled with the column median;
      * 'rating_count' gaps are filled with 0.

    Every column is handled only if it exists, so a frame without one of
    them is cleaned as far as possible instead of raising ``KeyError``.
    This matches the column guards used in ``preprocess_columns``. The
    median fills assume the columns are already numeric.

    Returns:
        The cleaned copy of ``df``.
    """
    print("\n" + "=" * 50)
    print("üîß HANDLING MISSING VALUES")
    print("=" * 50)

    df = df.copy()

    # Essential columns: only check the ones that are actually present.
    essential_cols = [col for col in ('name', 'city', 'cuisine') if col in df.columns]
    if essential_cols:
        df = df.dropna(subset=essential_cols)

    # Fill numeric columns. The median is computed after the row drop above,
    # so dropped rows do not influence it.
    for col in ('rating', 'cost'):
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    if 'rating_count' in df.columns:
        df['rating_count'] = df['rating_count'].fillna(0)

    print("‚úÖ Missing values handled")
    return df


def save_cleaned_data(df, output_path):
    """Save ``df`` as a CSV at ``output_path`` without writing the index.

    Prints a banner, the destination and the final shape. Returns nothing.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("üíæ SAVING CLEANED DATA")
    print(divider)

    df.to_csv(output_path, index=False)
    final_shape = df.shape
    print(f"‚úÖ File saved successfully at:\n{output_path}")
    print(f"üìê Final Shape: {final_shape}")


def main(input_file=None, output_file=None):
    """Run the Swiggy cleaning pipeline end to end.

    The stages are: load the CSV, print an exploration report, convert text
    columns to numbers, remove duplicate rows, handle missing values, and
    save the cleaned CSV.

    Args:
        input_file: Path of the raw CSV. When omitted, the original
            hard-coded location is used, so ``main()`` behaves as before.
        output_file: Destination for the cleaned CSV. When omitted, the
            original hard-coded location is used.

    Any exception raised by a stage is caught and reported with ``print``
    rather than propagated.
    """
    print("\n" + "üéØ SWIGGY DATA CLEANING PIPELINE üéØ".center(60))

    # Machine-specific defaults kept for backward compatibility; pass explicit
    # paths to run the pipeline anywhere else.
    if input_file is None:
        input_file = r"D:\Py_start\Python\project_SN\Project4\1\swiggy.csv"
    if output_file is None:
        output_file = r"D:\Py_start\Python\project_SN\Project4\1\swiggy_cleaned_data.csv"

    try:
        df = load_data(input_file)
        explore_data(df)
        # Numeric conversion must precede missing-value handling, which takes
        # medians of 'rating' and 'cost'.
        df = preprocess_columns(df)
        df = remove_duplicates(df)
        df = handle_missing_values(df)
        save_cleaned_data(df, output_file)

        print("\nüéâ DATA CLEANING COMPLETED SUCCESSFULLY üéâ")

    except Exception as e:
        # Deliberate best-effort: keep the notebook cell from crashing, but
        # surface the error so the failure is visible.
        print("‚ùå PIPELINE FAILED")
        print("Error:", e)


# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



             üéØ SWIGGY DATA CLEANING PIPELINE üéØ              
üìÇ Loading data...
‚úÖ Data loaded successfully! Shape: (148541, 11)

üìä DATA EXPLORATION

üîç First 5 rows:
       id               name    city rating     rating_count   cost  \
0  567335     AB FOODS POINT  Abohar     --  Too Few Ratings  ‚Çπ 200   
1  531342  Janta Sweet House  Abohar    4.4      50+ ratings  ‚Çπ 200   
2  158203  theka coffee desi  Abohar    3.8     100+ ratings  ‚Çπ 100   
3  187912          Singh Hut  Abohar    3.7      20+ ratings  ‚Çπ 250   
4  543530      GRILL MASTERS  Abohar     --  Too Few Ratings  ‚Çπ 250   

                      cuisine          lic_no  \
0            Beverages,Pizzas  22122652000138   
1               Sweets,Bakery  12117201000112   
2                   Beverages  22121652000190   
3            Fast Food,Indian  22119652000167   
4  Italian-American,Fast Food  12122201000053   

                                                link  \
0  https://www.swiggy.com/rest