In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def extract_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    return pd.read_csv(filepath_or_buffer=file_path)

def _to_numeric_column(series, replacements):
    """Strip formatting text from a column and convert it to numbers.

    Args:
        series: Column to convert. Columns that are already numeric skip the
            string cleaning, because the ``.str`` accessor only exists on
            text columns.
        replacements: Iterable of ``(old, new)`` literal substitutions that
            are applied in order before the conversion.

    Returns:
        A numeric Series; values that cannot be parsed become NaN.
    """
    if not pd.api.types.is_numeric_dtype(series):
        for old, new in replacements:
            # regex=False: the markers are literal text, not patterns.
            series = series.str.replace(old, new, regex=False)
    # Coerce instead of raising: the caller drops NaN rows right afterwards.
    return pd.to_numeric(series, errors='coerce')


def transform(df):
    """Clean the price and rating columns and drop incomplete rows.

    Steps:
      1. Rename the 'cost' column to 'price' (a no-op if it is absent).
      2. Add 'cost_num': 'price' with '₹' and ',' removed, as a number.
      3. Add 'rate_num': 'rating' with '--' replaced by '0' and ','
         removed, as a number.
      4. Drop every row that contains a missing value in any column.
      5. Print the number of dropped rows and the mean of both new columns.

    Args:
        df: DataFrame with a 'cost' (or 'price') column and a 'rating'
            column. It is not modified; the work is done on a copy.

    Returns:
        A new DataFrame with the 'cost_num' and 'rate_num' columns added
        and incomplete rows removed. The original row index is preserved.

    Raises:
        KeyError: If the 'price'/'cost' or 'rating' column is missing.
    """
    # rename() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.rename(columns={'cost': 'price'})

    df['cost_num'] = _to_numeric_column(df['price'], [('₹', ''), (',', '')])
    df['rate_num'] = _to_numeric_column(df['rating'], [('--', '0'), (',', '')])

    rows_before = len(df)

    # Drop rows with any empty cell (including values that failed to parse).
    df = df.dropna()

    rows_deleted = rows_before - len(df)
    print(f"Rows deleted: {rows_deleted}")

    # Calculate and print the mean price
    average = df['cost_num'].mean()
    print("Mean price: ", average)

    # Calculate and print the mean rating
    rate_average = df['rate_num'].mean()
    print("Average rating is : ", rate_average)

    return df

def load_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV without the index column.

    Prints a confirmation line naming the destination after the write.
    """
    df.to_csv(path_or_buf=output_path, index=False)
    confirmation = f"Data saved to {output_path}"
    print(confirmation)

def etl(input_pattern, output_file):
    """Run the ETL process: extract, transform, and load.

    Every file matching ``input_pattern`` is read with ``extract_data`` and
    cleaned with ``transform``; the results are concatenated into one
    DataFrame (with a fresh index) and written with ``load_data``.

    Args:
        input_pattern: Glob pattern selecting the input CSV files.
        output_file: Destination for the combined CSV. If it is a path that
            also matches ``input_pattern``, it is excluded from the inputs
            so that a re-run does not ingest its own previous output.

    Raises:
        ValueError: If no input file matches ``input_pattern``.
    """
    # Only path-like outputs can collide with the glob results.
    if isinstance(output_file, (str, os.PathLike)):
        output_abs = os.path.abspath(output_file)
    else:
        output_abs = None

    # glob returns matches in arbitrary order; sort so the combined output
    # has the same row order on every run.
    csv_files = sorted(
        path
        for path in glob.glob(input_pattern)
        if os.path.abspath(path) != output_abs
    )
    print(f"Found files: {csv_files}")

    if not csv_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No input files match pattern: {input_pattern!r}")

    # Extract and transform each file
    data_frames = [transform(extract_data(path)) for path in csv_files]

    # Concatenate all DataFrames
    combined_df = pd.concat(data_frames, ignore_index=True)

    # Load data to the output file
    load_data(combined_df, output_file)

# Run the pipeline over the restaurant CSV files under /content
# NOTE(review): output_path itself matches input_pattern.
input_pattern = "/content/restaurant*.csv"
output_path = "/content/restaurant1_transformed.csv"

etl(input_pattern, output_path)

# Read the combined, transformed output back in for plotting
data = pd.read_csv(output_path)

# Plot the distribution of the numeric price column as a 30-bin histogram
plt.hist(x=data['cost_num'], bins=30, edgecolor='k')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.title('Histogram of Prices')
plt.show()