# In [7]:
import pandas as pd
import os
from datetime import datetime
import csv

def detect_delimiter(file_path):
    """Guess the field delimiter of a delimited text file.

    Reads at most the first 1024 characters of the file and hands that
    sample to ``csv.Sniffer``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The delimiter reported by the sniffer, or ``','`` when the sniffer
        cannot determine one (for example, for an empty file).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode as UTF-8 (the default encoding of pandas.read_csv) and replace
    # undecodable bytes, so a stray byte in the sample cannot abort detection.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        sample = file.read(1024)

    try:
        return csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        # The sniffer raises csv.Error when it cannot infer a delimiter;
        # fall back to the conventional CSV separator instead of crashing.
        return ','

def generate_metadata(file_path):
    """Build a metadata summary for a delimited text file.

    The delimiter is detected with ``detect_delimiter`` and the file is then
    loaded with ``pandas.read_csv`` using that delimiter.

    Args:
        file_path: Path of the file to describe.

    Returns:
        The string ``"File not found."`` when ``file_path`` does not exist.
        Otherwise a dict with two nested dicts:

        * ``"Descriptive Metadata"``: file name, size in KB, row and column
          counts, column names, the ``os.path.getctime`` and
          ``os.path.getmtime`` timestamps formatted as
          ``YYYY-MM-DD HH:MM:SS``, and a one-sentence description.
        * ``"Operational Metadata"``: per-column dtypes, null counts and
          distinct-value counts, plus the detected delimiter.
    """
    # Guard clause: a missing path is reported as a message, not an exception.
    if not os.path.exists(file_path):
        return "File not found."

    delimiter = detect_delimiter(file_path)
    df = pd.read_csv(file_path, delimiter=delimiter)
    row_count, column_count = df.shape

    # Filesystem timestamps are epoch seconds; render them in local time.
    stamp_format = '%Y-%m-%d %H:%M:%S'
    created_at = datetime.fromtimestamp(os.path.getctime(file_path))
    modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))

    size_in_kb = os.path.getsize(file_path) / 1024

    return {
        "Descriptive Metadata": {
            "File Name": os.path.basename(file_path),
            "File Size": f"{size_in_kb:.2f} KB",
            "Number of Rows": row_count,
            "Number of Columns": column_count,
            "Column Names": list(df.columns),
            "Date Created": created_at.strftime(stamp_format),
            "Date Modified": modified_at.strftime(stamp_format),
            "Description": f"The dataset contains {row_count} rows and {column_count} columns.",
        },
        "Operational Metadata": {
            "Column Data Types": df.dtypes.to_dict(),
            "Missing Values Count": df.isnull().sum().to_dict(),
            "Unique Values Count": df.nunique().to_dict(),
            "File Delimiter": delimiter,
        },
    }

# Example usage: build the metadata for one sample CSV and print the result.
# The path below is a hard-coded absolute Windows path; when it does not exist,
# generate_metadata returns the string "File not found." instead of a dict,
# and that string is what gets printed.
file_path = r'c:\Users\66023\OneDrive - Bain\Desktop\rishabhpanda\streamliner_beta\test_data\dataset_small.csv'
metadata = generate_metadata(file_path)
print(metadata)

{'Descriptive Metadata': {'File Name': 'dataset_small.csv', 'File Size': '0.73 KB', 'Number of Rows': 9, 'Number of Columns': 10, 'Column Names': ['Transaction_ID', 'Customer_ID', 'Total_Purchases', 'Amount', 'Total_Amount', 'Product_Category', 'Product_Type', 'Products', 'Feedback', 'Ratings'], 'Date Created': '2024-08-20 00:29:22', 'Date Modified': '2024-08-20 00:29:22', 'Description': 'The dataset contains 9 rows and 10 columns.'}, 'Operational Metadata': {'Column Data Types': {'Transaction_ID': dtype('int64'), 'Customer_ID': dtype('int64'), 'Total_Purchases': dtype('int64'), 'Amount': dtype('float64'), 'Total_Amount': dtype('float64'), 'Product_Category': dtype('O'), 'Product_Type': dtype('O'), 'Products': dtype('O'), 'Feedback': dtype('O'), 'Ratings': dtype('int64')}, 'Missing Values Count': {'Transaction_ID': 0, 'Customer_ID': 0, 'Total_Purchases': 0, 'Amount': 0, 'Total_Amount': 0, 'Product_Category': 0, 'Product_Type': 0, 'Products': 0, 'Feedback': 0, 'Ratings': 0}, 'Unique Val