In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import hashlib

# Project paths
# The data file and the reports folder are both derived from PROJECT_ROOT.
PROJECT_ROOT = r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace'
_DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
DATA_PATH = os.path.join(_DATA_DIR, 'cars_data_clean.csv')
REPORTS_PATH = os.path.join(PROJECT_ROOT, 'reports')
# Make sure the reports folder exists; an already-existing folder is fine.
os.makedirs(REPORTS_PATH, exist_ok=True)

# Fixed year that car_age is measured against.
REFERENCE_YEAR = 2025

# Load dataset, then normalise the header: trim surrounding whitespace from
# each column name and lower-case it.
df = pd.read_csv(DATA_PATH, low_memory=False)
trimmed_columns = df.columns.str.strip()
df.columns = trimmed_columns.str.lower()

# Compute car_age as REFERENCE_YEAR minus the 'myear' column
# (presumably the model/manufacture year -- confirm against the data source).
df['car_age'] = REFERENCE_YEAR - df['myear']
print(f'Dataset shape: {df.shape}')

# Metadata report
# Snapshot of the frame's dimensions, its column names and the number of
# null cells per column, stored as indented JSON in the reports folder.
missing_per_column = df.isnull().sum()
metadata = {
    "shape": df.shape,
    "columns": df.columns.tolist(),
    "missing_counts": missing_per_column.to_dict(),
}
metadata_file = os.path.join(REPORTS_PATH, 'eda_metadata.json')
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)

# Dataset hash
# MD5 of the raw CSV bytes, used here as a content checksum for the report.
# The file is fed to the hasher in 1 MiB chunks so the whole CSV never has to
# sit in memory at once; the digest is the same as hashing it in one call.
_md5 = hashlib.md5()
with open(DATA_PATH, "rb") as f:
    # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
    for chunk in iter(lambda: f.read(1024 * 1024), b""):
        _md5.update(chunk)
file_hash = _md5.hexdigest()
print("Dataset MD5:", file_hash)

# Missing values heatmap
# Plots the boolean null-mask of the whole frame and saves it as a PNG.
fig, ax = plt.subplots(figsize=(12, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Overview")
fig.savefig(os.path.join(REPORTS_PATH, 'missing_values_heatmap.png'))
# Close the figure once it is on disk so it does not linger in pyplot's state.
plt.close(fig)

# 1. Price distribution (log scale due to skew)
# 50-bin histogram of listed_price drawn on a logarithmic value axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['listed_price'], bins=50, log_scale=True, ax=ax)
ax.set_title('Distribution of Listed Price (Log Scale)')
ax.set_xlabel('Listed Price (₹, log scale)')
ax.set_ylabel('Count')
fig.savefig(os.path.join(REPORTS_PATH, 'price_distribution.png'))
plt.close(fig)

# 2. Kilometers driven vs. Price (scatter)
# Semi-transparent points; the price axis is switched to a log scale.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='km', y='listed_price', alpha=0.5, ax=ax)
ax.set_title('Kilometers Driven vs. Listed Price')
ax.set_xlabel('Kilometers Driven')
ax.set_ylabel('Listed Price (₹)')
ax.set_yscale('log')
fig.savefig(os.path.join(REPORTS_PATH, 'km_vs_price.png'))
plt.close(fig)

# 3. Car age vs. Price (scatter)
# Same layout as the km plot, with the computed car_age on the x axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='car_age', y='listed_price', alpha=0.5, ax=ax)
ax.set_title('Car Age vs. Listed Price')
ax.set_xlabel('Car Age (Years)')
ax.set_ylabel('Listed Price (₹)')
ax.set_yscale('log')
fig.savefig(os.path.join(REPORTS_PATH, 'car_age_vs_price.png'))
plt.close(fig)

# 4. Fuel type distribution
# Bars follow the order of value_counts(), i.e. most frequent fuel type first.
fuel_order = df['fuel'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='fuel', order=fuel_order, ax=ax)
ax.set_title('Fuel Type Distribution')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Count')
# Tilt the category labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig(os.path.join(REPORTS_PATH, 'fuel_distribution.png'))
plt.close(fig)

# 5. Transmission type distribution
# Row count per transmission value; no explicit bar order is requested.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='transmission', ax=ax)
ax.set_title('Transmission Type Distribution')
ax.set_xlabel('Transmission')
ax.set_ylabel('Count')
fig.savefig(os.path.join(REPORTS_PATH, 'transmission_distribution.png'))
plt.close(fig)

# 6. Price by city (top 10 cities)
# "Top" means the ten city values with the most rows in the dataset.
city_counts = df['city'].value_counts()
top_cities = city_counts.head(10).index
in_top_cities = df['city'].isin(top_cities)

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df.loc[in_top_cities], x='city', y='listed_price', ax=ax)
ax.set_title('Listed Price by Top 10 Cities')
ax.set_xlabel('City')
ax.set_ylabel('Listed Price (₹)')
ax.set_yscale('log')
# Tilt the city labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig(os.path.join(REPORTS_PATH, 'price_by_city.png'))
plt.close(fig)

# Basic statistics
# describe() output for the two raw numeric columns plus the computed car_age.
key_numeric_columns = ['listed_price', 'km', 'car_age']
numeric_summary = df[key_numeric_columns].describe()
print('\nSummary statistics for key numerical columns:')
print(numeric_summary)

# Save cleaned dataset with car_age
# Writes the frame (normalised column names, added car_age) next to the
# input data, without the index column.
cleaned_csv_path = os.path.join(PROJECT_ROOT, 'data', 'cleaned_with_age.csv')
df.to_csv(cleaned_csv_path, index=False)
print('Cleaned dataset with car_age saved to data/cleaned_with_age.csv')

# Write README summary
# Renders a short markdown recap of this EDA run (row/column counts and the
# dataset checksum are interpolated) and writes it to README.md in the
# project root.
# NOTE: the file is opened in 'w' mode, so any existing README.md content is
# replaced by this summary rather than appended to.
readme_content = f"""
# EDA Summary
- Dataset: CarDekho used cars ({df.shape[0]} rows, {df.shape[1]} columns).
- Key features: listed_price, km, myear, car_age (computed), fuel, transmission, city.
- Price is right-skewed; log transformation recommended.
- Visuals saved in reports/: price distribution, km vs. price, car_age vs. price, fuel, transmission, and price by city.
- Missing values summary stored in reports/eda_metadata.json and heatmap.
- Dataset MD5 checksum: {file_hash}
- Next steps: Clean missing values, encode categoricals, and train baseline model.
"""
readme_path = os.path.join(PROJECT_ROOT, 'README.md')
# Explicit UTF-8: without it the platform's locale encoding is used, and a
# legacy Windows code page would raise UnicodeEncodeError as soon as the
# summary contains a character it cannot represent (e.g. the rupee sign).
with open(readme_path, 'w', encoding='utf-8') as f:
    f.write(readme_content)
print('README.md updated with EDA summary.')

Dataset shape: (37813, 67)
Dataset MD5: d2c9aa9c030052df712388d301382f49

Summary statistics for key numerical columns:
       listed_price            km       car_age
count  3.781300e+04  3.781300e+04  37813.000000
mean   7.999868e+05  6.240930e+04      9.430487
std    3.043045e+06  5.847216e+04      3.779089
min    1.196300e+04  1.010000e+02      2.000000
25%    3.200000e+05  3.173900e+04      7.000000
50%    5.250000e+05  5.672600e+04      9.000000
75%    8.552720e+05  8.331000e+04     12.000000
max    5.500006e+08  6.300000e+06     42.000000
Cleaned dataset with car_age saved to data/cleaned_with_age.csv
README.md updated with EDA summary.
