In [1]:
import os
import pandas as pd

# Project root path.
# Defaults to the original author's local checkout. Set the
# PRICE_ANOMALY_PROJECT_ROOT environment variable to run this notebook on
# another machine without editing the cell (an empty value falls back to the
# default).
PROJECT_ROOT = os.environ.get('PRICE_ANOMALY_PROJECT_ROOT') or (
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace'
)
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

# Create folder structure (exist_ok=True makes re-running the cell a no-op).
folders = ['data', 'notebooks', 'src', 'app', 'reports']
for folder in folders:
    os.makedirs(os.path.join(PROJECT_ROOT, folder), exist_ok=True)
print('Folder structure created.')

# The dataset files are expected to have been downloaded manually and placed
# in data/. Warn about every file that is absent so the user sees the full
# list of what still needs to be downloaded.
data_files = ['cars_data_clean.csv', 'cars_details_merges.csv', 'feature_dictionary.csv']
for file in data_files:
    if not os.path.exists(os.path.join(DATA_DIR, file)):
        print(f'Warning: {file} not found in data/. Please download from Kaggle and place it there.')

# Load and verify main dataset (cars_data_clean.csv).
main_csv = os.path.join(DATA_DIR, 'cars_data_clean.csv')
if not os.path.isfile(main_csv):
    # Only the main dataset is required by this cell; fail here with the full
    # path instead of letting read_csv surface a bare errno message.
    raise FileNotFoundError(
        f'Main dataset not found at {main_csv}. '
        'Please download it from Kaggle and place it in data/.'
    )
df = pd.read_csv(main_csv, low_memory=False)
# Normalize column names (trim whitespace, lower-case) so the required-column
# check below does not depend on header casing or stray spaces.
df.columns = df.columns.str.strip().str.lower()

# Verify key columns
required_columns = ['listed_price', 'km', 'myear']
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    raise ValueError(f'Missing required columns: {missing_cols}')
print(f'All required columns found. Shape: {df.shape}')
print(df.head())  # Quick preview
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()  # Data types and missing values

# Save a sample for quick tests
sample_path = os.path.join(DATA_DIR, 'sample_head.csv')
df.head(100).to_csv(sample_path, index=False)
print('Sample saved for testing.')

Folder structure created.
All required columns found. Shape: (37813, 66)
                           usedcarskuid            loc  myear       body  \
0  7111bf25-97af-47f9-867b-40879190d800    gomti nagar   2016  hatchback   
1  c309efc1-efaf-4f82-81ad-dcb38eb36665  borivali west   2015  hatchback   
2  7609f710-0c97-4f00-9a47-9b9284b62d3a         jasola   2015      sedan   
3  278b76e3-5539-4a5e-ae3e-353a2e3b6d7d         jasola   2013  hatchback   
4  b1eab99b-a606-48dd-a75b-57feb8a9ad92  mumbai g.p.o.   2022        muv   

  transmission fuel        km  ip  \
0       manual  cng   69162.0   0   
1       manual  cng   45864.0   0   
2       manual  cng   81506.0   0   
3       manual  cng  115893.0   0   
4       manual  cng   18900.0   0   

                                              images  imgcount  ...  \
0  [{'img': 'https://images10.gaadi.com/usedcar_i...        15  ...   
1  [{'img': 'https://images10.gaadi.com/usedcar_i...        15  ...   
2  [{'img': 'https://images10.gaad