In [22]:
import pandas as pd

# Input/output locations, kept in one place so the cell is easy to repoint.
RAW_CSV_PATH = r"C:\Users\shiba\Downloads\car__sales__data.csv"
CLEANED_CSV_PATH = "cleaned_car_sales_data.csv"


def _impute_missing(frame):
    """Return a copy of *frame* with nulls filled column by column.

    Numeric columns are filled with their median; every other column is
    filled with its most frequent value. Columns that contain no nulls are
    skipped, and columns that are entirely null are left as they are because
    no fill statistic exists for them.
    """
    filled = frame.copy()
    for col in filled.columns:
        series = filled[col]
        if not series.isna().any():
            continue
        if pd.api.types.is_numeric_dtype(series):
            filled[col] = series.fillna(series.median())
            continue
        # mode() is empty for an all-null column; indexing it would raise.
        modes = series.mode(dropna=True)
        if not modes.empty:
            filled[col] = series.fillna(modes.iloc[0])
    return filled


def clean_car_sales(raw, report_missing=True):
    """Clean a raw car-sales DataFrame and return the cleaned copy.

    Steps, in order:
      1. Normalise column names (trim, lowercase, spaces -> underscores).
      2. Strip ``$`` and ``,`` from ``price`` and cast it to float.
      3. Standardise the casing of ``fuel_type`` and ``brand``.
      4. Drop duplicate rows.
      5. Optionally print the per-column missing-value counts.
      6. Fill missing values (numeric -> median, other -> mode).
      7. Parse ``sale_date`` to datetime; unparseable values become NaT.

    Steps 2, 3 and 7 are applied only when the column is present.

    Args:
        raw: The DataFrame as loaded from the CSV. It is not modified.
        report_missing: When True, print the missing-value summary that is
            computed after de-duplication and before imputation.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        ValueError: If ``price`` holds text that is not a number once the
            currency symbols are removed.
    """
    frame = raw.copy()

    frame.columns = frame.columns.str.strip().str.lower().str.replace(" ", "_")

    # Convert price before imputing so that a currency-formatted column is
    # filled with its numeric median rather than its most common string.
    if 'price' in frame.columns:
        frame['price'] = frame['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Normalise text before de-duplicating so rows that differ only by
    # letter case collapse into one.
    if 'fuel_type' in frame.columns:
        frame['fuel_type'] = frame['fuel_type'].str.capitalize()
    if 'brand' in frame.columns:
        frame['brand'] = frame['brand'].str.title()

    frame = frame.drop_duplicates()

    if report_missing:
        print("Missing values per column:\n", frame.isnull().sum())

    frame = _impute_missing(frame)

    if 'sale_date' in frame.columns:
        frame['sale_date'] = pd.to_datetime(frame['sale_date'], errors='coerce')

    return frame


# A notebook cell and `python file.py` both execute with __name__ set to
# "__main__", so the load/clean/export below still runs there and `df`
# remains a module-level name; the guard only keeps an import of this code
# from touching the filesystem.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv(RAW_CSV_PATH)

    # Clean it
    df = clean_car_sales(df)

    # Final check
    print("Data types:\n", df.dtypes)
    print("First 5 rows:\n", df.head())

    # Export the cleaned dataset
    df.to_csv(CLEANED_CSV_PATH, index=False)

Missing values per column:
 car_make            0
car_model           0
year                0
mileage             0
price               0
fuel_type           0
color               0
transmission        0
options/features    0
condition           0
accident            0
dtype: int64
Data types:
 car_make             object
car_model            object
year                  int64
mileage               int64
price               float64
fuel_type            object
color                object
transmission         object
options/features     object
condition            object
accident             object
dtype: object
First 5 rows:
      car_make    car_model  year  mileage      price fuel_type   color  \
0     Hyundai       Tucson  2010    52554  44143.820    Hybrid   Black   
1  Land Rover  Range Rover  2016   115056  25414.060    Diesel  Silver   
2       Honda       Accord  2022    18044  28262.872  Electric  Yellow   
3         Kia         Soul  2011    79251  28415.848    Hybrid  Orange 