### Clean all data together so that the categorical conversion is applicable.!

In [432]:
import pandas as pd
import json
from pandas import json_normalize

df=pd.read_csv(r'./parsed_data/all_cars_parsed_data.csv',header=[0])
df.head(2)

Unnamed: 0,City,it,ft,bt,km,transmission,ownerNo,owner,oem,model,...,Front Brake Type,Rear Brake Type,Top Speed,Acceleration,Tyre Type,No Door Numbers,Cargo Volumn,Wheel Size,Alloy Wheel Size,Ground Clearance Unladen
0,Bangalore,0,Petrol,Hatchback,120000,Manual,3,3rd Owner,Maruti,Maruti Celerio,...,Ventilated Disc,Drum,150 Kmph,15.05 Seconds,"Tubeless, Radial",5.0,235-litres,,,
1,Bangalore,0,Petrol,SUV,32706,Manual,2,2nd Owner,Ford,Ford Ecosport,...,Ventilated Disc,Drum,,,"Tubeless,Radial",4.0,352-litres,16.0,16.0,


- Keep int64, float64, and bool as-is
- Attempt to convert object columns to numeric — if successful, assign the result
- Attempt boolean conversion for text-based columns like 'yes'/'no', 'true'/'false'
- Leave everything else as object

In [433]:
# Attempt to convert each column
for col in df.columns:
    # Skip already numeric or boolean
    if pd.api.types.is_numeric_dtype(df[col]) or pd.api.types.is_bool_dtype(df[col]):
        continue
    
    # Try converting to numeric (int/float)
    converted = pd.to_numeric(df[col], errors='coerce')
    
    # If it successfully converts (not all NaNs), use it
    if not converted.isna().all():
        df[col] = converted
        continue
    
    # Try boolean conversion for strings like 'yes', 'no', 'true', 'false'
    unique_vals = df[col].dropna().str.lower().unique()
    if set(unique_vals).issubset({'yes', 'no', 'true', 'false', '0', '1'}):
        df[col] = df[col].str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False, '1': True, '0': False})


### check col dtype

In [434]:
# Group similar types together
def get_count_dtypes(df):
    dtype_counts = {
        'int': 0,
        'float': 0,
        'bool': 0,
        'object': 0,
        'datetime': 0,
        'other': 0
    }
    
    for dtype in df.dtypes:
        if pd.api.types.is_integer_dtype(dtype):
            dtype_counts['int'] += 1
        elif pd.api.types.is_float_dtype(dtype):
            dtype_counts['float'] += 1
        elif pd.api.types.is_bool_dtype(dtype):
            dtype_counts['bool'] += 1
        elif pd.api.types.is_object_dtype(dtype):
            dtype_counts['object'] += 1
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            dtype_counts['datetime'] += 1
        else:
            dtype_counts['other'] += 1
    
    # Display the count
    print(pd.Series(dtype_counts))


In [435]:
def convert_object_to_int_float(df):
    # Attempt to convert each column
    for col in df.columns:
        # Skip already numeric or boolean
        if pd.api.types.is_numeric_dtype(df[col]) or pd.api.types.is_bool_dtype(df[col]):
            continue
        
        # Try converting to numeric (int/float)
        converted = pd.to_numeric(df[col], errors='coerce')
        
        # If it successfully converts (not all NaNs), use it
        if not converted.isna().all():
            df[col] = converted
            continue
        
        # Try boolean conversion for strings like 'yes', 'no', 'true', 'false'
        unique_vals = df[col].dropna().str.lower().unique()
        if set(unique_vals).issubset({'yes', 'no', 'true', 'false', '0', '1'}):
            df[col] = df[col].str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False, '1': True, '0': False})

    return df

In [436]:
get_count_dtypes(df)

int           4
float        31
bool        173
object       35
datetime      0
other         0
dtype: int64


### convert True/False to 1/0

In [437]:
# Convert all boolean columns: True → 1, False → 0
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)


In [438]:
get_count_dtypes(df)

int         177
float        31
bool          0
object       35
datetime      0
other         0
dtype: int64


### Get object cols

In [439]:
# Get list of all object dtype columns
object_cols = df.select_dtypes(include='object').columns
print(object_cols)

Index(['City', 'ft', 'bt', 'transmission', 'owner', 'oem', 'model', 'price',
       'priceActual', 'trendingText.imgUrl', 'trendingText.heading',
       'trendingText.desc', 'Fuel Type', 'Seats', 'Kms Driven', 'RTO',
       'Ownership', 'Engine Displacement', 'Transmission', 'Mileage', 'Engine',
       'Max Power', 'Color', 'Engine Type', 'Max Torque',
       'Value Configuration', 'Fuel Suppy System', 'BoreX Stroke',
       'Turbo Charger', 'Super Charger', 'Drive Type', 'Steering Type',
       'Front Brake Type', 'Rear Brake Type', 'Tyre Type'],
      dtype='object')


In [440]:
df_object=df[object_cols]

In [441]:
df_object.head(2)

Unnamed: 0,City,ft,bt,transmission,owner,oem,model,price,priceActual,trendingText.imgUrl,...,Value Configuration,Fuel Suppy System,BoreX Stroke,Turbo Charger,Super Charger,Drive Type,Steering Type,Front Brake Type,Rear Brake Type,Tyre Type
0,Bangalore,Petrol,Hatchback,Manual,3rd Owner,Maruti,Maruti Celerio,₹ 4 Lakh,,https://stimg.cardekho.com/used-cars/common/ic...,...,DOHC,MPFi,73 X 82 mm,No,False,FWD,Power,Ventilated Disc,Drum,"Tubeless, Radial"
1,Bangalore,Petrol,SUV,Manual,2nd Owner,Ford,Ford Ecosport,₹ 8.11 Lakh,,https://stimg.cardekho.com/used-cars/common/ic...,...,DOHC,Direct Injection,79 X 76.5 mm,No,False,FWD,Power,Ventilated Disc,Drum,"Tubeless,Radial"


In [442]:
nunique_counts = df.select_dtypes(include='object').nunique()

# Filter columns with unique values > 10
cols_gt_10_unique = nunique_counts[nunique_counts > 10]
print(cols_gt_10_unique)

oem                      33
model                   315
price                  1400
priceActual             721
Kms Driven             4089
RTO                     464
Engine Displacement     150
Mileage                 522
Engine                  150
Max Power               760
Color                   153
Engine Type             508
Max Torque              617
Value Configuration      13
Fuel Suppy System        81
BoreX Stroke            171
Drive Type               20
Front Brake Type         34
Rear Brake Type          35
Tyre Type                34
dtype: int64


### select few imp columns and clean the data to float

In [443]:
df_tmp=df_object[['price','Kms Driven','Engine Displacement','Mileage','Max Power']]

In [444]:
# Clean price
def clean_price(val):
    if pd.isna(val):
        return None
    val = val.replace("₹", "").replace(",", "").strip().lower()
    if 'lakh' in val:
        return float(val.replace('lakh', '').strip()) * 1e5
    if 'crore' in val:
        return float(val.replace('crore', '').strip()) * 1e7
    return float(val)

# Apply transformations
df_tmp.loc[:, 'price'] = df_tmp['price'].apply(clean_price)


In [445]:
# Clean Kms Driven: remove commas and extract numeric values
df_tmp['Kms Driven'] = df_tmp['Kms Driven'].str.replace(',', '').str.extract('(\d+)').astype(float)

# Clean Engine Displacement: Extract digits from the "cc" part
df_tmp['Engine Displacement'] = df_tmp['Engine Displacement'].str.extract('(\d+)').astype(float)

# Clean Mileage: Extract the numeric value (assuming it's a float)
df_tmp['Mileage'] = df_tmp['Mileage'].str.extract(r'([\d.]+)').astype(float)

# Clean Max Power: Extract numeric part of the max power value
df_tmp['Max Power'] = df_tmp['Max Power'].str.replace(',', '').str.extract(r'([\d.]+)').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['Kms Driven'] = df_tmp['Kms Driven'].str.replace(',', '').str.extract('(\d+)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['Engine Displacement'] = df_tmp['Engine Displacement'].str.extract('(\d+)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['

### merge the few cleaned columns into original df

In [446]:
df.update(df_tmp)

In [447]:
df=convert_object_to_int_float(df)

In [448]:
df['price']

0        400000.0
1        811000.0
2        585000.0
3        462000.0
4        790000.0
          ...    
8364     700000.0
8365    2000000.0
8366    3500000.0
8367     850000.0
8368    1500000.0
Name: price, Length: 8369, dtype: float64

In [449]:
get_count_dtypes(df)

int         177
float        37
bool          0
object       29
datetime      0
other         0
dtype: int64


In [450]:
# Get list of all object dtype columns
object_cols = df.select_dtypes(include='object').columns
print(object_cols)
df_object =df[object_cols]

Index(['City', 'ft', 'bt', 'transmission', 'owner', 'oem', 'model',
       'priceActual', 'trendingText.imgUrl', 'trendingText.heading',
       'trendingText.desc', 'Fuel Type', 'Seats', 'RTO', 'Ownership',
       'Transmission', 'Engine', 'Color', 'Engine Type', 'Max Torque',
       'Value Configuration', 'Fuel Suppy System', 'BoreX Stroke',
       'Turbo Charger', 'Drive Type', 'Steering Type', 'Front Brake Type',
       'Rear Brake Type', 'Tyre Type'],
      dtype='object')


In [451]:
# Step 1: Convert object columns to categorical type
df_object = df_object.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)

# Step 2: Label Encoding for each column with categorical data
label_encoder = LabelEncoder()

# Apply label encoding on all categorical columns
for col in df_object.select_dtypes(include='category').columns:
    df_object[col] = label_encoder.fit_transform(df_object[col])

# Step 3: Print the resulting DataFrame
df_object.head(2)

Unnamed: 0,City,ft,bt,transmission,owner,oem,model,priceActual,trendingText.imgUrl,trendingText.heading,...,Max Torque,Value Configuration,Fuel Suppy System,BoreX Stroke,Turbo Charger,Drive Type,Steering Type,Front Brake Type,Rear Brake Type,Tyre Type
0,0,4,2,1,3,20,160,721,0,0,...,593,3,53,30,1,12,6,22,7,23
1,0,4,7,1,2,6,50,721,0,0,...,146,3,23,102,1,12,6,22,7,26


In [452]:
df.update(df_object)

In [453]:
print(df.shape)

(8369, 243)


In [454]:
df=convert_object_to_int_float(df)

In [455]:
df['price']

0        400000.0
1        811000.0
2        585000.0
3        462000.0
4        790000.0
          ...    
8364     700000.0
8365    2000000.0
8366    3500000.0
8367     850000.0
8368    1500000.0
Name: price, Length: 8369, dtype: float64

In [456]:
get_count_dtypes(df)

int         206
float        37
bool          0
object        0
datetime      0
other         0
dtype: int64


In [457]:
### All object columns are converted to numerical

In [458]:
df.to_csv('./cleaned_data/all_cars_cleaned_data.csv',index=False)