In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import joblib
import json



In [2]:
# Read the feature-engineered CSV into the working frame used by every later cell.
df = pd.read_csv(
    filepath_or_buffer=r"SBS_Processed_Datasets/03_feature_engineered_output.csv",
)

In [3]:
# Summarize column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8189 entries, 0 to 8188
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                8189 non-null   object 
 1   transmission        8189 non-null   object 
 2   owner_no            8189 non-null   int64  
 3   brand               8189 non-null   object 
 4   model               8189 non-null   object 
 5   model_year          8189 non-null   int64  
 6   variant_name        8189 non-null   object 
 7   price               8189 non-null   float64
 8   registered_year     8189 non-null   int64  
 9   fuel_type           8189 non-null   object 
 10  kms_driven          8189 non-null   int64  
 11  engine_cc           8189 non-null   int64  
 12  mileage_kmpl        8189 non-null   float64
 13  car_age             8189 non-null   int64  
 14  mileage_normalized  8189 non-null   float64
 15  brand_popularity    8189 non-null   float64
 16  age_gr

In [4]:
# Preview the first rows to see what the raw categorical values look like.
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl,car_age,mileage_normalized,brand_popularity,age_group,high_mileage
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000,998,23.1,9,2.566667,504649.260948,old,1
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706,1497,17.0,6,2.833333,730169.934641,mid-age,0
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949,1199,23.84,6,3.973333,748643.178049,mid-age,1
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794,1197,19.1,10,1.91,605200.000615,old,0
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000,1248,23.65,9,2.627778,504649.260948,old,1


In [5]:
# Suggest an encoding strategy for each object-dtype column from its cardinality:
#   <= 10 distinct values  -> one-hot-encoding
#   <= 100 distinct values -> label-encoding
#   otherwise              -> target-encoding
categorical_dtypes = list(df.select_dtypes(include=['object']).columns)

# Distinct-value count per categorical column, in the same order as the columns.
unique_counts = [df[name].nunique() for name in categorical_dtypes]

enc_recommendation = {
    "column_names": list(categorical_dtypes),
    "unique_values_count": unique_counts,
    "recommended_encoding": [
        "one-hot-encoding" if count <= 10
        else "label-encoding" if count <= 100
        else "target-encoding"
        for count in unique_counts
    ],
}

# One row per categorical column: name, cardinality, suggested encoding.
encoding_recommended_df = pd.DataFrame(enc_recommendation)

In [13]:
# Sort the recommendation table by strategy so columns sharing an encoding appear together.
encoding_recommended_df.sort_values(by=['recommended_encoding'])

Unnamed: 0,column_names,unique_values_count,recommended_encoding
2,brand,33,label-encoding
0,city,6,one-hot-encoding
1,transmission,2,one-hot-encoding
5,fuel_type,5,one-hot-encoding
6,age_group,3,one-hot-encoding
3,model,299,target-encoding
4,variant_name,2087,target-encoding


ONE HOT ENCODING

In [6]:
# Low-cardinality nominal columns are expanded into dummy columns.
# age_group is intentionally excluded: its labels follow a relative progression,
# so it is encoded as ordered integers in the label-encoding step instead.
one_hot_encoding_columns = ['city', 'transmission', 'fuel_type']

# drop_first=True removes the first category level of each encoded column.
df = pd.get_dummies(
    data=df,
    columns=one_hot_encoding_columns,
    drop_first=True,
)

In [7]:
# Look at the age_group labels before they are mapped to integer ranks.
df["age_group"]

0           old
1       mid-age
2       mid-age
3           old
4           old
         ...   
8184        new
8185        old
8186        old
8187        old
8188    mid-age
Name: age_group, Length: 8189, dtype: object

LABEL ENCODING

In [8]:
# LabelEncoder comes from sklearn.preprocessing (imported in the first cell).

# AGE GROUP: ordered categories, so assign ranks by hand (new < mid-age < old).
# Any label missing from this mapping would become NaN after .map().
age_group_transform = {
    label: rank for rank, label in enumerate(("new", "mid-age", "old"))
}
df = df.assign(age_group=df["age_group"].map(age_group_transform))

# BRAND: integer codes learned from the brand values present in df.
# The fitted encoder is kept in `label_encoder` so its classes can be exported later.
label_encoder = LabelEncoder()
label_encoder.fit(df["brand"])
df = df.assign(brand=label_encoder.transform(df["brand"]))

In [9]:
# Persist the integer-code -> brand-name lookup from the fitted label encoder,
# once as readable JSON and once as a pickle, for later use.
# Note: JSON object keys are always strings, so the codes are written as "0", "1", ...
label_classes = dict(enumerate(label_encoder.classes_))

with open('JSON_Files/brand.json', 'w') as f:
    f.write(json.dumps(label_classes, indent=4))

joblib.dump(label_classes, 'JSON_Files/brand.pkl')

['JSON_Files/brand.pkl']

In [10]:
# Spot-check two rows: brand is now an integer code and the one-hot columns are present.
df.head(2)

Unnamed: 0,owner_no,brand,model,model_year,variant_name,price,registered_year,kms_driven,engine_cc,mileage_kmpl,...,city_chennai,city_delhi,city_hyderabad,city_jaipur,city_kolkata,transmission_manual,fuel_type_diesel,fuel_type_electric,fuel_type_lpg,fuel_type_petrol
0,3,19,maruti celerio,2015,vxi,400000.0,2015,120000,998,23.1,...,False,False,False,False,False,True,False,False,False,True
1,2,6,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,32706,1497,17.0,...,False,False,False,False,False,True,False,False,False,True


In [None]:
# TARGET ENCODING

In [11]:

# Target-encode the two high-cardinality columns: each category is replaced by
# the mean `price` (the target variable) of the rows that share it.
# NOTE(review): the means are taken over every row of df, each row's own price
# included, and no train/test split is visible in this notebook before this
# point - confirm that this target leakage is acceptable for the model.
variant_target_mean = df.groupby('variant_name')['price'].mean()
model_target_mean = df.groupby('model')['price'].mean()

# Add the encoded columns, then drop the text columns they replace.
df = (
    df.assign(
        variant_name_encoded=df['variant_name'].map(variant_target_mean),
        model_encoded=df['model'].map(model_target_mean),
    )
    .drop(columns=['variant_name', 'model'])
)

# Plain dicts (category -> mean price) are what gets written to disk.
variant_name_mapping = variant_target_mean.to_dict()
model_mapping = model_target_mean.to_dict()

# JSON copies of both lookups.
for json_path, mapping in (
    ('JSON_Files/variant_name_mapping.json', variant_name_mapping),
    ('JSON_Files/model_mapping.json', model_mapping),
):
    with open(json_path, 'w') as f:
        json.dump(mapping, f)

# Pickle copies of both lookups.
joblib.dump(variant_name_mapping, 'JSON_Files/variant_name_mapping.pkl')
joblib.dump(model_mapping, 'JSON_Files/model_mapping.pkl')

['JSON_Files/model_mapping.pkl']

In [12]:
# Show the two target-encoded columns side by side for the first rows.
df[['variant_name_encoded', 'model_encoded']].head()

Unnamed: 0,variant_name_encoded,model_encoded
0,424566.346883,440245.033113
1,787214.285714,683109.090909
2,470370.37037,521147.368421
3,433500.0,432471.698113
4,797000.0,837482.758621


In [13]:
# Spot-check two rows: variant_name and model are gone, their encoded columns are at the end.
df.head(2)

Unnamed: 0,owner_no,brand,model_year,price,registered_year,kms_driven,engine_cc,mileage_kmpl,car_age,mileage_normalized,...,city_hyderabad,city_jaipur,city_kolkata,transmission_manual,fuel_type_diesel,fuel_type_electric,fuel_type_lpg,fuel_type_petrol,variant_name_encoded,model_encoded
0,3,19,2015,400000.0,2015,120000,998,23.1,9,2.566667,...,False,False,False,True,False,False,False,True,424566.346883,440245.033113
1,2,6,2018,811000.0,2018,32706,1497,17.0,6,2.833333,...,False,False,False,True,False,False,False,True,787214.285714,683109.090909


In [14]:
# List the full column set after all encodings have been applied.
df.columns

Index(['owner_no', 'brand', 'model_year', 'price', 'registered_year',
       'kms_driven', 'engine_cc', 'mileage_kmpl', 'car_age',
       'mileage_normalized', 'brand_popularity', 'age_group', 'high_mileage',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_manual', 'fuel_type_diesel',
       'fuel_type_electric', 'fuel_type_lpg', 'fuel_type_petrol',
       'variant_name_encoded', 'model_encoded'],
      dtype='object')

FEATURE SCALING

In [15]:
# Min-max scale the continuous columns into the [0, 1] range.
# MinMaxScaler and joblib are already imported in the notebook's first cell,
# so no import is repeated here.
scaler = MinMaxScaler()

# Candidate columns to scale. Not every name is guaranteed to exist in df,
# so the list is split into present and absent columns below.
columns_to_scale = ['price', 'kms_driven', 'engine_cc', 'mileage_kmpl',
                    'car_age', 'mileage_normalized', 'brand_popularity', 'price_per_km']

existing_columns = [col for col in columns_to_scale if col in df.columns]
missing_columns = [col for col in columns_to_scale if col not in df.columns]

# Report skipped names instead of dropping them silently, so a typo or a
# missing upstream feature is visible in the cell output.
if missing_columns:
    print(f"Skipping columns not present in df: {missing_columns}")

# Fit on, and transform, only the columns that are actually present.
df[existing_columns] = scaler.fit_transform(df[existing_columns])

# Persist the fitted scaler next to the other saved encoders. `price` (the
# target) is among the scaled columns, so this object is required to map scaled
# values back to the original units and to scale new inputs the same way.
joblib.dump(scaler, 'JSON_Files/minmax_scaler.pkl')

# Verify scaling: every scaled column should report min 0 and max 1.
print(df[existing_columns].describe())


             price   kms_driven    engine_cc  mileage_kmpl      car_age  \
count  8189.000000  8189.000000  8189.000000   8189.000000  8189.000000   
mean      0.091990     0.010747     0.274795      0.091198     0.169489   
std       0.109231     0.013575     0.095120      0.030377     0.102326   
min       0.000000     0.000000     0.000000      0.000000     0.000000   
25%       0.038759     0.005534     0.228287      0.074631     0.105263   
50%       0.059758     0.009800     0.238636      0.088926     0.157895   
75%       0.096218     0.014527     0.289367      0.107734     0.236842   
max       1.000000     1.000000     1.000000      1.000000     1.000000   

       mileage_normalized  brand_popularity  
count         8189.000000       8189.000000  
mean             0.093726          0.133196  
std              0.080035          0.125128  
min              0.000000          0.000000  
25%              0.045507          0.068786  
50%              0.071024          0.084822  
75

In [16]:
# Export the fully preprocessed frame, without the index column.
# A forward slash is used as the separator (matching the input path in the load
# cell): it works on Windows, Linux and macOS, whereas a backslash is only a
# separator on Windows and would otherwise become part of the file name.
df.to_csv("SBS_Processed_Datasets/04_preprocessed_output.csv", index=False)