In [1]:
import json
import os

import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

In [2]:
# Load the dataset to be encoded and scaled (presumably the output of the
# preceding feature-engineering step, judging by the file name).
# The path is relative, so the working directory must contain SBS_Processed_Datasets/.
df = pd.read_csv(r"SBS_Processed_Datasets/03_feature_engineered_output.csv")

In [3]:
# Column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8107 entries, 0 to 8106
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   city                  8107 non-null   object 
 1   transmission          8107 non-null   object 
 2   owner_no              8107 non-null   int64  
 3   brand                 8107 non-null   object 
 4   model                 8107 non-null   object 
 5   model_year            8107 non-null   int64  
 6   variant_name          8107 non-null   object 
 7   price                 8107 non-null   float64
 8   registered_year       8107 non-null   int64  
 9   fuel_type             8107 non-null   object 
 10  kms_driven            8107 non-null   float64
 11  mileage_kmpl          8107 non-null   float64
 12  engine_cc             8107 non-null   int64  
 13  car_age               8107 non-null   int64  
 14  model_age             8107 non-null   int64  
 15  registration_lag     

In [4]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
count,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0
mean,1.36487,2016.450475,885495.0,2016.518071,59566.22,19.190692,1422.127421,7.481929,7.549525,0.067596,0.844578,0.539064,0.007648,0.29493,885495.0,7298.162929
std,0.635211,3.853358,989002.7,3.862395,74921.85,3.671887,465.210269,3.862395,3.853358,0.319016,0.088946,0.103143,0.087122,0.45604,740407.8,10446.455266
min,1.0,1985.0,28000.0,1985.0,101.0,7.08,624.0,1.0,1.0,0.0,0.105263,0.198876,0.0,0.0,73333.33,50.5
25%,1.0,2014.0,398000.0,2014.0,31218.0,17.0,1197.0,5.0,5.0,0.0,0.8,0.477528,0.0,0.0,504517.5,4506.570707
50%,1.0,2017.0,600000.0,2017.0,54305.0,18.9,1248.0,7.0,7.0,0.0,0.875,0.530899,0.0,0.0,595115.6,6547.0
75%,2.0,2019.0,925000.0,2019.0,80000.0,21.4,1498.0,10.0,10.0,0.0,0.909091,0.601124,0.0,1.0,748643.2,9090.909091
max,5.0,2023.0,9600000.0,2023.0,5500000.0,35.6,5000.0,39.0,39.0,16.0,0.975,1.0,1.0,1.0,6343750.0,785714.285714


In [5]:
# Preview the first two rows before any encoding
df.head(2)

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,kms_bins,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000.0,23.1,998,9,9,0,0.9,High,0.648876,0,1,504517.464043,12000.0
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706.0,17.0,1497,6,6,0,0.857143,Low,0.477528,0,1,730169.934641,4672.285714


In [6]:
# Suggest an encoding strategy for every text (object-dtype) column,
# based purely on how many distinct values the column holds:
#   <= 10 distinct values  -> one-hot-encoding
#   <= 100 distinct values -> label-encoding
#   otherwise              -> target-encoding
categorical_dtypes = df.select_dtypes(include=['object']).columns.to_list()

# Distinct-value count per categorical column, in column order
unique_counts = [df[name].nunique() for name in categorical_dtypes]

enc_recommendation = {
    "column_names": list(categorical_dtypes),
    "unique_values_count": unique_counts,
    "recommended_encoding": [
        "one-hot-encoding" if n_unique <= 10
        else "label-encoding" if n_unique <= 100
        else "target-encoding"
        for n_unique in unique_counts
    ],
}

# One row per categorical column
encoding_recommended_df = pd.DataFrame(enc_recommendation)

In [7]:
# Show the recommendations grouped by encoding type
encoding_recommended_df.sort_values(by=['recommended_encoding'])

Unnamed: 0,column_names,unique_values_count,recommended_encoding
2,brand,33,label-encoding
0,city,6,one-hot-encoding
1,transmission,2,one-hot-encoding
5,fuel_type,4,one-hot-encoding
6,kms_bins,5,one-hot-encoding
3,model,282,target-encoding
4,variant_name,2042,target-encoding


ONE HOT ENCODING

In [8]:
# Columns available before any encoding is applied
df.columns

Index(['city', 'transmission', 'owner_no', 'brand', 'model', 'model_year',
       'variant_name', 'price', 'registered_year', 'fuel_type', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'kms_bins', 'mileage_normalized',
       'high_mileage', 'multiple_owners', 'brand_popularity', 'kms_per_year'],
      dtype='object')

In [9]:
# One-hot encode the low-cardinality columns: 'city', 'transmission', 'fuel_type' and 'kms_bins'.
# 'kms_bins' is given the shorter prefix 'kms', so its indicator columns are named kms_<bin>.
# drop_first=False keeps one indicator column per category (no reference level is dropped).
# NOTE: df is overwritten here, so the four source columns no longer exist afterwards;
# re-running this cell without reloading df is expected to fail.
df = pd.get_dummies(df, columns=['city', 'transmission', 'fuel_type','kms_bins'], prefix=['city', 'transmission', 'fuel_type','kms'], drop_first=False)

LABEL ENCODING

In [10]:
# BRAND ENCODING
# LabelEncoder, joblib and os are imported once in the notebook's top import cell,
# so they are not re-imported here.

# Fit a LabelEncoder on the brand names and store the integer codes in a new column
label_encoder_brand = LabelEncoder()
df["brand_encoded"] = label_encoder_brand.fit_transform(df["brand"])

# Remove the raw text column now that it is encoded (reassignment instead of inplace=True)
df = df.drop(columns=['brand'])

# Persist the fitted encoder so the same brand -> code mapping can be reloaded later.
# joblib.dump does not create missing directories, so make sure the folder exists first.
os.makedirs('PKL_Files', exist_ok=True)
joblib.dump(label_encoder_brand, 'PKL_Files/brand.pkl')
print("LabelEncoder for brand saved successfully!")

LabelEncoder for brand saved successfully!


In [11]:
# Spot-check the integer codes assigned to the brands
df['brand_encoded'].head()

0    19
1     6
2    29
3     9
4    19
Name: brand_encoded, dtype: int64

TARGET ENCODING

In [13]:
# Target (mean-price) encoding for the high-cardinality 'variant_name' and 'model' columns.
# NOTE(review): the category means are computed over the whole frame, i.e. each row's own
# price contributes to its encoded value — confirm this is acceptable before these
# columns are used to evaluate a price model.

# Average price per category
variant_target_mean = df['price'].groupby(df['variant_name']).mean()
model_target_mean = df['price'].groupby(df['model']).mean()

# Add the encoded columns, then discard the raw text columns they replace
df = df.assign(
    variant_name_encoded=df['variant_name'].map(variant_target_mean),
    model_encoded=df['model'].map(model_target_mean),
).drop(columns=['variant_name', 'model'])

# Plain dicts (category -> mean price) are the form that gets saved
variant_name_mapping = variant_target_mean.to_dict()
model_mapping = model_target_mean.to_dict()

# Write both mappings to PKL files
joblib.dump(variant_name_mapping, 'PKL_Files/variant_name_mapping.pkl')
joblib.dump(model_mapping, 'PKL_Files/model_mapping.pkl')



['PKL_Files/model_mapping.pkl']

In [14]:
# Spot-check the two target-encoded columns
df[['variant_name_encoded', 'model_encoded']].head()

Unnamed: 0,variant_name_encoded,model_encoded
0,424566.346883,440245.033113
1,787214.285714,683109.090909
2,470370.37037,521147.368421
3,433500.0,432471.698113
4,797000.0,837482.758621


In [15]:
# Column set after the one-hot, label and target encodings
df.columns

Index(['owner_no', 'model_year', 'price', 'registered_year', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'mileage_normalized', 'high_mileage',
       'multiple_owners', 'brand_popularity', 'kms_per_year', 'city_bangalore',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_automatic', 'transmission_manual',
       'fuel_type_cng', 'fuel_type_diesel', 'fuel_type_lpg',
       'fuel_type_petrol', 'kms_High', 'kms_Low', 'kms_Moderate', 'kms_Unused',
       'kms_Very_Low', 'brand_encoded', 'variant_name_encoded',
       'model_encoded'],
      dtype='object')

FEATURE SCALING

In [16]:
# Preview the fully encoded frame before scaling
df.head(2)

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year,city_bangalore,city_chennai,city_delhi,city_hyderabad,city_jaipur,city_kolkata,transmission_automatic,transmission_manual,fuel_type_cng,fuel_type_diesel,fuel_type_lpg,fuel_type_petrol,kms_High,kms_Low,kms_Moderate,kms_Unused,kms_Very_Low,brand_encoded,variant_name_encoded,model_encoded
0,3,2015,400000.0,2015,120000.0,23.1,998,9,9,0,0.9,0.648876,0,1,504517.464043,12000.0,True,False,False,False,False,False,False,True,False,False,False,True,True,False,False,False,False,19,424566.346883,440245.033113
1,2,2018,811000.0,2018,32706.0,17.0,1497,6,6,0,0.857143,0.477528,0,1,730169.934641,4672.285714,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,6,787214.285714,683109.090909


In [17]:
print(df.columns)      # Print the column names of the encoded frame

Index(['owner_no', 'model_year', 'price', 'registered_year', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'mileage_normalized', 'high_mileage',
       'multiple_owners', 'brand_popularity', 'kms_per_year', 'city_bangalore',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_automatic', 'transmission_manual',
       'fuel_type_cng', 'fuel_type_diesel', 'fuel_type_lpg',
       'fuel_type_petrol', 'kms_High', 'kms_Low', 'kms_Moderate', 'kms_Unused',
       'kms_Very_Low', 'brand_encoded', 'variant_name_encoded',
       'model_encoded'],
      dtype='object')


In [18]:
# Standardise selected numeric features and write both the scaled and the
# unscaled frames to disk. StandardScaler comes from the notebook's top import cell.

# Features to standardise; every column not listed here is left as-is
columns_to_scale = [
    'owner_no',
    'kms_driven',
    'mileage_kmpl',
    'engine_cc',
    'car_age',
    'model_age',
    'registration_lag',
    'normalized_model_age',
    'mileage_normalized',
    'brand_popularity',
    'kms_per_year',
]

# Fit the scaler on the selected columns and compute their standardised values.
# NOTE(review): the fitted scaler is not persisted in this cell — confirm it is
# saved elsewhere if scaled inputs are needed later.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])

# Work on a copy so that df keeps the original, unscaled values
df_scaled = df.copy()
df_scaled[columns_to_scale] = scaled_values

# Sanity check: summary statistics of the scaled columns
print(df_scaled[columns_to_scale].describe())

# Save the scaled and the non-scaled versions side by side
df_scaled.to_csv(r'SBS_Processed_Datasets/04_1_scaled_dataframe.csv', index=False)
df.to_csv(r'SBS_Processed_Datasets/04_2_Non_scaled_dataframe.csv', index=False)


           owner_no    kms_driven  mileage_kmpl     engine_cc       car_age  \
count  8.107000e+03  8.107000e+03  8.107000e+03  8.107000e+03  8.107000e+03   
mean  -2.454076e-17 -3.856406e-17  3.155241e-16  7.011647e-17  3.155241e-17   
std    1.000062e+00  1.000062e+00  1.000062e+00  1.000062e+00  1.000062e+00   
min   -5.744427e-01 -7.937457e-01 -3.298424e+00 -1.715733e+00 -1.678318e+00   
25%   -5.744427e-01 -3.783939e-01 -5.966488e-01 -4.839560e-01 -6.426277e-01   
50%   -5.744427e-01 -7.022716e-02 -7.917170e-02 -3.743214e-01 -1.247824e-01   
75%    9.999343e-01  2.727514e-01  6.017192e-01  1.631031e-01  6.519856e-01   
max    5.723065e+00  7.261926e+01  4.469180e+00  7.691346e+00  8.160743e+00   

          model_age  registration_lag  normalized_model_age  \
count  8.107000e+03      8.107000e+03          8.107000e+03   
mean   1.402329e-17     -3.242887e-17         -2.839717e-16   
std    1.000062e+00      1.000062e+00          1.000062e+00   
min   -1.699798e+00     -2.119018e-0

In [19]:
# Columns of the unscaled frame (scaling was applied to the df_scaled copy, not to df)
df.columns

Index(['owner_no', 'model_year', 'price', 'registered_year', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'mileage_normalized', 'high_mileage',
       'multiple_owners', 'brand_popularity', 'kms_per_year', 'city_bangalore',
       'city_chennai', 'city_delhi', 'city_hyderabad', 'city_jaipur',
       'city_kolkata', 'transmission_automatic', 'transmission_manual',
       'fuel_type_cng', 'fuel_type_diesel', 'fuel_type_lpg',
       'fuel_type_petrol', 'kms_High', 'kms_Low', 'kms_Moderate', 'kms_Unused',
       'kms_Very_Low', 'brand_encoded', 'variant_name_encoded',
       'model_encoded'],
      dtype='object')