In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Load the CSV at the relative path below into the working DataFrame `df`;
# every later cell reads from and adds columns to this frame.
df = pd.read_csv(r"SBS_Processed_Datasets/02_cleaned_output.csv")

In [3]:
# Print the column names, non-null counts, dtypes and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8107 entries, 0 to 8106
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   city             8107 non-null   object 
 1   transmission     8107 non-null   object 
 2   owner_no         8107 non-null   int64  
 3   brand            8107 non-null   object 
 4   model            8107 non-null   object 
 5   model_year       8107 non-null   int64  
 6   variant_name     8107 non-null   object 
 7   price            8107 non-null   float64
 8   registered_year  8107 non-null   int64  
 9   fuel_type        8107 non-null   object 
 10  kms_driven       8107 non-null   float64
 11  mileage_kmpl     8107 non-null   float64
 12  engine_cc        8107 non-null   int64  
dtypes: float64(3), int64(4), object(6)
memory usage: 823.5+ KB


In [4]:
# Preview the first five rows (the default for head()) before adding features.
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,mileage_kmpl,engine_cc
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000.0,23.1,998
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706.0,17.0,1497
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949.0,23.84,1199
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794.0,19.1,1197
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000.0,23.65,1248


FEATURE ENGINEERING

In [5]:
# STEP 1: age-related features

# Reference year for every age calculation.
# NOTE: taken from the system clock at run time, so the derived ages change
# when the notebook is re-run in a later calendar year.
current_year = datetime.now().year

registered = df['registered_year']
manufactured = df['model_year']

# Years elapsed since registration and since the model year, respectively.
df['car_age'] = registered.rsub(current_year)
df['model_age'] = manufactured.rsub(current_year)

# Years between the model year and the year the car was registered.
df['registration_lag'] = registered.sub(manufactured)

# Ratio of car_age to (model_age + 1); the +1 avoids a zero denominator
# when model_age is 0. Despite the column name, the numerator is car_age.
df['normalized_model_age'] = df['car_age'].div(df['model_age'].add(1))

In [6]:
# STEP 2: bucket kms_driven into labelled usage bands (column: kms_bins)

# Six edges define five right-closed intervals, one per label:
#   Unused   -> [0, 5000]        (0 itself is included via include_lowest=True)
#   Very_Low -> (5000, 10000]
#   Low      -> (10000, 50000]
#   Moderate -> (50000, 100000]
#   High     -> (100000, inf]
labels = ['Unused', 'Very_Low', 'Low', 'Moderate', 'High']
bins = [0, 5000, 10000, 50000, 100000, float('inf')]

kms_band = pd.cut(df['kms_driven'], bins, labels=labels, include_lowest=True)
df['kms_bins'] = kms_band

In [7]:
# STEP 3: mileage_normalized

# Scale fuel economy by the largest mileage_kmpl value in the frame, so the
# row(s) holding that maximum get exactly 1.0.
mileage = df['mileage_kmpl']
df['mileage_normalized'] = mileage.div(mileage.max())

In [8]:
# STEP 4: high_mileage flag

# "Mileage" here means distance on the odometer (kms_driven), not the
# fuel-economy column mileage_kmpl.
high_mileage_threshold = 150000  # strictly more than this many km counts as high

# 1 when kms_driven exceeds the threshold, otherwise 0.
exceeds_threshold = df['kms_driven'].gt(high_mileage_threshold)
df['high_mileage'] = exceeds_threshold.astype(int)

In [9]:
# STEP 5: multiple_owners flag

# 1 when owner_no is greater than 1, otherwise 0.
has_several_owners = df['owner_no'].gt(1)
df['multiple_owners'] = has_several_owners.astype(int)

In [10]:
# STEP 6: brand_popularity

# Despite the name, this feature is the mean `price` of each brand, computed
# over every row currently in `df`.
# NOTE(review): it is derived from `price` itself; if `price` is the modelling
# target, confirm this does not leak target information into the features.
price_by_brand = df['price'].groupby(df['brand'])
brand_avg_price = price_by_brand.mean().to_dict()

# Attach each row's brand-level mean price.
df['brand_popularity'] = df['brand'].map(brand_avg_price)

# Quick check of the new column next to its brand.
print(df.loc[:, ['brand', 'brand_popularity']].head())

     brand  brand_popularity
0   maruti     504517.464043
1     ford     730169.934641
2     tata     748643.178049
3  hyundai     595115.553404
4   maruti     504517.464043


In [11]:
# STEP 7: kms_per_year

# Distance driven divided by (car_age + 1); the +1 avoids a zero denominator
# when car_age is 0. Requires the car_age column created in STEP 1.
years_in_use = df['car_age'].add(1)
df['kms_per_year'] = df['kms_driven'].div(years_in_use)

In [12]:
# Show the first two rows to eyeball the newly added feature columns.
df.head(2)

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,...,car_age,model_age,registration_lag,normalized_model_age,kms_bins,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,...,9,9,0,0.9,High,0.648876,0,1,504517.464043,12000.0
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,...,6,6,0,0.857143,Low,0.477528,0,1,730169.934641,4672.285714


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, including the engineered ones.
df.describe()

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,mileage_kmpl,engine_cc,car_age,model_age,registration_lag,normalized_model_age,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
count,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0,8107.0
mean,1.36487,2016.450475,885495.0,2016.518071,59566.22,19.190692,1422.127421,7.481929,7.549525,0.067596,0.844578,0.539064,0.007648,0.29493,885495.0,7298.162929
std,0.635211,3.853358,989002.7,3.862395,74921.85,3.671887,465.210269,3.862395,3.853358,0.319016,0.088946,0.103143,0.087122,0.45604,740407.8,10446.455266
min,1.0,1985.0,28000.0,1985.0,101.0,7.08,624.0,1.0,1.0,0.0,0.105263,0.198876,0.0,0.0,73333.33,50.5
25%,1.0,2014.0,398000.0,2014.0,31218.0,17.0,1197.0,5.0,5.0,0.0,0.8,0.477528,0.0,0.0,504517.5,4506.570707
50%,1.0,2017.0,600000.0,2017.0,54305.0,18.9,1248.0,7.0,7.0,0.0,0.875,0.530899,0.0,0.0,595115.6,6547.0
75%,2.0,2019.0,925000.0,2019.0,80000.0,21.4,1498.0,10.0,10.0,0.0,0.909091,0.601124,0.0,1.0,748643.2,9090.909091
max,5.0,2023.0,9600000.0,2023.0,5500000.0,35.6,5000.0,39.0,39.0,16.0,0.975,1.0,1.0,1.0,6343750.0,785714.285714


In [14]:
# Same two-row preview as the earlier df.head(2) cell; `df` is not modified
# between the two, so this repeats that output.
df.head(2)

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,...,car_age,model_age,registration_lag,normalized_model_age,kms_bins,mileage_normalized,high_mileage,multiple_owners,brand_popularity,kms_per_year
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,...,9,9,0,0.9,High,0.648876,0,1,504517.464043,12000.0
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,...,6,6,0,0.857143,Low,0.477528,0,1,730169.934641,4672.285714


In [15]:
# List the final column names that will be written by the export cell.
df.columns

Index(['city', 'transmission', 'owner_no', 'brand', 'model', 'model_year',
       'variant_name', 'price', 'registered_year', 'fuel_type', 'kms_driven',
       'mileage_kmpl', 'engine_cc', 'car_age', 'model_age', 'registration_lag',
       'normalized_model_age', 'kms_bins', 'mileage_normalized',
       'high_mileage', 'multiple_owners', 'brand_popularity', 'kms_per_year'],
      dtype='object')

In [16]:
# Write the feature-engineered frame to CSV without the index column.
# The path uses a forward slash, matching the read_csv cell: it resolves on
# Windows, macOS and Linux alike, whereas a backslash is a literal filename
# character on POSIX systems and would drop the file outside the folder.
df.to_csv("SBS_Processed_Datasets/03_feature_engineered_output.csv", index=False)