In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Load the cleaned dataset written by the previous processing stage.
cleaned_csv_path = r"SBS_Processed_Datasets/02_cleaned_output.csv"
df = pd.read_csv(cleaned_csv_path)

In [3]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8189 entries, 0 to 8188
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   city             8189 non-null   object 
 1   transmission     8189 non-null   object 
 2   owner_no         8189 non-null   int64  
 3   brand            8189 non-null   object 
 4   model            8189 non-null   object 
 5   model_year       8189 non-null   int64  
 6   variant_name     8189 non-null   object 
 7   price            8189 non-null   float64
 8   registered_year  8189 non-null   int64  
 9   fuel_type        8189 non-null   object 
 10  kms_driven       8189 non-null   int64  
 11  engine_cc        8189 non-null   int64  
 12  mileage_kmpl     8189 non-null   float64
dtypes: float64(2), int64(5), object(6)
memory usage: 831.8+ KB


In [4]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000,998,23.1
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706,1497,17.0
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949,1199,23.84
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794,1197,19.1
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000,1248,23.65


FEATURE ENGINEERING

In [5]:
#STEP_1 ADD AGE COLUMN

# Car age in whole years: the year this notebook is run minus the year the
# car was registered.
# NOTE(review): the result depends on the run date, so re-running in a later
# calendar year shifts every age (and every feature derived from it) by one.
current_year = datetime.today().year
registration_years = df['registered_year']
df['car_age'] = current_year - registration_years

In [6]:
#STEP_2 Create mileage_normalized

# Mileage (km/l) divided by the car's age in years.
# A car registered in the current year has car_age == 0, and dividing by it
# would produce inf, which most downstream models reject. Flooring the divisor
# at 1 keeps the feature finite; rows with car_age >= 1 are unaffected.
age_divisor = df['car_age'].clip(lower=1)
df['mileage_normalized'] = df['mileage_kmpl'] / age_divisor

In [7]:
#STEP_3 brand_popularity

# Despite its name, this feature is the mean price of all listings that share
# the row's brand, broadcast back onto every row of that brand.
# NOTE(review): it is derived from `price`; if price is the prediction target
# downstream, computing this over the full dataset leaks target information.
mean_price_by_brand = df.groupby('brand')['price'].mean()
brand_popularity = mean_price_by_brand.to_dict()
df['brand_popularity'] = df['brand'].map(brand_popularity)

In [8]:
#STEP_5 Age Bins

# Bucket car_age into three ordered categories:
#   [0, 2]   -> 'new'
#   (2, 7]   -> 'mid-age'
#   (7, inf) -> 'old'
age_bins = [0, 2, 7, float('inf')]
age_labels = ['new', 'mid-age', 'old']
# include_lowest=True closes the first bin on the left, so a car registered in
# the current year (car_age == 0) is labelled 'new' instead of becoming NaN.
df['age_group'] = pd.cut(
    df['car_age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

In [9]:
#STEP_6 High Mileage Flag

# 1 when a car's mileage is strictly above the 75th percentile of
# mileage_kmpl across the whole dataset, otherwise 0.
mileage_q3 = df['mileage_kmpl'].quantile(0.75)
exceeds_q3 = df['mileage_kmpl'] > mileage_q3
df['high_mileage'] = exceeds_q3.astype(int)

In [10]:
# Preview the first rows again, now including the engineered columns.
df.head()

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl,car_age,mileage_normalized,brand_popularity,age_group,high_mileage
0,bangalore,manual,3,maruti,maruti celerio,2015,vxi,400000.0,2015,petrol,120000,998,23.1,9,2.566667,504649.260948,old,1
1,bangalore,manual,2,ford,ford ecosport,2018,1.5 petrol titanium bsiv,811000.0,2018,petrol,32706,1497,17.0,6,2.833333,730169.934641,mid-age,0
2,bangalore,manual,1,tata,tata tiago,2018,1.2 revotron xz,585000.0,2018,petrol,11949,1199,23.84,6,3.973333,748643.178049,mid-age,1
3,bangalore,manual,1,hyundai,hyundai xcent,2014,1.2 kappa s option,462000.0,2014,petrol,17794,1197,19.1,10,1.91,605200.000615,old,0
4,bangalore,manual,1,maruti,maruti sx4 s cross,2015,ddis 200 zeta,790000.0,2015,diesel,60000,1248,23.65,9,2.627778,504649.260948,old,1


In [11]:
# Summary statistics for the numeric columns, including the new features.
df.describe()

Unnamed: 0,owner_no,model_year,price,registered_year,kms_driven,engine_cc,mileage_kmpl,car_age,mileage_normalized,brand_popularity,high_mileage
count,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0
mean,1.36256,2016.492246,908529.6,2016.559409,59210.17,1426.188668,19.202011,7.440591,3.634105,908529.6,0.248016
std,0.633744,3.879895,1045562.0,3.888378,74659.23,468.752493,4.03772,3.888378,2.822979,784603.8,0.431887
min,1.0,1985.0,28000.0,1985.0,101.0,72.0,7.08,1.0,0.328205,73333.33,0.0
25%,1.0,2014.0,399000.0,2014.0,30538.0,1197.0,17.0,5.0,1.933333,504649.3,0.0
50%,1.0,2017.0,600000.0,2017.0,54000.0,1248.0,18.9,7.0,2.833333,605200.0,0.0
75%,2.0,2019.0,949000.0,2019.0,80000.0,1498.0,21.4,10.0,4.28,748643.2,0.0
max,5.0,2023.0,9600000.0,2023.0,5500000.0,5000.0,140.0,39.0,35.6,6343750.0,1.0


In [12]:
# Inspect rows whose engine displacement is below 100 cc.
tiny_engine_mask = df["engine_cc"] < 100
df[tiny_engine_mask]

Unnamed: 0,city,transmission,owner_no,brand,model,model_year,variant_name,price,registered_year,fuel_type,kms_driven,engine_cc,mileage_kmpl,car_age,mileage_normalized,brand_popularity,age_group,high_mileage
562,bangalore,automatic,1,mahindra,mahindra e2o plus,2017,p6,550000.0,2017,electric,20000,72,110.0,7,15.714286,930921.126761,mid-age,1
817,bangalore,automatic,2,mahindra,mahindra e2o plus,2017,p8,460000.0,2017,electric,20000,72,140.0,7,20.0,930921.126761,mid-age,1


In [13]:
# Export the feature-engineered dataset (without the index column) for the
# next stage. The path uses a forward slash, matching the input path used when
# loading: a backslash is only a separator on Windows, and on Linux/macOS it
# would create a file literally named "SBS_Processed_Datasets\03_..." in the
# working directory instead of writing into the folder.
df.to_csv(r"SBS_Processed_Datasets/03_feature_engineered_output.csv", index=False)