In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/processed/data.csv')

In [3]:
df.sample(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
1189,Honda Activa DLX,2020,2100.0,1st,New Delhi,65000.0
1953,KTM RC 200 BS VI,2015,40000.0,1st,Bangalore,110000.0
3672,Royal,2018,11216.0,1st,Delhi,138000.0
1115,Hero Honda Passion STD,2001,34000.0,1st,East Delhi,23000.0
36,Honda Hornet 2.0 Standard,2020,4000.0,1st,Ravulapalem,120000.0
3342,TVS,2019,42019.0,1st,Delhi,74000.0
5188,Bajaj Pulsar 220F,2014,25300.0,1st,Bangalore,56000.0
4480,Royal Enfield Classic 350cc,2017,17000.0,1st,Rajkot,139500.0
4767,Bajaj Pulsar AS150,2015,20500.0,1st,Delhi,42000.0
3379,Honda,2020,3295.0,1st,Delhi,76000.0


## Handle model_name

### Let's take only first two words of model_name

In [4]:
df['model_name'] = df['model_name'].apply(lambda x: ' '.join(x.split()[:2]))

In [5]:
df.sample(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
4254,Honda CB,2015,27300.0,1st,Chennai,60000.0
3087,Honda,2018,33053.0,1st,Delhi,47000.0
1273,Honda Activa,2019,34000.0,1st,Faridabad,50000.0
4947,Bajaj Pulsar,2019,89000.0,3rd,Jaipur,51000.0
1545,Bajaj Avenger,2018,32000.0,1st,Mumbai,85000.0
1300,Royal Enfield,2010,25000.0,1st,New Delhi,60000.0
2427,Honda Activa,2019,16000.0,1st,Gandhi Nagar,22000.0
3198,Hero,2018,15803.0,1st,Delhi,49000.0
4927,Bajaj Pulsar,2015,23000.0,1st,Bangalore,38250.0
14,Royal Enfield,2013,57500.0,1st,Bangalore,100000.0


In [8]:
df.model_name.value_counts()

Royal Enfield               751
Bajaj Pulsar                655
Honda Activa                438
Honda                       321
TVS Apache                  302
                           ... 
Harley-Davidson Heritage      1
Harley-Davidson Forty         1
Piaggio Vespa                 1
Triumph Rocket                1
Mahindra Duro                 1
Name: model_name, Length: 200, dtype: int64

### There are too many models, let try to create a brand category using the model name.

In [9]:
df['brand'] = df['model_name'].apply(lambda x: x.split(' ')[0])

In [12]:
df.sample(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price,brand
2836,Honda,2018,81.0,1st,Delhi,59000.0,Honda
3056,TVS,2020,8431.0,1st,Delhi,83000.0,TVS
1765,Honda Activa,2018,10600.0,1st,Mumbai,25000.0,Honda
1367,Okinawa Ridge,2018,1200.0,1st,Gurgaon,8000.0,Okinawa
1579,Royal Enfield,2019,10338.0,1st,Mumbai,170000.0,Royal
1760,Hero Maestro,2020,14500.0,1st,Mumbai,32500.0,Hero
2614,TVS,2018,32303.0,1st,Delhi,47000.0,TVS
271,Honda Shine,2016,39500.0,1st,Redhills,39000.0,Honda
3765,Royal Enfield,2016,4050.0,1st,Bangalore,180000.0,Royal
1859,Honda Activa,2011,100000.0,2nd,Bangalore,16000.0,Honda


In [20]:
# Let's check how may brands we get
print('Unique Brands :',len(df['brand'].unique()))
df['brand'].value_counts()

Unique Brands : 28


Bajaj              1103
Honda              1061
Royal               845
Hero                815
TVS                 677
Yamaha              349
KTM                 309
Suzuki              144
Harley-Davidson      40
Jawa                 33
UM                   27
Kawasaki             16
Benelli              13
Triumph              11
Hyosung              10
Aprilia               9
Mahindra              8
Husqvarna             8
Ducati                6
Vespa                 6
BMW                   3
JAWA                  3
LML                   2
Okinawa               2
Fb                    1
Piaggio               1
Indian                1
22Kymco               1
Name: brand, dtype: int64

### Let's take only top 13 brands as our base brand & make other as 'other' category

In [26]:
top_brands = df['brand'].value_counts().index[:13]

In [28]:
df['brand'] = df['brand'].apply(lambda x: x if x in top_brands else 'other')

In [39]:
df.sample(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price,brand
2327,Honda Activa,2019,22000.0,1st,Hyderabad,24000.0,Honda
4788,Jawa Standard,2020,1003.0,1st,Hyderabad,175000.0,Jawa
4621,Hero Splendor,2021,4.0,1st,Delhi,74000.0,Hero
2672,Honda,2018,24467.0,1st,Delhi,59000.0,Honda
4309,Royal Enfield,2017,23000.0,2nd,Kolhapur,150000.0,Royal Enfield
3354,Honda,2018,2.0,1st,Delhi,49000.0,Honda
2746,Hero,2016,45039.0,1st,Delhi,45000.0,Hero
927,Hero Honda,2004,110000.0,1st,Ghaziabad,15000.0,Hero
951,Hero Karizma,2013,58000.0,1st,Bangalore,55000.0,Hero
3619,Honda,2016,5.0,1st,Delhi,41000.0,Honda


### Let's change the 'Royal' to 'Royal Enfield' as the actual brand

In [30]:
df['brand'] = df.brand.str.replace('Royal','Royal Enfield')

In [41]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price,brand
2085,Bajaj Pulsar,2013,20000.0,1st,Chennai,40000.0,Bajaj
15,Royal Enfield,1986,65000.0,2nd,Bangalore,135000.0,Royal Enfield
5024,Bajaj Pulsar,2019,24200.0,1st,Srinagar,130000.0,Bajaj
4439,Royal Enfield,2014,75000.0,1st,Gurgaon,65000.0,Royal Enfield
3645,Honda,2018,52367.0,1st,Delhi,53000.0,Honda


### Drop the model_name as we can work with the brand only

In [42]:
df = df.drop('model_name',axis=1)

## Handle owner

In [44]:
df.owner.unique()

array(['1st', '3rd', '2nd', '5th', '4th'], dtype=object)

### owner category is pretty much sorted, we can directly encode that using OrdinalEncoding during model building.

## Handle location

In [47]:
print('Total Unique Locations :',len(df.location.unique()))

Total Unique Locations : 430


In [53]:
df.location.value_counts()[:10]

Delhi        1590
Bangalore     559
Mumbai        525
Hyderabad     332
New Delhi     249
Kolkata       190
Jaipur        181
Ahmedabad     174
Chennai       168
Pune          146
Name: location, dtype: int64

### Let's take only top 10 locations as our base categories and make others as 'other' category

In [54]:
top_locations = df.location.value_counts().index[:10]

In [55]:
df['location'] = df.location.apply(lambda x: x if x in top_locations else 'other')

In [63]:
df.sample(10)

Unnamed: 0,model_year,kms_driven,owner,location,price,brand
5040,2021,2700.0,1st,Mumbai,220000.0,Royal Enfield
1630,2014,26000.0,1st,Mumbai,25000.0,Honda
4699,2014,34000.0,2nd,Chennai,85000.0,Royal Enfield
419,2019,7000.0,1st,other,80000.0,Honda
2905,2018,16695.0,1st,Delhi,136000.0,Royal Enfield
3459,2015,54779.0,1st,Delhi,33000.0,Honda
5307,2012,33000.0,1st,other,96900.0,Royal Enfield
791,2018,22600.0,1st,Pune,68500.0,Bajaj
5335,2017,16090.0,1st,Hyderabad,122400.0,Royal Enfield
2112,2018,15000.0,2nd,Chennai,110000.0,Yamaha


In [71]:
# Recorder the column
# new_order = ['brand','model_year','kms_driven','owner','location','price']
new_order = [5,0,1,2,3,4]
df = df[df.columns[new_order]]

In [72]:
df.sample()

Unnamed: 0,brand,model_year,kms_driven,owner,location,price
4213,Bajaj,2018,61000.0,3rd,Pune,60000


In [69]:
# make the price as int data type
df['price'] = df['price'].astype(int)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5504 entries, 0 to 5503
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_year  5504 non-null   int64  
 1   kms_driven  5504 non-null   float64
 2   owner       5504 non-null   object 
 3   location    5504 non-null   object 
 4   price       5504 non-null   int64  
 5   brand       5504 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 258.1+ KB


In [70]:
## Save the processed data
df.to_csv('../data/processed/processed.csv',index=False)