In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('../data/raw/data.csv')

In [4]:
df.shape

(8917, 10)

In [5]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
0,Royal Enfield Bullet Electra 350cc 2011,2011,38998 Km,first owner,bangalore,\n\n 40 kmpl,,19.8 Bhp,,95000
1,Jawa Perak 330cc 2020,2020,2000 Km,first owner,bangalore,\n\n,,30 bhp,,197500
2,Hero CD Dawn 100cc 2005,2005,28000 Km,first owner,ghaziabad,\n\n 72 kmpl,,7.0 bhp,,9000
3,KTM Duke 200cc 2012,2012,24561 Km,third owner,bangalore,\n\n 35 kmpl,,25 bhp,,63400
4,TVS Apache RTR 200 4V Dual Channel ABS BS6 2020,2020,Mileage 40 Kmpl,first owner,hyderabad,\n\n 40 Kmpl,,20.21 bhp,,130500


In [6]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
8907,Honda CB Trigger 150cc 2014,2014,10000 Km,first owner,mumbai,\n\n 60 kmpl,,14 bhp,,49000
8908,Bajaj Pulsar 220F 2017,2017,1500 Km,first owner,kanpur nagar,\n\n 38 kmpl,,21 bhp,,89000
8909,Bajaj Pulsar RS200 ABS 2015,2015,Mileage 35 Kmpl,first owner,delhi,\n\n 35 Kmpl,,24.16 bhp,,80000
8910,Bajaj Pulsar RS200 ABS 2016,2016,Mileage 35 Kmpl,first owner,bangalore,\n\n 35 Kmpl,,24.50 bhp,,120000
8911,Bajaj Avenger 220cc 2015,2015,38000 Km,first owner,mathura,\n\n 40 kmpl,,19 bhp,,55000
8912,Royal Enfield Thunderbird 500cc 2013,2013,40500 Km,first owner,pune,\n\n 25 kmpl,,27.20 bhp,,95000
8913,Bajaj Dominar 400 ABS 2017,2017,Mileage 28 Kms,first owner,pune,\n\n 28 Kms,,34.50 bhp,,147500
8914,KTM RC 390cc 2016,2016,1700 Km,first owner,delhi,\n\n 26kmpl,,42.30 bhp,,190000
8915,Bajaj Pulsar 200 NS 200cc 2014,2014,43000 Km,first owner,chennai,\n\n 40 kmpl,,24.2 BHP,,53000
8916,Bajaj Pulsar 180cc 2016,2016,19718 Km,first owner,bangalore,\n\n 65 kmpl,,17 bhp,,55000


In [7]:
df.shape

(8917, 10)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8917 entries, 0 to 8916
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  8917 non-null   object 
 1   model_year  8917 non-null   int64  
 2   kms_driven  8917 non-null   object 
 3   owner       8917 non-null   object 
 4   location    8891 non-null   object 
 5   mileage     8900 non-null   object 
 6   engine      0 non-null      float64
 7   power       8899 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       8917 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 696.8+ KB


## Data Cleaning

In [9]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [10]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
7826,Royal Enfield Classic 350cc 2017,2017,14779 Km,first owner,delhi,35 kmpl,,19.80 bhp,,105000
3641,TVS Apache RTR 200 4V Fi 2016,2016,Mileage 40 Kmpl,first owner,bangalore,40 Kmpl,,20.23 BHP,,75000
6073,Bajaj Platina 100cc 2017,2017,Mileage 104 Kmpl,first owner,faridabad,104 Kmpl,,8.10 bhp,,21000
7516,Royal Enfield Thunderbird 500cc 2016,2016,35006 Km,first owner,bangalore,25 kmpl,,27.20 bhp,,110000
7487,UM Renegade Commando Classic 2018,2018,10009 Km,first owner,patna,35 kmpl,,25 bhp,,160000


In [11]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [12]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [13]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [14]:
df.location.value_counts()[:10]

delhi        1624
mumbai        898
bangalore     851
pune          423
chennai       406
hyderabad     376
gurgaon       357
jaipur        354
ahmedabad     303
faridabad     192
Name: location, dtype: int64

In [15]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [16]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
2193,TVS Apache RTR 160 4V Disc 2018,2018,37000.0,first,delhi,,,16.10 bhp,,47025.0
2060,Bajaj Discover 100cc 2011,2011,,first,karauli,80 Kmpl,,7.7,,20000.0
1754,Yamaha YZF-R15 2.0 150cc 2014,2014,18000.0,fourth,bangalore,42 kmpl,,16.70 bhp,,57541.0
1852,Honda CB Shine 125cc Drum BS6 2020,2020,11037.0,first,jaipur,60 kmpl,,10.59 bhp,,50000.0
3220,Yamaha Fazer 150cc 2016,2016,,first,hyderabad,45 Kmpl,,13 bhp,,75000.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8917 entries, 0 to 8916
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  8917 non-null   object 
 1   model_year  8917 non-null   int64  
 2   kms_driven  6500 non-null   float64
 3   owner       8917 non-null   object 
 4   location    8917 non-null   object 
 5   mileage     8900 non-null   object 
 6   engine      0 non-null      float64
 7   power       8899 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       8882 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 696.8+ KB


### Handle Mileage

In [18]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

In [19]:
df.drop('engine',axis=1,inplace=True)

### Hanlde power

In [20]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [21]:
df.drop('wheel_size',axis=1,inplace=True)

In [22]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
5293,Bajaj Avenger Street 160 ABS 2019,2019,32000.0,first,chennai,,15.3,75000.0
1422,Suzuki Gixxer SF 150cc Rear Disc 2016,2016,24500.0,third,delhi,40.0,14.6,43000.0
5700,Royal Enfield Classic 350cc 2016,2016,64000.0,first,mumbai,35.0,19.8,110000.0
2470,Royal Enfield Thunderbird 350cc 2017,2017,24745.0,first,bangalore,40.0,19.8,125000.0
3742,KTM Duke 200cc 2013,2013,20000.0,third,mumbai,35.0,25.0,70000.0


In [23]:
df.duplicated().sum()

110

In [24]:
df = df[~df.duplicated()]

In [25]:
df.shape

(8807, 8)

## Handle Missing Values

In [26]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    2360
owner            0
location         0
mileage        748
power            0
price           35
dtype: int64

## Fix column type

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8807 entries, 0 to 8916
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  8807 non-null   object 
 1   model_year  8807 non-null   int64  
 2   kms_driven  6447 non-null   float64
 3   owner       8807 non-null   object 
 4   location    8807 non-null   object 
 5   mileage     8059 non-null   object 
 6   power       8807 non-null   object 
 7   price       8772 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 619.2+ KB


In [28]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [29]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8807 entries, 0 to 8916
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  8807 non-null   object 
 1   model_year  8807 non-null   int64  
 2   kms_driven  6447 non-null   float64
 3   owner       8807 non-null   object 
 4   location    8807 non-null   object 
 5   mileage     7764 non-null   float64
 6   power       8320 non-null   float64
 7   price       8772 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 619.2+ KB


In [31]:
df.shape

(8807, 8)

In [35]:
cols = ['kms_driven', 'price']
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [None]:
df.to_csv('../data/processed/data.csv',index=False)