In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/raw/data.csv')

In [3]:
df.shape

(9630, 10)

In [4]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
0,Bajaj Pulsar 180cc 2016,2016,19718 Km,first owner,bangalore,\n\n 65 kmpl,,17 bhp,,55000
1,Jawa Perak 330cc 2020,2020,2000 Km,first owner,bangalore,\n\n,,30 bhp,,197500
2,TVS Apache RTR 200 4V Dual Channel ABS BS6 2020,2020,Mileage 40 Kmpl,first owner,hyderabad,\n\n 40 Kmpl,,20.21 bhp,,130500
3,KTM Duke 200cc 2012,2012,24561 Km,third owner,bangalore,\n\n 35 kmpl,,25 bhp,,63400
4,Royal Enfield Bullet Electra 350cc 2011,2011,38998 Km,first owner,bangalore,\n\n 40 kmpl,,19.8 Bhp,,95000


In [5]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
9620,Husqvarna Svartpilen 250 2020,2020,7000 Km,first owner,mumbai,\n\n 31.85 kmpl,,29.63 bhp,,197125
9621,KTM RC 200cc ABS BS6 2021,2021,5700 Km,first owner,mumbai,\n\n 35 kmpl,,24.6 bhp,,207500
9622,Bajaj Dominar 400 2017,2017,16000 Km,first owner,bangalore,,,"34.50 bhp @ 8,000 rpm",,150000
9623,Royal Enfield Classic 350cc ABS BS6 2020,2020,9000 Km,first owner,mumbai,\n\n 35 kmpl,,19.1 bhp,,207500
9624,Bajaj Pulsar 220cc 2017,2017,20864 Km,first owner,delhi,\n\n 38 kmpl,,21 bhp,,55000
9625,Royal Enfield Thunderbird X 350cc ABS 2019,2019,9000 Km,first owner,mumbai,\n\n,,19.80 bhp,,166000
9626,Yamaha SZ RR V 2.0 150cc 2013,2013,37788 Km,first owner,siliguri,,,"11.90 bhp @ 7,500 rpm",,43000
9627,Honda CBF Stunner 125cc 2011,2011,22000 Km,first owner,mohali,,,11 Bhp @ 8000 rpm,,21000
9628,Honda CB ShineSP 125cc Disc 2018,2018,Mileage 65 Kmpl,first owner,kukatpally,\n\n 65 Kmpl,,10 bhp,,70000
9629,Honda CB Hornet 160R ABS DLX 2019,2019,Mileage 52 Kmpl,first owner,patna,\n\n 52 Kmpl,,15.44 bhp,,82000


In [6]:
df.shape

(9630, 10)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9630 entries, 0 to 9629
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  9630 non-null   object 
 1   model_year  9630 non-null   int64  
 2   kms_driven  9630 non-null   object 
 3   owner       9630 non-null   object 
 4   location    9605 non-null   object 
 5   mileage     9606 non-null   object 
 6   engine      0 non-null      float64
 7   power       9609 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       9630 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 752.5+ KB


## Data Cleaning

In [8]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [9]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
5790,Honda CB Shine 125cc Disc 2016,2016,Mileage 65 Kmpl,first owner,coimbatore,65 Kmpl,,10 bhp,,45000
2094,Royal Enfield Thunderbird 350cc 2018,2018,17225 Km,first owner,delhi,40 kmpl,,19.80 bhp,,126500
8917,Bajaj Pulsar RS200 2015,2015,Mileage 35 Kmpl,first owner,pune,35 Kmpl,,24.16 bhp,,80000
9515,Bajaj V15 150cc 2016,2016,5000 Km,first owner,ludhiana,57 kmpl,,11.80 bhp,,39000
5736,Yamaha SZ 150cc 2012,2012,172082 Km,first owner,vidisha,,,12 bhp,,25000


In [10]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [11]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [12]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [13]:
df.location.value_counts()[:10]

delhi        1790
mumbai        943
bangalore     930
pune          436
chennai       422
hyderabad     416
gurgaon       380
jaipur        376
ahmedabad     309
faridabad     214
Name: location, dtype: int64

In [14]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [15]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
2233,UM Renegade Commando 2017,2017,9000.0,first,delhi,35 kmpl,,24.80 bhp,,130800.0
2145,Hero HF Deluxe Eco 100cc 2018,2018,6750.0,first,ahmednagar,65 kmpl,,8.10 bhp,,22500.0
2178,Bajaj Pulsar 150cc 2018,2018,,first,bangalore,65 Kmpl,,15 bhp,,75000.0
5577,TVS Apache RTR 200 4V ABS Race Edition 2018,2018,,first,delhi,40 Kmpl,,20.23 BHP,,95000.0
3689,KTM Duke 200cc 2017,2017,23000.0,first,dhamtari,35 kmpl,,24.60 bhp,,95000.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9630 entries, 0 to 9629
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  9630 non-null   object 
 1   model_year  9630 non-null   int64  
 2   kms_driven  6994 non-null   float64
 3   owner       9630 non-null   object 
 4   location    9630 non-null   object 
 5   mileage     9606 non-null   object 
 6   engine      0 non-null      float64
 7   power       9609 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       9590 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 752.5+ KB


### Handle Mileage

In [17]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

In [18]:
df.drop('engine',axis=1,inplace=True)

### Hanlde power

In [19]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [20]:
df.drop('wheel_size',axis=1,inplace=True)

In [21]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
6671,Royal Enfield Classic 350cc 2017,2017,23000.0,first,faridabad,35,19.8,109999.0
1591,KTM RC 390cc 2016,2016,26000.0,first,bangalore,26,42.3,128000.0
5060,TVS Apache RTR 200 4V ABS 2018,2018,,first,bangalore,40,20.23,100389.0
4180,UM Renegade Commando 2018,2018,8000.0,first,lucknow,35,24.8,144500.0
7072,Royal Enfield Thunderbird 500cc 2017,2017,5192.0,first,mumbai,25,27.2,155000.0


In [22]:
df.duplicated().sum()

124

In [23]:
df = df[~df.duplicated()]

In [24]:
df.shape

(9506, 8)

## Handle Missing Values

In [25]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    2563
owner            0
location         0
mileage        799
power            0
price           40
dtype: int64

## Fix column type

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9506 entries, 0 to 9629
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  9506 non-null   object 
 1   model_year  9506 non-null   int64  
 2   kms_driven  6943 non-null   float64
 3   owner       9506 non-null   object 
 4   location    9506 non-null   object 
 5   mileage     8707 non-null   object 
 6   power       9506 non-null   object 
 7   price       9466 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 668.4+ KB


In [27]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [28]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9506 entries, 0 to 9629
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  9506 non-null   object 
 1   model_year  9506 non-null   int64  
 2   kms_driven  6943 non-null   float64
 3   owner       9506 non-null   object 
 4   location    9506 non-null   object 
 5   mileage     8385 non-null   float64
 6   power       8958 non-null   float64
 7   price       9466 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 668.4+ KB


In [30]:
df.shape

(9506, 8)

In [31]:
cols = ['kms_driven', 'price']
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [32]:
df.to_csv('../data/processed/data.csv',index=False)