In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/raw/data.csv')

In [3]:
df.shape

(7924, 10)

In [4]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
0,Yamaha SZR 150cc 2012,2012,34000 Km,first owner,bengaluru,\n\n,,12 bhp,,24000
1,Hero CD Dawn 100cc 2005,2005,28000 Km,first owner,ghaziabad,\n\n 72 kmpl,,7.0 bhp,,10000
2,Bajaj Avenger Street 220 2017,2017,5947 Km,first owner,chennai,\n\n 53 kmpl,,19 bhp,,80000
3,Jawa Standard 295CC Dual ABS BS6 2020,2020,2700 Km,first owner,mumbai,\n\n Liquid Cooled,,27 bhp,,160000
4,Hero Xpulse 200cc BS6 2021,2021,628 Km,first owner,gwalior,\n\n 46.40 kmpl,,17.8 bhp,,110000


In [5]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
7914,KTM RC 200cc 2017,2017,15000 Km,first owner,thane,\n\n 35 kmpl,,25.10 bhp,,115000
7915,Yamaha FZ16 150cc 2013,2013,Mileage 58 Kmpl,first owner,delhi,\n\n 58 Kmpl,,13 bhp,,30000
7916,Bajaj Pulsar 150cc 2011,2011,Mileage 65 Kmpl,first owner,faridabad,\n\n 65 Kmpl,,14.85 bhp,,21100
7917,Bajaj Avenger Street 150 2017,2017,9000 Km,first owner,lansdowne,\n\n 45 kmpl,,14.30 bhp,,70000
7918,Bajaj Avenger 220cc 2015,2015,7600 Km,first owner,bangalore,\n\n 40 kmpl,,19 bhp,,60000
7919,Bajaj Pulsar NS200 2013,2013,Mileage 35 Kmpl,first owner,navi mumbai,\n\n 35 Kmpl,,23.19 bhp,,45999
7920,Bajaj Dominar 400 ABS 2017,2017,Mileage 28 Kms,first owner,delhi,\n\n 28 Kms,,34.50 bhp,,140000
7921,Bajaj Pulsar RS200 ABS 2017,2017,Mileage 35 Kmpl,first owner,asansol,\n\n 35 Kmpl,,24.50 bhp,,130000
7922,Yamaha Fazer FI V 2.0 150cc 2016,2016,Mileage 45 Kmpl,first owner,bangalore,\n\n 45 Kmpl,,13 bhp,,70000
7923,Yamaha YZFR15 2.0 150 2016,2016,24500 Km,first owner,villupuram,\n\n 42 kmpl,,16.70 bhp,,90000


In [6]:
df.shape

(7924, 10)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7924 entries, 0 to 7923
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7924 non-null   object 
 1   model_year  7924 non-null   int64  
 2   kms_driven  7924 non-null   object 
 3   owner       7924 non-null   object 
 4   location    7908 non-null   object 
 5   mileage     7908 non-null   object 
 6   engine      0 non-null      float64
 7   power       7905 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       7924 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 619.2+ KB


## Data Cleaning

In [8]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [9]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
1095,Yamaha FZS-FI V 3.0 150cc ABS BS6 2019,2019,50 Km,first owner,chennai,,,12.2 bhp,,104500
629,Royal Enfield Himalayan 410cc 2019,2019,8000 Km,first owner,bangalore,32 kmpl,,24.50 bhp,,180000
5841,Yamaha MT-15 150cc 2019,2019,1500 Km,first owner,delhi,,,19 bhp,,125000
4503,Yamaha YZF-R15 S 150cc 2017,2017,12500 Km,first owner,mumbai,42 kmpl,,16 bhp,,85000
6075,Honda CBR 150R 150cc 2014,2014,14600 Km,first owner,vadodara,38 kmpl,,18.28 bhp,,59000


In [10]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    3. fix the val which is represented as lakh
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace('₹','')

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [11]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [12]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [13]:
df.location.value_counts()[:10]

delhi        1509
bangalore     781
mumbai        728
jaipur        389
pune          353
chennai       342
gurgaon       314
hyderabad     310
ahmedabad     270
faridabad     181
Name: location, dtype: int64

In [14]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [15]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
842,Honda CB Hornet 160R STD 2016,2016,,first,chennai,52 Kmpl,,15.44 bhp,,47000.0
7175,Bajaj Platina Alloy ES-100cc 2018,2018,,first,faridabad,104 Kmpl,,8.20 bhp,,32000.0
2812,Royal Enfield Thunderbird 350cc 2016,2016,7200.0,first,kalyan,40 kmpl,,19.80 bhp,,130000.0
5036,Bajaj Pulsar RS200 2016,2016,,first,pune,35 Kmpl,,24.50 bhp,,108000.0
2999,Yamaha FZs 150cc 2013,2013,24028.0,first,bangalore,45 kmpl,,13 bhp,,38000.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7924 entries, 0 to 7923
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7924 non-null   object 
 1   model_year  7924 non-null   int64  
 2   kms_driven  5734 non-null   float64
 3   owner       7924 non-null   object 
 4   location    7924 non-null   object 
 5   mileage     7908 non-null   object 
 6   engine      0 non-null      float64
 7   power       7905 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       7889 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 619.2+ KB


### Handle Mileage

In [17]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

In [18]:
df.drop('engine',axis=1,inplace=True)

### Hanlde power

In [19]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [20]:
df.drop('wheel_size',axis=1,inplace=True)

In [21]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
3118,Yamaha FZs 150cc 2015,2015,15678.0,first,hyderabad,45.0,13,59000.0
7255,Bajaj Pulsar 220cc 2014,2014,25000.0,first,rohtak,38.0,21,40000.0
3289,Bajaj Pulsar 180cc 2014,2014,33540.0,second,bangalore,45.0,17,35000.0
7807,Yamaha YZF-R15 S 150cc 2017,2017,3800.0,first,bangalore,42.0,16,125000.0
528,Bajaj Pulsar NS160 2019,2019,77000.0,second,delhi,,15.5 ps,55900.0


In [22]:
df.duplicated().sum()

108

In [23]:
df = df[~df.duplicated()]

In [24]:
df.shape

(7816, 8)

## Handle Missing Values

In [25]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    2131
owner            0
location         0
mileage        669
power            0
price           35
dtype: int64

## Fix column type

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7816 entries, 0 to 7923
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7816 non-null   object 
 1   model_year  7816 non-null   int64  
 2   kms_driven  5685 non-null   float64
 3   owner       7816 non-null   object 
 4   location    7816 non-null   object 
 5   mileage     7147 non-null   object 
 6   power       7816 non-null   object 
 7   price       7781 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 549.6+ KB


In [27]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [28]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7816 entries, 0 to 7923
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7816 non-null   object 
 1   model_year  7816 non-null   int64  
 2   kms_driven  5685 non-null   float64
 3   owner       7816 non-null   object 
 4   location    7816 non-null   object 
 5   mileage     6887 non-null   float64
 6   power       7373 non-null   float64
 7   price       7781 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 549.6+ KB


In [31]:
df.shape

(7816, 8)

In [30]:
df.to_csv('../data/processed/data.csv',index=False)