In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/raw/data.csv')

In [3]:
df.shape

(38486, 8)

In [4]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
0,Bajaj Avenger Cruise 220 2017,2017,17000 Km,first owner,hyderabad,\n\n 35 kmpl,19 bhp,63500
1,Royal Enfield Classic 350cc 2016,2016,50000 Km,first owner,hyderabad,\n\n 35 kmpl,19.80 bhp,115000
2,Hyosung GT250R 2012,2012,14795 Km,first owner,hyderabad,\n\n 30 kmpl,28 bhp,300000
3,Bajaj Dominar 400 ABS 2017,2017,Mileage 28 Kms,first owner,pondicherry,\n\n 28 Kms,34.50 bhp,100000
4,Jawa Perak 330cc 2020,2020,2000 Km,first owner,bangalore,\n\n,30 bhp,197500


In [5]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
38476,TVS Apache RTR 160cc 2014,2014,Mileage 60 Kmpl,first owner,chennai,\n\n 60 Kmpl,15.2 bhp,30000
38477,Yamaha FZs 150cc 2014,2014,14326 Km,first owner,chennai,\n\n 45 kmpl,13 bhp,48000
38478,Yamaha FZs 150cc 2010,2010,9614 Km,first owner,delhi,\n\n 45 kmpl,13 bhp,24000
38479,Hero CBZ Xtreme 150cc 2011,2011,9500 Km,first owner,delhi,\n\n 65kmpl,14.4PS,21000
38480,Hero Passion Pro 100cc 2017,2017,22000 Km,first owner,delhi,\n\n 5 kmpl,8.2 Bhp,39000
38481,Bajaj V12 125cc 2017,2017,15621 Km,first owner,delhi,\n\n 57 kmpl,12bhp,35000
38482,Bajaj Discover 125cc 2012,2012,16206 Km,first owner,faridabad,\n\n 65 kmpl,12.5 ps,19000
38483,Bajaj Dominar 400 2017,2017,Mileage 28 Kms,first owner,delhi,\n\n 28 Kms,34.50 bhp,114000
38484,UM Renegade Commando Classic 2018,2018,2911 Km,first owner,delhi,\n\n 35 kmpl,25 bhp,165000
38485,Royal Enfield Bullet Twinspark 350cc 2015,2015,Mileage 37 Kmpl,first owner,delhi,\n\n 37 Kmpl,19.80 bhp,87500


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38486 entries, 0 to 38485
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   model_name  38486 non-null  object
 1   model_year  38486 non-null  int64 
 2   kms_driven  38486 non-null  object
 3   owner       38486 non-null  object
 4   location    38462 non-null  object
 5   mileage     38473 non-null  object
 6   power       37851 non-null  object
 7   price       38486 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 2.3+ MB


## Data Cleaning

In [7]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [8]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
18547,Bajaj Platina 100cc 2014,2014,Mileage 104 Kmpl,first owner,gurgaon,104 Kmpl,8.20 bhp,20700
7573,Hero Glamour 125cc 2005,2005,12000 Km,first owner,gurgaon,55 kmpl,8.90 bhp,10000
25384,Royal Enfield Bullet Electra 350cc 2012,2012,38000 Km,first owner,delhi,40 kmpl,19.8 Bhp,85000
29929,Suzuki Gixxer 150cc 2016,2016,18000 Km,second owner,delhi,63kmpl,14 bhp,45000
15927,Hero Passion Plus 100cc 2006,2006,17999 Km,third owner,delhi,55 kmpl,7.5 Bhp,14500


In [9]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [10]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [11]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [12]:
df.location.value_counts()[:10]

delhi        10662
chennai       6196
mumbai        3514
faridabad     1828
jaipur        1515
vadodara      1241
bangalore     1205
ludhiana      1161
gurgaon       1081
pune           903
Name: location, dtype: int64

In [13]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [14]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
10362,Honda Dream Yuga 110cc 2015,2015,20000.0,first,ludhiana,84 kmpl,8 bhp,26500.0
2561,Bajaj Avenger Street 220 2017,2017,10800.0,first,delhi,53 kmpl,19 bhp,50000.0
36137,TVS Apache RTR 200 4V Carburetor 2018,2018,,first,delhi,40 Kmpl,20.70 bhp,85000.0
9580,TVS Apache RTR 160cc Matt Red Rear Disc 2019,2019,,first,delhi,60 Kmpl,15 bhp,76000.0
11178,Honda CB Shine 125cc 2011,2011,,first,chennai,65 Kmpl,10 bhp,18000.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38486 entries, 0 to 38485
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  38486 non-null  object 
 1   model_year  38486 non-null  int64  
 2   kms_driven  26660 non-null  float64
 3   owner       38486 non-null  object 
 4   location    38486 non-null  object 
 5   mileage     38473 non-null  object 
 6   power       37851 non-null  object 
 7   price       36479 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 2.3+ MB


### Handle Mileage

In [16]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

### Hanlde power

In [17]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [18]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
17424,Royal Enfield Bullet Electra 350cc 2003,2003,29359.0,first,delhi,30,19.8,63375.0
30596,Honda CBR 150R 150cc 2015,2015,,first,pune,38,18.28,65000.0
16899,TVS Apache RTR 160cc 2008,2008,,first,ahmedabad,60,15.2,22000.0
6267,Benelli TNT 300 2016,2016,32000.0,first,mumbai,25,37.73,218100.0
1188,Bajaj Avenger Cruise 220 2016,2016,13589.0,first,bangalore,35,19.0,64674.0


In [19]:
df.duplicated().sum()

30629

In [20]:
df = df[~df.duplicated()]

In [21]:
df.shape

(7857, 8)

## Handle Missing Values

In [22]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    1988
owner            0
location         0
mileage        727
power            0
price           31
dtype: int64

## Fix column type

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7857 entries, 0 to 37635
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7857 non-null   object 
 1   model_year  7857 non-null   int64  
 2   kms_driven  5869 non-null   float64
 3   owner       7857 non-null   object 
 4   location    7857 non-null   object 
 5   mileage     7130 non-null   object 
 6   power       7857 non-null   object 
 7   price       7826 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 552.4+ KB


In [24]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [25]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7857 entries, 0 to 37635
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7857 non-null   object 
 1   model_year  7857 non-null   int64  
 2   kms_driven  5869 non-null   float64
 3   owner       7857 non-null   object 
 4   location    7857 non-null   object 
 5   mileage     6883 non-null   float64
 6   power       7429 non-null   float64
 7   price       7826 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 552.4+ KB


In [27]:
df.shape

(7857, 8)

In [28]:
cols = ['kms_driven', 'price','mileage','power']
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [29]:
df.to_csv('../data/processed/data.csv',index=False)