In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [92]:
df = pd.read_csv('../data/raw/data.csv')

In [93]:
df.shape

(38486, 8)

In [94]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
0,Bajaj Avenger Cruise 220 2017,2017,17000 Km,first owner,hyderabad,\n\n 35 kmpl,19 bhp,63500
1,Royal Enfield Classic 350cc 2016,2016,50000 Km,first owner,hyderabad,\n\n 35 kmpl,19.80 bhp,115000
2,Hyosung GT250R 2012,2012,14795 Km,first owner,hyderabad,\n\n 30 kmpl,28 bhp,300000
3,Bajaj Dominar 400 ABS 2017,2017,Mileage 28 Kms,first owner,pondicherry,\n\n 28 Kms,34.50 bhp,100000
4,Jawa Perak 330cc 2020,2020,2000 Km,first owner,bangalore,\n\n,30 bhp,197500


In [95]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
38476,TVS Apache RTR 160cc 2014,2014,Mileage 60 Kmpl,first owner,chennai,\n\n 60 Kmpl,15.2 bhp,30000
38477,Yamaha FZs 150cc 2014,2014,14326 Km,first owner,chennai,\n\n 45 kmpl,13 bhp,48000
38478,Yamaha FZs 150cc 2010,2010,9614 Km,first owner,delhi,\n\n 45 kmpl,13 bhp,24000
38479,Hero CBZ Xtreme 150cc 2011,2011,9500 Km,first owner,delhi,\n\n 65kmpl,14.4PS,21000
38480,Hero Passion Pro 100cc 2017,2017,22000 Km,first owner,delhi,\n\n 5 kmpl,8.2 Bhp,39000
38481,Bajaj V12 125cc 2017,2017,15621 Km,first owner,delhi,\n\n 57 kmpl,12bhp,35000
38482,Bajaj Discover 125cc 2012,2012,16206 Km,first owner,faridabad,\n\n 65 kmpl,12.5 ps,19000
38483,Bajaj Dominar 400 2017,2017,Mileage 28 Kms,first owner,delhi,\n\n 28 Kms,34.50 bhp,114000
38484,UM Renegade Commando Classic 2018,2018,2911 Km,first owner,delhi,\n\n 35 kmpl,25 bhp,165000
38485,Royal Enfield Bullet Twinspark 350cc 2015,2015,Mileage 37 Kmpl,first owner,delhi,\n\n 37 Kmpl,19.80 bhp,87500


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38486 entries, 0 to 38485
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   model_name  38486 non-null  object
 1   model_year  38486 non-null  int64 
 2   kms_driven  38486 non-null  object
 3   owner       38486 non-null  object
 4   location    38462 non-null  object
 5   mileage     38473 non-null  object
 6   power       37851 non-null  object
 7   price       38486 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 2.3+ MB


## Data Cleaning

In [97]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [98]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
9315,Royal Enfield Thunderbird 500cc 2017,2017,900 Km,first owner,roorkee,25 kmpl,27.20 bhp,150000
22521,Suzuki Gixxer 150cc 2016,2016,18000 Km,second owner,delhi,63kmpl,14 bhp,45000
26568,Royal Enfield Classic 350cc 2010,2010,3000 Km,first owner,chennai,35 kmpl,19.80 Bhp,89889
19596,Bajaj Pulsar 200cc 2013,2013,25000 Km,first owner,mumbai,,,37000
23444,Bajaj Pulsar AS200 2014,2014,Mileage 42 Kmpl,second owner,bangalore,42 Kmpl,23.20 bhp,45000


In [99]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [100]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [101]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [102]:
df.location.value_counts()[:10]

delhi        10662
chennai       6196
mumbai        3514
faridabad     1828
jaipur        1515
vadodara      1241
bangalore     1205
ludhiana      1161
gurgaon       1081
pune           903
Name: location, dtype: int64

In [103]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [104]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
20916,Hero Splendor Plus 100 CC 2014,2014,10000.0,first,jaipur,,,35000.0
10744,Hero Splendor 100cc 2002,2002,55000.0,first,hyderabad,45 kmpl,7.44bhp,20000.0
22352,Bajaj Discover 125cc 2012,2012,16206.0,first,faridabad,65 kmpl,12.5 ps,19000.0
2041,Bajaj Pulsar NS160 2018,2018,71000.0,second,jaipur,,15.5 PS,59000.0
7381,Hero Glamour 125cc 2014,2014,35000.0,first,bhubaneshwar,55 kmpl,8.90 bhp,41000.0


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38486 entries, 0 to 38485
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  38486 non-null  object 
 1   model_year  38486 non-null  int64  
 2   kms_driven  26660 non-null  float64
 3   owner       38486 non-null  object 
 4   location    38486 non-null  object 
 5   mileage     38473 non-null  object 
 6   power       37851 non-null  object 
 7   price       36479 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 2.3+ MB


### Handle Mileage

In [106]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

### Hanlde power

In [107]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [108]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
33195,TVS Apache RTR 160cc 2011,2011,,first,chennai,60,15.2,
2001,Yamaha YZF-R15 V3 150CC Dual Channel ABS BS6 2020,2020,15000.0,first,gurgaon,40,18.3,130000.0
22879,Bajaj V12 125cc 2017,2017,15621.0,first,delhi,57,12,35000.0
18836,Royal Enfield Bullet Twinspark 350cc 2013,2013,,first,rohtak,37,19.80,52000.0
2079,Bajaj Discover 150cc 2010,2010,,first,delhi,70,13.0 ps,16500.0


In [109]:
df.duplicated().sum()

30629

In [110]:
df = df[~df.duplicated()]

In [111]:
df.shape

(7857, 8)

## Handle Missing Values

In [112]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    1988
owner            0
location         0
mileage        727
power            0
price           31
dtype: int64

## Fix column type

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7857 entries, 0 to 37635
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7857 non-null   object 
 1   model_year  7857 non-null   int64  
 2   kms_driven  5869 non-null   float64
 3   owner       7857 non-null   object 
 4   location    7857 non-null   object 
 5   mileage     7130 non-null   object 
 6   power       7857 non-null   object 
 7   price       7826 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 552.4+ KB


In [114]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [115]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7857 entries, 0 to 37635
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7857 non-null   object 
 1   model_year  7857 non-null   int64  
 2   kms_driven  5869 non-null   float64
 3   owner       7857 non-null   object 
 4   location    7857 non-null   object 
 5   mileage     6883 non-null   float64
 6   power       7429 non-null   float64
 7   price       7826 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 552.4+ KB


In [117]:
df.shape

(7857, 8)

In [118]:
cols = ['kms_driven', 'price','mileage','power']
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [119]:
df.to_csv('../data/processed/data.csv',index=False)