In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/raw/data.csv')

In [3]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
0,Yamaha SZR 150cc 2012,2012,34000 Km,first owner,bengaluru,\n\n,,12 bhp,,24000
1,Hero CD Dawn 100cc 2005,2005,28000 Km,first owner,ghaziabad,\n\n 72 kmpl,,7.0 bhp,,10000
2,Bajaj Avenger Street 220 2017,2017,5947 Km,first owner,chennai,\n\n 53 kmpl,,19 bhp,,80000
3,Jawa Standard 295CC Dual ABS BS6 2020,2020,2700 Km,first owner,mumbai,\n\n Liquid Cooled,,27 bhp,,160000
4,Hero Xpulse 200cc BS6 2021,2021,628 Km,first owner,gwalior,\n\n 46.40 kmpl,,17.8 bhp,,110000


In [4]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
4402,Hero Karizma ZMR 223cc 2014,2014,68000 Km,second owner,surat,\n\n 48 kmpl,,20 bhp,,55786
4403,Royal Enfield Thunderbird 350cc 2015,2015,54000 Km,first owner,chennai,\n\n 40 kmpl,,19.80 bhp,,85000
4404,Royal Enfield Thunderbird X 350cc 2018,2018,15000 Km,first owner,bangalore,\n\n,,19.80 bhp,,150000
4405,Royal Enfield Thunderbird 350cc 2017,2017,17000 Km,first owner,jaipur,\n\n 40 kmpl,,19.80 bhp,,135000
4406,Yamaha YZF-R15 V3 150CC ABS 2019,2019,1305 Km,first owner,indore,\n\n,,18.70 bhp,,155000
4407,Yamaha YZF-R15 V3 150CC ABS LIMITED EDITION 2019,2019,3200 Km,first owner,haridwar,\n\n,,18.70 bhp,,145000
4408,Bajaj Pulsar RS200 ABS 2016,2016,Mileage 35 Kmpl,first owner,ratnagiri,\n\n 35 Kmpl,,24.50 bhp,,80000
4409,Royal Enfield Electra 350cc 2017,2017,2000 Km,first owner,bihar shariff,\n\n 25 kmpl,,19.80 bhp,,90000
4410,Suzuki Gixxer 150cc SP ABS 2019,2019,3700 Km,first owner,bangalore,\n\n 63kmpl,,14 bhp,,81800
4411,Bajaj V12 125cc Disc 2017,2017,20400 Km,first owner,rohtak,\n\n 57 kmpl,,10.60 bhp,,35740


In [5]:
df.shape

(4412, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  4412 non-null   object 
 1   model_year  4412 non-null   int64  
 2   kms_driven  4412 non-null   object 
 3   owner       4412 non-null   object 
 4   location    4397 non-null   object 
 5   mileage     4398 non-null   object 
 6   engine      0 non-null      float64
 7   power       4409 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       4412 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 344.8+ KB


## Data Cleaning

In [7]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [8]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
256,Bajaj Pulsar 150cc 2018,2018,Mileage 65 Kmpl,first owner,ahmedabad,65 Kmpl,,15 bhp,,53000
2898,Honda CB ShineSP 125cc CBS 2019,2019,Mileage 65 Kmpl,first owner,delhi,65 Kmpl,,10 bhp,,53000
1077,Royal Enfield Thunderbird 350cc 2015,2015,25000 Km,first owner,delhi,40 kmpl,,19.80 bhp,,78000
4278,Royal Enfield Thunderbird 350cc 2012,2012,21000 Km,first owner,faridabad,40 kmpl,,19.80 bhp,,47000
4161,Yamaha Saluto 125cc 2016,2016,Mileage 70 Kmpl,first owner,delhi,70 Kmpl,,6.1 kW (8.3PS),,42000


In [9]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    3. fix the val which is represented as lakh
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace('₹','')

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [10]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [11]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [12]:
df.location.value_counts()[:10]

delhi         852
bangalore     477
mumbai        437
jaipur        279
pune          199
hyderabad     181
ahmedabad     159
gurgaon       149
chennai       120
chandigarh     94
Name: location, dtype: int64

In [13]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [14]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,engine,power,wheel_size,price
3506,TVS Apache RTR 200 4V FI 2018,2018,,first,chennai,40 Kmpl,,20.70 bhp,,70000.0
3614,Yamaha YZF-R15 150cc 2015,2015,22000.0,third,bangalore,42 kmpl,,16 bhp,,80000.0
820,Suzuki Intruder 150cc 2018,2018,12000.0,second,delhi,,,14.60 bhp,,60000.0
541,Bajaj Avenger Street 150 2016,2016,13500.0,first,gurgaon,50 kmpl,,14.30 bhp,,45000.0
3197,Yamaha MT-15 150cc Special Edition BS6 2020,2020,1400.0,first,hyderabad,,,18.23 bhp,,150100.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  4412 non-null   object 
 1   model_year  4412 non-null   int64  
 2   kms_driven  3158 non-null   float64
 3   owner       4412 non-null   object 
 4   location    4412 non-null   object 
 5   mileage     4398 non-null   object 
 6   engine      0 non-null      float64
 7   power       4409 non-null   object 
 8   wheel_size  0 non-null      float64
 9   price       4411 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 344.8+ KB


### Handle Mileage

In [16]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

In [17]:
df.drop('engine',axis=1,inplace=True)

### Hanlde power

In [18]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [19]:
df.drop('wheel_size',axis=1,inplace=True)

In [20]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
3140,Yamaha YZF-R15 150cc 2014,2014,24535.0,second,chandigarh,42,16.0,61380.0
3157,Yamaha FZs 150cc 2017,2017,15000.0,first,delhi,45,13.0,62900.0
1665,Suzuki Gixxer 150cc SP ABS 2019,2019,9800.0,first,sikar,63,14.0,80000.0
4143,Yamaha YZF-R15 V3 150cc Darknight edition BS6 ...,2020,8650.0,first,dehradun,40,18.3,147000.0
3607,Yamaha FZs 150cc 2016,2016,32000.0,first,thanjavur,45,13.0,65000.0


In [21]:
df.duplicated().sum()

63

In [22]:
df = df[~df.duplicated()]

In [23]:
df.shape

(4349, 8)

## Handle Missing Values

In [24]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    1223
owner            0
location         0
mileage        461
power            0
price            1
dtype: int64

## Fix column type

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4349 entries, 0 to 4411
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  4349 non-null   object 
 1   model_year  4349 non-null   int64  
 2   kms_driven  3126 non-null   float64
 3   owner       4349 non-null   object 
 4   location    4349 non-null   object 
 5   mileage     3888 non-null   object 
 6   power       4349 non-null   object 
 7   price       4348 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 305.8+ KB


In [26]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [27]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4349 entries, 0 to 4411
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  4349 non-null   object 
 1   model_year  4349 non-null   int64  
 2   kms_driven  3126 non-null   float64
 3   owner       4349 non-null   object 
 4   location    4349 non-null   object 
 5   mileage     3729 non-null   float64
 6   power       4140 non-null   float64
 7   price       4348 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 305.8+ KB


In [29]:
df.to_csv('../data/processed/data.csv',index=False)