In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/data.csv')

In [3]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
0,Hero CBZ X-Treme Rear Disc,2014 model,"26,916 kms",1st Owner,Lucknow,40000
1,Bajaj Pulsar 220 DTS-i Standard,2017 model,"8,500 kms",1st Owner,Mumbai,90000
2,Bajaj Discover 135 Standard,2011 model,"40,000 kms",1st Owner,Kolkata,25000
3,Hero Honda Splendor Plus Drum,2011 model,"40,000 kms",1st Owner,Bangalore,30000
4,Bajaj Platina 100 ES Drum,2018 model,"18,000 kms",1st Owner,Baran,41000


In [4]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
6562,Royal Enfield Bullet Electra 350cc,2015,"40,822 Km",First Owner,Areraj,"\n 90,001\n"
6563,Suzuki Intruder 150cc,2018,"19,000 Km",First Owner,Jajpur,"\n 68,200\n"
6564,Bajaj Pulsar AS150,2015,"12,566 Km",First Owner,Dhanbad,"\n 54,999\n"
6565,Suzuki Gixxer 150cc,2018,"10,815 Km",First Owner,Pune,"\n 75,000\n"
6566,KTM RC 200cc,2018,"11,000 Km",First Owner,Indore,"\n 1,60,000\n"
6567,Hero Impulse 150cc,2016,"40,000 Km",First Owner,Mumbai,"\n 35,000\n"
6568,Royal Enfield Classic Chrome 500cc,2013,"10,000 Km",First Owner,Anand,"\n 1,10,000\n"
6569,Bajaj Avenger Street 220,2017,"16,462 Km",Second Owner,Pune,"\n 60,000\n"
6570,Royal Enfield Classic Desert Storm 500cc,2013,"13,000 Km",Second Owner,Jaipur,"\n 1,04,000\n"
6571,Honda CBR 250R,2016,"17,000 Km",Second Owner,Chennai,"\n 95,999\n"


In [5]:
df.shape

(6572, 6)

## Data Cleaning

In [6]:
# Remove leading/trailing whitespace from every string cell.
# Only real strings are stripped: NaN (produced by empty CSV fields) is truthy,
# so a plain truthiness check would still call .strip() on a float and raise
# AttributeError. Non-string cells are passed through unchanged.
for col in df.columns:
    df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)

In [7]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    """Reduce an odometer cell such as ``'26,916 kms'`` to its bare digits.

    Parameters
    ----------
    val : object
        Raw ``kms_driven`` cell. Usually a string; ``None``/NaN and other
        falsy values are treated as missing.

    Returns
    -------
    str
        The lower-cased value with the ``kms``/``km`` unit and thousands
        separators removed and surrounding whitespace trimmed, or ``''``
        when the cell is missing.
    """
    # NaN is truthy, so it must be tested explicitly before the falsy check.
    if pd.isna(val) or not val:
        return ''

    # str() lets an already-numeric cell flow through instead of raising.
    val = str(val).lower()

    # 'kms' has to be removed before 'km', otherwise a stray 's' is left behind.
    for unit in ('kms', 'km'):
        val = val.replace(unit, '')

    # Removing the unit leaves a trailing space ('26,916 ' -> '26916').
    return val.replace(',', '').strip()


def clean_price(val):
    """
    Normalise a raw price cell so that ``pd.to_numeric`` can parse it.

    1. remove currency symbol.
    2. remove commas
    3. fix the val which is represented as lakh (1 lakh = 100000)

    Parameters
    ----------
    val : object
        Raw ``price`` cell. Usually a string; ``None``/NaN and other falsy
        values are treated as missing.

    Returns
    -------
    str or float or object
        ``''`` for a missing cell; a float for values expressed in lakh;
        the cleaned string otherwise. A non-string, non-missing cell
        (already numeric) is returned unchanged.
    """
    # NaN is truthy, so it must be tested explicitly before the falsy check.
    if pd.isna(val) or not val:
        return ''

    # Already-numeric cells have nothing to clean.
    if not isinstance(val, str):
        return val

    val = val.replace('₹', '').replace(',', '').strip()

    lowered = val.lower()
    if 'lakh' in lowered:
        try:
            amount = float(lowered.replace('lakh', ''))
        except ValueError:
            # Leave an unparseable value as text rather than aborting the
            # whole column; a later pd.to_numeric(errors='coerce') maps it to NaN.
            return val
        # round() removes binary floating-point noise, e.g. 1.1 * 100000
        # would otherwise come out as 110000.00000000001.
        return round(amount * 100000, 2)

    return val
    

# Run each cleaner element-wise over its own column.
df['kms_driven'] = df['kms_driven'].map(clean_kms_driven)
df['price'] = df['price'].map(clean_price)

In [8]:
# Let's make the model_year column numeric-looking by removing the 'model' string.
# regex=False matches 'model' as a literal substring regardless of the pandas
# version's default, and .str.strip() drops the space left behind
# ('2014 model' -> '2014').
df['model_year'] = df['model_year'].str.replace('model', '', regex=False).str.strip()

In [9]:
# The location feature holds comma-separated values.
# To reduce it to a single value, keep only the last
# component, since it is the main city name.

def clean_location(val):
    """Return the last comma-separated component of ``val``, trimmed.

    Falsy input (``None``, ``''``) yields ``''``.
    """
    if val:
        # Text after the final comma; the whole string when there is no comma.
        _, _, city = val.rpartition(',')
        return city.strip()

    return ''

# Keep only the city component of every location value.
df['location'] = df['location'].map(clean_location)

In [10]:
df.location.value_counts()[:21]

Delhi          1762
Mumbai          658
Bangalore       647
New Delhi       373
Hyderabad       367
Jaipur          240
Kolkata         207
Ahmedabad       199
Chennai         187
Pune            178
Gurgaon         115
Chandigarh      105
Thane            68
Faridabad        66
Noida            60
Ghaziabad        59
Rajkot           36
Vadodara         28
Navi Mumbai      21
Patna            19
Surat            18
Name: location, dtype: int64

In [11]:
# Fix owner column values.
# Ownership is represented in two different ways: as ordinals ("1st Owner") and in words ("First Owner").
# Let's make them homogeneous.

# Built once at definition time instead of on every call. Keys are lower-cased
# so that lookups are case-insensitive.
_OWNER_LABELS = {
    '1st owner': '1st',
    '2nd owner': '2nd',
    '3rd owner': '3rd',
    '4th owner': '4th',
    '5th owner': '5th',
    'first owner': '1st',
    'second owner': '2nd',
    'third owner': '3rd',
    'fourth owner': '4th',
    'fourth owner or more': '5th',
}


def clean_owner(val):
    """Map an ownership label to a short ordinal (``'1st'`` ... ``'5th'``).

    Parameters
    ----------
    val : object
        Raw ``owner`` cell, e.g. ``'1st Owner'`` or ``'First Owner'``.

    Returns
    -------
    object
        ``val`` itself when it is falsy (``None``, ``''``); the short ordinal
        for a recognised label, compared ignoring case and surrounding
        whitespace; ``''`` for anything else.
    """
    if not val:
        return val

    # Truthy non-strings (e.g. NaN) can never match a label.
    if not isinstance(val, str):
        return ''

    return _OWNER_LABELS.get(val.strip().lower(), '')

# Collapse both owner spellings into the short ordinal form.
df['owner'] = df['owner'].map(clean_owner)

In [12]:
df.sample(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
1391,Royal Enfield Bullet 350 ES,2016,31000,1st,New Delhi,95000
1491,Royal Enfield Bullet 350 ES ABS BS VI,2018,19858,1st,North West Delhi,65500
4257,TVS Apache RTR 180cc,2019,88000,1st,Jaipur,61000
4272,Bajaj Pulsar 180cc,2008,24000,1st,Mumbai,14000
2141,Royal Enfield Classic 350 Redditch Rear Disc,2019,22000,1st,Mumbai,65000
4701,Bajaj Pulsar 150cc,2017,56000,1st,Pune,57000
1168,Hero Honda Passion PRO [2012] Standard,2010,40318,2nd,Sitamarhi,30000
2865,TVS Ntorq 125 Super Squad Edition BS VI,2018,7000,1st,Kolkata,25000
1146,Honda Activa [2000-2015] Standard,2008,40000,2nd,Zirakpur,10000
5156,Yamaha FZS FI 150cc,2018,20517,1st,Delhi,68000


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6572 entries, 0 to 6571
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   model_name  6572 non-null   object
 1   model_year  6572 non-null   object
 2   kms_driven  6572 non-null   object
 3   owner       6572 non-null   object
 4   location    6572 non-null   object
 5   price       6572 non-null   object
dtypes: object(6)
memory usage: 308.2+ KB


In [14]:
# Convert the cleaned text columns to numbers; values that cannot be parsed become NaN.
for numeric_col in ('model_year', 'kms_driven', 'price'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6572 entries, 0 to 6571
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  6572 non-null   object 
 1   model_year  6572 non-null   int64  
 2   kms_driven  6569 non-null   float64
 3   owner       6572 non-null   object 
 4   location    6572 non-null   object 
 5   price       6319 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 308.2+ KB


In [16]:
df.duplicated().sum()

858

In [17]:
# Keep the first occurrence of every fully identical row.
df = df.drop_duplicates()

In [18]:
df.shape

(5714, 6)

## Handle Missing Values

In [19]:
df.isnull().sum()

model_name      0
model_year      0
kms_driven      3
owner           0
location        0
price         207
dtype: int64

### Some rows are missing `kms_driven` or `price` after the numeric conversion; let's drop them.

In [21]:
# Rebind instead of mutating in place: `df` was derived from a filtered frame,
# and in-place operations on such a frame can raise SettingWithCopyWarning.
df = df.dropna()

In [22]:
df.shape

(5504, 6)

In [23]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV.
df.to_csv('../data/processed/data.csv',index=False)