In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df = pd.read_csv('../data/raw/data.csv')

In [6]:
df.shape

(8064, 8)

In [7]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
0,Royal Enfield Thunderbird 350cc 2016,2016,15144 Km,first owner,pune,\n\n 40 kmpl,19.80 bhp,115320
1,Yamaha FZ16 150cc 2014,2014,Mileage 58 Kmpl,first owner,bangalore,\n\n 58 Kmpl,13 bhp,40000
2,Royal Enfield Classic 350cc 2018,2018,25000 Km,first owner,chennai,\n\n 35 kmpl,19.80 bhp,136900
3,Yamaha MT-15 150cc 2019,2019,5169 Km,first owner,bangalore,\n\n,19 bhp,122374
4,Yamaha YZF-R15 150cc 2010,2010,86728 Km,first owner,pune,\n\n 42 kmpl,16 bhp,38000


In [8]:
df.tail(10)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
8054,Yamaha YZF-R15 2.0 150cc 2014,2014,15910 Km,first owner,gurgaon,,,45000
8055,Yamaha YZF-R15 150cc 2008,2008,33487 Km,first owner,faridabad,,,32000
8056,Royal Enfield Classic 350cc 2017,2017,4932 Km,first owner,ludhiana,,,130000
8057,Hero Passion Xpro Alloy 2016,2016,12009 Km,first owner,bhopal,,,43000
8058,TVS Apache RTR 160cc 2010,2010,60 Kmpl,second owner,chennai,,,23000
8059,Honda CB Hornet 160R CBS 2016,2016,52 Kmpl,first owner,bangalore,,,49000
8060,KTM Duke 390cc 2016,2016,38500 Km,first owner,pune,,,115000
8061,Bajaj Pulsar 150cc 2008,2008,65 Kmpl,first owner,chennai,,,16000
8062,Honda CB Twister 110cc 2010,2010,71 Kmpl,first owner,chennai,,,16500
8063,TVS Apache RTR 200 4V FI Race Edition 2.0 2018,2018,40 Kmpl,first owner,indore,,,78000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8064 entries, 0 to 8063
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   model_name  8064 non-null   object
 1   model_year  8064 non-null   int64 
 2   kms_driven  8064 non-null   object
 3   owner       8064 non-null   object
 4   location    8043 non-null   object
 5   mileage     2872 non-null   object
 6   power       2875 non-null   object
 7   price       8064 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 504.1+ KB


## Data Cleaning

In [10]:
# remove whitespace from data
df['model_name'] = df.model_name.str.strip()
df['mileage'] = df.mileage.str.strip()

In [11]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
7407,Bajaj Pulsar RS200 2015,2015,35 Kmpl,first owner,delhi,,,87000
5582,Hero Splendor Plus 100cc 2019,2019,16057 Km,first owner,bangalore,,,72000
3816,Bajaj Avenger Street 160 ABS 2019,2019,2000 Km,first owner,kolkata,,,95000
1982,Bajaj Pulsar 125cc Disc BS6 2020,2020,66000 Km,third owner,jaipur,62 kmpl,11.64 bhp,54500
1292,TVS Apache RTR 200 4V Carburetor 2017,2017,Mileage 40 Kmpl,first owner,thane,40 Kmpl,20.70 bhp,71600


In [12]:
# Remove commas from kms_driven and price columns

def clean_kms_driven(val):
    if not val:
        return ''

    val = val.lower()

    if 'kms' in val:
        val = val.replace('kms','')

    if 'km' in val:
        val = val.replace('km','')

    if 'mileage' in val.lower():
        return np.nan

    return val.replace(',','')


def clean_price(val):
    """
    1. remove currency symbol.
    2. remove commas
    """
    if not val:
        return ''

    val = str(val)

    val = val.replace(',','')

    if 'lakh' in val.lower():
        val = val.lower().replace('lakh','')
        val = float(val)
        return val * 100000

    return val
    

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

df['price'] = df['price'].apply(clean_price)



In [13]:
df['kms_driven'] = pd.to_numeric(df['kms_driven'],errors='coerce',downcast='integer')
df['price'] = pd.to_numeric(df['price'],errors='coerce',downcast='integer')

In [14]:
# location feature have comma separated values
# need to make it a single value, for that let's take
# the last value as it is the main city name

def clean_location(val):
    if not val:
        return ''
    
    return str(val).split(',')[-1].strip()

df['location'] = df['location'].apply(clean_location)

In [15]:
df.location.value_counts()[:10]

delhi         1523
bangalore      805
mumbai         788
chennai        349
jaipur         338
gurgaon        337
hyderabad      326
pune           321
ahmedabad      250
chandigarh     172
Name: location, dtype: int64

In [16]:

def clean_owner(val):
    if not val:
        return val

    val = val.replace('owner','')

    if 'or more' in val.lower():
        return 'fourth'

    return val

df['owner'] = df['owner'].apply(clean_owner)

In [17]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
5211,Bajaj Avenger Street 150 2016,2016,40000.0,second,pune,,,70000.0
6703,Royal Enfield Standard 500cc 2018,2017,2330.0,first,delhi,,,161793.0
1948,TVS Apache RTR 160cc 2017,2017,,first,chandigarh,60 Kmpl,15.2 bhp,56800.0
4790,Bajaj Pulsar RS200 ABS 2018,2018,,first,jalandhar,,,120000.0
6036,Royal Enfield Electra 350cc 2017,2017,4500.0,first,noida,,,106000.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8064 entries, 0 to 8063
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  8064 non-null   object 
 1   model_year  8064 non-null   int64  
 2   kms_driven  5987 non-null   float64
 3   owner       8064 non-null   object 
 4   location    8064 non-null   object 
 5   mileage     2872 non-null   object 
 6   power       2875 non-null   object 
 7   price       8029 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 504.1+ KB


### Handle Mileage

In [19]:
def clean_mileage(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('kmpl','')

df['mileage'] = df.mileage.apply(clean_mileage)

### Hanlde power

In [20]:
def clean_power(val):
    if not val:
        return np.nan
    
    return str(val).lower().replace('bhp','')

df['power'] = df.power.apply(clean_power)

In [21]:
df.sample(5)

Unnamed: 0,model_name,model_year,kms_driven,owner,location,mileage,power,price
3542,Royal Enfield Classic Gunmetal Grey 350cc 2018,2018,3500.0,first,patiala,,,135000.0
7308,Hero Passion Plus 100cc 2005,2005,30000.0,first,delhi,,,9000.0
7782,Royal Enfield Classic 350cc 2017,2017,13190.0,first,delhi,,,100000.0
3455,Yamaha FZ16 150cc 2011,2011,,first,mumbai,,,25000.0
1667,Hero Xtreme 160R Front Disc BS6 2020,2020,15000.0,first,jaipur,55.47,15.0,93000.0


In [22]:
df.duplicated().sum()

73

In [23]:
df = df[~df.duplicated()]

In [24]:
df.shape

(7991, 8)

## Handle Missing Values

In [25]:
df.isnull().sum()

model_name       0
model_year       0
kms_driven    2043
owner            0
location         0
mileage        312
power            0
price           35
dtype: int64

## Fix column type

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7991 entries, 0 to 8063
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7991 non-null   object 
 1   model_year  7991 non-null   int64  
 2   kms_driven  5948 non-null   float64
 3   owner       7991 non-null   object 
 4   location    7991 non-null   object 
 5   mileage     7679 non-null   object 
 6   power       7991 non-null   object 
 7   price       7956 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 561.9+ KB


In [27]:
df['mileage'] = pd.to_numeric(df['mileage'],errors='coerce') #df.mileage.astype(int,errors='ignore')
df['power'] = pd.to_numeric(df['power'],errors='coerce')

In [28]:
df['mileage'] = df.mileage.astype(int,errors='ignore') #pd.to_numeric(df['mileage'],errors='coerce',downcast='integer') #df.mileage.astype(int,errors='ignore')
df['power'] = df.power.astype(int,errors='ignore')

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7991 entries, 0 to 8063
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   model_name  7991 non-null   object 
 1   model_year  7991 non-null   int64  
 2   kms_driven  5948 non-null   float64
 3   owner       7991 non-null   object 
 4   location    7991 non-null   object 
 5   mileage     2436 non-null   float64
 6   power       2722 non-null   float64
 7   price       7956 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 561.9+ KB


In [30]:
df.shape

(7991, 8)

In [31]:
cols = ['kms_driven', 'price','mileage','power']
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [32]:
df.to_csv('../data/processed/data.csv',index=False)