<h1>Let's clean and fix the issues that we found earlier</h1>

In [25]:
# Notebook-level setup: tab-completion settings and warning suppression.
%config IPCompleter.use_jedi = False 
%config Completer.evaluation = 'limited'
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas warnings raised by the column assignments in later cells.
warnings.filterwarnings('ignore')

In [26]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [52]:
# Load the raw car listings from CSV.
cars = pd.read_csv('Data/quikr_car.csv')
cars_backup = cars.copy()  # untouched copy of the raw data, kept as a back-up

**There are some rows where the year column has invalid values (not a year); remove these.**

In [53]:
# Keep only the rows whose 'year' value consists entirely of digits.
# .copy() makes the result an independent frame, so column assignments in
# later cells write to it directly instead of to a slice of the loaded data
# (the pattern that triggers pandas' SettingWithCopyWarning).
cars = cars[cars['year'].str.isnumeric()].copy()

In [54]:
# Sanity check: list any rows whose year is still non-numeric (none are expected after the filter).
cars[~cars['year'].str.isnumeric()]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type


<h2>The year column is stored as strings; it needs to be converted to integers</h2>

In [55]:
# Cast the digit-only year strings to integers via a column -> dtype mapping.
cars = cars.astype({'year': int})

In [56]:
# Rows and columns left after removing the invalid-year rows.
cars.shape

(842, 6)

<h2>There are some duplicate rows that need to be removed</h2>

In [57]:
# Count the rows that are exact duplicates of an earlier row.
cars.duplicated().sum() 

np.int64(93)

In [58]:
# Drop the duplicated rows (the first occurrence of each is kept by default).
cars = cars.drop_duplicates()

In [59]:
# Shape after de-duplication.
cars.shape

(749, 6)

In [60]:
# The Price column contains the non-numeric placeholder 'Ask For Price';
# check how many rows carry it.
cars[cars['Price'] == 'Ask For Price'].shape

(22, 6)

In [61]:
# Drop the 22 rows whose Price is the 'Ask For Price' placeholder.
# .copy() detaches the filtered result so the Price assignments in later cells
# operate on an independent frame rather than on a slice.
cars = cars[cars['Price'] != 'Ask For Price'].copy()

In [62]:
# Shape after removing the 'Ask For Price' rows.
cars.shape

(727, 6)

In [63]:
# The Price column has two more issues:
# 1. the commas need to be removed from the values, and
# 2. the column needs to be converted to a numeric dtype

In [64]:
# Remove the thousands separators so the values can be parsed as numbers.
# regex=False makes this a literal replacement regardless of pandas version
# (the default for `regex` changed from True to False in pandas 2.0).
cars['Price'] = cars['Price'].str.replace(',', '', regex=False)

In [65]:
# Convert the comma-free Price strings to floats via a column -> dtype mapping.
cars = cars.astype({'Price': float})

In [66]:
# The conversion should not change the number of rows or columns.
cars.shape

(727, 6)

In [67]:
# Confirm that no Price value is missing after the conversion.
cars['Price'].isna().sum()

np.int64(0)

<h3>Let's fix the kms_driven column</h3>

In [68]:
# There are 2 issues:
# 1. need to remove the 'kms' suffix
# 2. need to convert the values to numeric

In [69]:
# Inspect the distinct raw kms_driven values.
cars['kms_driven'].unique()

array(['45,000 kms', '40 kms', '28,000 kms', '36,000 kms', '41,000 kms',
       '25,000 kms', '24,530 kms', '60,000 kms', '30,000 kms',
       '32,000 kms', '48,660 kms', '4,000 kms', '16,934 kms',
       '43,000 kms', '35,550 kms', '39,522 kms', '39,000 kms',
       '55,000 kms', '72,000 kms', '15,975 kms', '70,000 kms',
       '23,452 kms', '35,522 kms', '48,508 kms', '15,487 kms',
       '82,000 kms', '20,000 kms', '68,000 kms', '38,000 kms',
       '27,000 kms', '33,000 kms', '46,000 kms', '16,000 kms',
       '47,000 kms', '35,000 kms', '30,874 kms', '15,000 kms',
       '29,685 kms', '1,30,000 kms', '19,000 kms', '54,000 kms',
       '13,000 kms', '38,200 kms', '22,000 kms', '50,000 kms',
       '13,500 kms', '3,600 kms', '45,863 kms', '60,500 kms',
       '12,500 kms', '18,000 kms', '13,349 kms', '29,000 kms',
       '44,000 kms', '42,000 kms', '14,000 kms', '49,000 kms',
       '36,200 kms', '51,000 kms', '1,04,000 kms', '33,333 kms',
       '33,600 kms', '5,600 kms', '7,500 km

In [70]:
# Show the rows where kms_driven contains the string 'Petrol' rather than a distance.
cars[cars['kms_driven'] == 'Petrol']

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
890,Honda Amaze 1.2 E i VTEC,Honda,2014,180000.0,Petrol,
891,Chevrolet Sail 1.2 LT ABS,Chevrolet,2014,160000.0,Petrol,


In [71]:
# Remove the rows where kms_driven holds 'Petrol' instead of a distance.
# .copy() detaches the filtered result so the kms_driven assignments in later
# cells operate on an independent frame rather than on a slice.
cars = cars[cars['kms_driven'] != 'Petrol'].copy()

In [77]:
# Keep only the text before the first space, which drops the trailing ' kms' unit.
cars['kms_driven'] = cars['kms_driven'].str.split(' ').str[0]

In [80]:
# Remove the digit-group commas (e.g. '1,30,000' -> '130000').
# regex=False makes this a literal replacement regardless of pandas version
# (the default for `regex` changed from True to False in pandas 2.0).
cars['kms_driven'] = cars['kms_driven'].str.replace(',', '', regex=False)

In [81]:
# Check the column after stripping the unit and commas (still object dtype at this point).
cars['kms_driven']

0       45000
1          40
3       28000
4       36000
6       41000
        ...  
883     50000
885     30000
886    132000
888     27000
889     40000
Name: kms_driven, Length: 725, dtype: object

In [83]:
# Turn the cleaned kms_driven strings into integers via a column -> dtype mapping.
cars = cars.astype({'kms_driven': int})

In [84]:
# Verify dtypes and non-null counts after the numeric conversions.
cars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 725 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        725 non-null    object 
 1   company     725 non-null    object 
 2   year        725 non-null    int64  
 3   Price       725 non-null    float64
 4   kms_driven  725 non-null    int64  
 5   fuel_type   724 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 39.6+ KB


<h3>Now let's fix the fuel_type column's issues</h3>

In [85]:
# Issue: fuel_type has missing values.
# Count how many rows are affected.
cars['fuel_type'].isna().sum()

np.int64(1)

In [86]:
# Show the row(s) with a missing fuel_type.
cars[cars['fuel_type'].isna()]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
132,Toyota Corolla,Toyota,2009,275000.0,26000,


In [88]:
# Remove the single row whose fuel_type is missing.
# .copy() detaches the filtered result so the 'name' assignment in a later
# cell operates on an independent frame rather than on a slice.
cars = cars[cars['fuel_type'].notna()].copy()

In [89]:
# Shape after dropping the row with the missing fuel_type.
cars.shape

(724, 6)

<h2>Let's start with the name column</h2>

In [94]:
# Look at the raw name values.
cars['name']

0        Hyundai Santro Xing XO eRLX Euro III
1                     Mahindra Jeep CL550 MDI
3      Hyundai Grand i10 Magna 1.2 Kappa VTVT
4            Ford EcoSport Titanium 1.5L TDCi
6                                   Ford Figo
                        ...                  
883                Maruti Suzuki Ritz VXI ABS
885                 Tata Indica V2 DLE BS III
886                      Toyota Corolla Altis
888                       Tata Zest XM Diesel
889                        Mahindra Quanto C8
Name: name, Length: 724, dtype: object

In [98]:
# The name column is very noisy, so keep only the first three words as the car name:
# split the name on spaces, take the first three words, then join them back
# together with a single space.
cars['name'] = cars['name'].str.split(' ').str[:3].str.join(' ')

In [99]:
# Display the frame after shortening the names.
cars

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000.0,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000.0,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000.0,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000.0,36000,Diesel
6,Ford Figo,Ford,2012,175000.0,41000,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz,Maruti,2011,270000.0,50000,Petrol
885,Tata Indica V2,Tata,2009,110000.0,30000,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000.0,132000,Petrol
888,Tata Zest XM,Tata,2018,260000.0,27000,Diesel


In [101]:
# Rows were removed above, leaving gaps in the index labels;
# renumber the index from 0 and discard the old labels.
cars = cars.reset_index(drop = True)

In [102]:
# The shape should be unchanged by the index reset.
cars.shape

(724, 6)

In [103]:
# Final check of dtypes and non-null counts before saving.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        724 non-null    object 
 1   company     724 non-null    object 
 2   year        724 non-null    int64  
 3   Price       724 non-null    float64
 4   kms_driven  724 non-null    int64  
 5   fuel_type   724 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 34.1+ KB


## Now our data is cleaned, so it's time to perform some EDA. Before doing that, let's save the cleaned data.

In [105]:
# Write the cleaned frame to CSV, leaving out the index column.
file_path = r'Data/cleaned_car_data.csv'
cars.to_csv(file_path , index = False)