In [1]:
import pandas as pd
import numpy as np

In [4]:
dataframe = pd.read_csv("train.csv")

dataframe.isnull().sum()

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64

In [5]:
dataframe.columns[dataframe.isna().any()].tolist()

['Mileage', 'Engine', 'Power', 'Seats', 'New_Price']

In [6]:
# As mileage, Engine, Power, Seats has minimal data missing values. So, removing these rows does not affect the original data frame for performing Data Analysis.

# Checking number of missing values for Mileage
print(dataframe['Mileage'].isna().value_counts())

# Removing 2 rows 
dataframe = dataframe[dataframe['Mileage'].notna()]

# checking number of missing values for Engine
print(dataframe['Engine'].isna().value_counts())

#removing 36 rows 
dataframe = dataframe[dataframe['Engine'].notna()]

# checking number of missing values for Power
print(dataframe['Power'].isna().value_counts())

#removing 36 rows 
dataframe = dataframe[dataframe['Power'].notna()]

# checking number of missing values for Seats
print(dataframe['Seats'].isna().value_counts())

#drop those 2 rows 
dataframe = dataframe[dataframe['Seats'].notna()]

#checking number of missing values for New_Price
print(dataframe['New_Price'].isnull().value_counts())

dataframe['New_Price'] = dataframe['New_Price'].str.replace(" Lakh", "").str.replace(" Cr","").astype(float)

#Since the values of  New_Price have higher variance, I have substituted the median for the NA values. Because there are a lot of NA values in the new_price dataset and replacing with the mean is not optimal, eliminating the rows eliminates lost data.

new_price_median = dataframe[dataframe['New_Price'].notna()]['New_Price'].median()

print(new_price_median)

#Replacing NA values with median 
dataframe['New_Price'] = dataframe['New_Price'].fillna(value = new_price_median)

dataframe.head()


False    5845
True        2
Name: Mileage, dtype: int64
False    5809
True       36
Name: Engine, dtype: int64
False    5809
Name: Power, dtype: int64
False    5807
True        2
Name: Seats, dtype: int64
True     4993
False     814
Name: New_Price, dtype: int64
11.475000000000001


Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,11.475,12.5
1,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13 km/kg,1199 CC,88.7 bhp,5.0,8.61,4.5
2,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,11.475,6.0
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,11.475,17.74
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,11.475,3.5


In [7]:
# Removing the units from some of the attributes and keeping only numerical values

dataframe['Mileage'] = dataframe['Mileage'].str.replace(" kmpl","").str.replace(" km/kg","").astype(float)
dataframe['Engine'] = dataframe['Engine'].str.replace(" CC","").astype(int)
dataframe['Power'] = dataframe['Power'].str.replace(" bhp","").astype(float)

dataframe.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,11.475,12.5
1,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13.0,1199,88.7,5.0,8.61,4.5
2,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,11.475,6.0
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,11.475,17.74
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461,63.1,5.0,11.475,3.5


In [8]:
# Change the categorical variables (“Fuel_Type” and “Transmission”) into numerical one hot encoded value


dataframe = pd.get_dummies(dataframe, columns=['Fuel_Type', 'Transmission'], drop_first=True)


In [9]:
dataframe

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,Fuel_Type_Petrol,Transmission_Manual
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,First,19.67,1582,126.20,5.0,11.475,12.50,0,1
1,2,Honda Jazz V,Chennai,2011,46000,First,13.00,1199,88.70,5.0,8.610,4.50,1,1
2,3,Maruti Ertiga VDI,Chennai,2012,87000,First,20.77,1248,88.76,7.0,11.475,6.00,0,1
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Second,15.20,1968,140.80,5.0,11.475,17.74,0,0
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,First,23.08,1461,63.10,5.0,11.475,3.50,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5842,6014,Maruti Swift VDI,Delhi,2014,27365,First,28.40,1248,74.00,5.0,7.880,4.75,0,1
5843,6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,First,24.40,1120,71.00,5.0,11.475,4.00,0,1
5844,6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Second,14.00,2498,112.00,8.0,11.475,2.90,0,1
5845,6017,Maruti Wagon R VXI,Kolkata,2013,46000,First,18.90,998,67.10,5.0,11.475,2.65,1,1


In [11]:
# Creating one more feature i.e., current age and calculating current age of car

dataframe['Current_Age'] = 2023 - dataframe['Year']

dataframe.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,Fuel_Type_Petrol,Transmission_Manual,Current_Age_In_Years,Current_Age
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,First,19.67,1582,126.2,5.0,11.475,12.5,0,1,8,8
1,2,Honda Jazz V,Chennai,2011,46000,First,13.0,1199,88.7,5.0,8.61,4.5,1,1,12,12
2,3,Maruti Ertiga VDI,Chennai,2012,87000,First,20.77,1248,88.76,7.0,11.475,6.0,0,1,11,11
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Second,15.2,1968,140.8,5.0,11.475,17.74,0,0,10,10
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,First,23.08,1461,63.1,5.0,11.475,3.5,0,1,10,10
