# Indian Cars Data Cleaning
Cleaning the extensive Indian Cars dataset in this file

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('indian_cars.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,...,Leather_Wrapped_Steering,Automatic_Headlamps,Engine_Type,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Battery,Electric_Range
0,0,Tata,Nano Genx,Xt,"Rs. 2,92,667",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,...,,,,,,,,,,
1,1,Tata,Nano Genx,Xe,"Rs. 2,36,447",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,...,,,,,,,,,,
2,2,Tata,Nano Genx,Emax Xm,"Rs. 2,96,661",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,...,,,,,,,,,,
3,3,Tata,Nano Genx,Xta,"Rs. 3,34,768",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,...,,,,,,,,,,
4,4,Tata,Nano Genx,Xm,"Rs. 2,72,223",624 cc,2.0,2.0,RWD (Rear Wheel Drive),In-line,...,,,,,,,,,,


In [4]:
df.shape

(1276, 141)

In [5]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1276 entries, 0 to 1275
Data columns (total 141 columns):
 #    Column                                     Non-Null Count  Dtype  
---   ------                                     --------------  -----  
 0    Unnamed: 0                                 1276 non-null   int64  
 1    Make                                       1201 non-null   object 
 2    Model                                      1276 non-null   object 
 3    Variant                                    1276 non-null   object 
 4    Ex-Showroom_Price                          1276 non-null   object 
 5    Displacement                               1264 non-null   object 
 6    Cylinders                                  1210 non-null   float64
 7    Valves_Per_Cylinder                        1174 non-null   float64
 8    Drivetrain                                 1269 non-null   object 
 9    Cylinder_Configuration                     1263 non-null   object 
 10   Emission_N

In [6]:
# Deleting extra column with indexes
del df['Unnamed: 0']

### Dealing with duplicates

In [7]:
df.duplicated().sum()

9

In [8]:
df = df.drop_duplicates()
df.duplicated().sum()

0

### Shape and nulls

In [9]:
df.shape

(1267, 140)

In [10]:
df.isnull().sum()

Make                   75
Model                   0
Variant                 0
Ex-Showroom_Price       0
Displacement           12
                     ... 
USB_Ports            1238
Heads-Up_Display     1216
Welcome_Lights       1198
Battery              1254
Electric_Range       1250
Length: 140, dtype: int64

# Data Cleaning Process Outline 

1. **Categorical Columns**, depending on the circumstances, have been dealt with in four ways:

        i. The Nulls have been filled with the mode
        ii. The data is too varied for the nulls to be filled with mode, so nulls have been filled with empty strings. Example: Basic warranty column has extremely varied data with no landslide majority.
        iii. Columns with 'Yes' and Nulls have been had nulls being replaced by 'No'.
        iv. Based on the column name and car feature availability across the models in the market, certain null values have been replaced with 'No'. Example: Power Steering column had multiple types of power steering listed along with null values. Since not all cars have power steering, the nulls were replaced with no.
        
2. **Numerical columns** were cleaned after removing extra string characters, converted to float or int datatypes and nulls were filled with the measures of central tendency.
        

## Columns with "yes" and Nulls

In [11]:
# Converting the columns with only 'Yes' and nulls into a list:

columns_to_fill = [
    "Start_/_Stop_Button", "Aux-in_Compatibility", "Average_Fuel_Consumption", "Bluetooth",
    "CD_/_MP3_/_DVD_Player", "Central_Locking", "Child_Safety_Locks", "Distance_to_Empty",
    "Engine_Malfunction_Light", "FM_Radio", "Low_Fuel_Warning", "Multifunction_Display",
    "Auto-Dimming_Rear-View_Mirror", "Hill_Assist", "Gear_Indicator",
    "3_Point_Seat-Belt_in_Middle_Rear_Seat", "Ambient_Lightning", "Cargo/Boot_Lights",
    "Engine_Immobilizer", "High_Speed_Alert_System", "Lane_Watch_Camera/_Side_Mirror_Camera",
    "Passenger_Side_Seat-Belt_Reminder", "Seat_Back_Pockets", "Voice_Recognition",
    "Walk_Away_Auto_Car_Lock", "ABS_(Anti-lock_Braking_System)", "Door_Ajar_Warning",
    "EBD_(Electronic_Brake-force_Distribution)", "Fasten_Seat_Belt_Warning",
    "Gear_Shift_Reminder", "Key_Off_Reminder", "USB_Compatibility", "Android_Auto",
    "Apple_CarPlay", "Cigarette_Lighter", "Average_Speed", "EBA_(Electronic_Brake_Assist)",
    "Navigation_System", "Second_Row_AC_Vents", "Tyre_Pressure_Monitoring_System",
    "iPod_Compatibility", "ESP_(Electronic_Stability_Program)", "Cooled_Glove_Box",
    "Turbocharger", "ISOFIX_(Child-Seat_Mount)", "Rain_Sensing_Wipers", "Paddle_Shifters",
    "Leather_Wrapped_Steering", "Automatic_Headlamps", "ASR_/_Traction_Control",
    "Cruise_Control", "USB_Ports", "Heads-Up_Display", "Welcome_Lights"   
]

In [12]:
# user defined function to fill the null values with 'No'

def replace_nulls(df, columns_to_replace):
    for column in columns_to_replace:
        if column in df.columns:
            df[column].fillna("No", inplace=True)
    return df

In [13]:
# using the function and saving the results in the dataset

df = replace_nulls(df, columns_to_fill)

In [14]:
# Checking a few columns at random to see if our udf worked (it did!)

print(df['ISOFIX_(Child-Seat_Mount)'].value_counts())
print(df['Engine_Immobilizer'].value_counts())
print(df['Child_Safety_Locks'].value_counts())

ISOFIX_(Child-Seat_Mount)
No     657
Yes    610
Name: count, dtype: int64
Engine_Immobilizer
Yes    1207
No       60
Name: count, dtype: int64
Child_Safety_Locks
Yes    1192
No       75
Name: count, dtype: int64


## Make

In [15]:
df[df['Make'].isnull()]

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Leather_Wrapped_Steering,Automatic_Headlamps,Engine_Type,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Battery,Electric_Range
252,,Mercedes-Benz B-Class,B 180 Sport,"Rs. 29,88,311",1595 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
253,,Mercedes-Benz B-Class,B 200 Cdi Sport,"Rs. 30,89,851",2143 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
254,,Mercedes-Benz B-Class,B 180 Night Edition,"Rs. 31,98,662",1595 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
255,,Mercedes-Benz B-Class,B 200 Cdi Night Edition,"Rs. 33,03,282",2143 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
256,,Mercedes-Benz Cla-Class,200 Sport,"Rs. 35,99,000",1991 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151,,Mercedes-Benz Gla-Class,45 Aero Edition,"Rs. 80,67,000",1991 cc,4.0,4.0,4WD,In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
1219,,Rolls-Royce Phantom,Sedan,"Rs. 9,50,00,000",6749 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
1220,,Rolls-Royce Phantom,Ewb,"Rs. 7,54,00,000",6749 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
1251,,Mercedes-Benz Glc,200 Progressive,"Rs. 52,75,000",1991 cc,4.0,4.0,RWD (Rear Wheel Drive),In-line,BS 6,...,Yes,Yes,,Yes,Yes,No,Yes,Yes,,


In [16]:
df[df['Model'].str.startswith('Mercedes-Benz')]

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Leather_Wrapped_Steering,Automatic_Headlamps,Engine_Type,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Battery,Electric_Range
252,,Mercedes-Benz B-Class,B 180 Sport,"Rs. 29,88,311",1595 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
253,,Mercedes-Benz B-Class,B 200 Cdi Sport,"Rs. 30,89,851",2143 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
254,,Mercedes-Benz B-Class,B 180 Night Edition,"Rs. 31,98,662",1595 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
255,,Mercedes-Benz B-Class,B 200 Cdi Night Edition,"Rs. 33,03,282",2143 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
256,,Mercedes-Benz Cla-Class,200 Sport,"Rs. 35,99,000",1991 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
257,,Mercedes-Benz Cla-Class,200 Cdi Style,"Rs. 31,72,000",2143 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
258,,Mercedes-Benz Cla-Class,200 D Sport,"Rs. 36,99,000",2143 cc,4.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
303,,Mercedes-Benz E-Class Cabriolet,E400,"Rs. 77,11,338",2996 cc,6.0,4.0,,V,BS IV,...,No,Yes,,No,Yes,No,No,No,,
307,,Mercedes-Benz Gls,350 Cdi,"Rs. 88,20,000",2987 cc,6.0,4.0,AWD (All Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
308,,Mercedes-Benz Gls,63 Amg,"Rs. 1,91,99,181",5461 cc,8.0,4.0,AWD (All Wheel Drive),V,BS IV,...,Yes,No,,Yes,Yes,No,No,No,,


In [17]:
#updating the 'Make' column for the rows where the 'Model' column starts with 'Mercedes-Benz'
df.loc[df['Model'].str.startswith('Mercedes-Benz'), 'Make'] = 'Mercedes-Benz'

In [18]:
#removing 'Mercedes-Benz ' from the beginning of the 'model' column for those rows
df['Model'] = df['Model'].str.replace('Mercedes-Benz ', '', regex=False)

Repeating the steps for Rolls Royce

In [19]:
df[df['Model'].str.startswith('Rolls-Royce')]

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Leather_Wrapped_Steering,Automatic_Headlamps,Engine_Type,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Battery,Electric_Range
349,,Rolls-Royce Ghost Series Ii,6.6,"Rs. 4,67,09,733",6593 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
350,,Rolls-Royce Ghost Series Ii,Ewb,"Rs. 5,32,47,201",6592 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
351,,Rolls-Royce Wraith,Coupe,"Rs. 5,00,25,840",6592 cc,12.0,4.0,AWD (All Wheel Drive),V,BS IV,...,No,Yes,,Yes,Yes,No,No,No,,
353,,Rolls-Royce Cullinan,Suv,"Rs. 6,95,00,000",6750 cc,12.0,4.0,AWD (All Wheel Drive),V,BS 6,...,No,No,,No,No,No,No,No,,
354,,Rolls-Royce Phantom Coupe,6.8 L,"Rs. 7,73,12,661",6749 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
879,,Rolls-Royce Dawn,Convertible,"Rs. 5,92,16,193",6598 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
880,,Rolls-Royce Drophead Coupe,Phantom,"Rs. 8,37,55,383",6749 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
1219,,Rolls-Royce Phantom,Sedan,"Rs. 9,50,00,000",6749 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,
1220,,Rolls-Royce Phantom,Ewb,"Rs. 7,54,00,000",6749 cc,12.0,4.0,RWD (Rear Wheel Drive),V,BS IV,...,Yes,Yes,,Yes,Yes,No,No,No,,


In [20]:
df.loc[df['Model'].str.startswith('Rolls-Royce'), 'Make'] = 'Rolls-Royce'

df['Model'] = df['Model'].str.replace('Rolls-Royce ', '', regex=False)

Checking for remaining nulls, if any

In [21]:
df[df['Make'].isnull()]

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Leather_Wrapped_Steering,Automatic_Headlamps,Engine_Type,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Battery,Electric_Range
883,,Go+,Datsun D,"Rs. 4,12,292",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
884,,Go+,Datsun T,"Rs. 5,52,656",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
885,,Go+,Datsun T (O),"Rs. 5,74,448",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
886,,Go+,Datsun A (O),"Rs. 5,55,196",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
887,,Go+,Datsun A,"Rs. 5,00,575",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
888,,Go+,Datsun T Vdc,"Rs. 5,93,361",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
889,,Go+,Datsun T (O) Vdc,"Rs. 6,15,153",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
890,,Go+,Datsun T Cvt,"Rs. 6,58,361",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,
891,,Go+,Datsun T (O) Cvt,"Rs. 6,80,153",1198 cc,3.0,4.0,FWD (Front Wheel Drive),In-line,BS IV,...,No,No,,No,No,No,No,No,,


In [22]:
# Go+ is a Datsun Brand. Filling the empty spaces with Datsun

In [23]:
df.loc[df['Model'].str.startswith('Go+'), 'Make'] = 'Datsun'

In [24]:
df['Make'].shape

(1267,)

## Car Name 

In [25]:
df['carname'] = df['Make'] + '-' + df['Model']
df['carname']

0           Tata-Nano Genx
1           Tata-Nano Genx
2           Tata-Nano Genx
3           Tata-Nano Genx
4           Tata-Nano Genx
               ...        
1271            Honda-City
1272            Honda-City
1273            Honda-City
1274            Honda-City
1275    Mitsubishi-Montero
Name: carname, Length: 1267, dtype: object

## Ex-Showroom Price

In [26]:
df['Ex-Showroom_Price'] = df['Ex-Showroom_Price'].str.replace(',','')
df['Ex-Showroom_Price'] = df['Ex-Showroom_Price'].str.replace('Rs. ','')
df['Ex-Showroom_Price'] = df['Ex-Showroom_Price'].astype(int)

In [27]:
df['Ex-Showroom_Price']

0        292667
1        236447
2        296661
3        334768
4        272223
         ...   
1271    1302000
1272    1421000
1273    1431000
1274    1201000
1275    6862560
Name: Ex-Showroom_Price, Length: 1267, dtype: int32

## Displacement 

In [28]:
df['Displacement'] = df['Displacement'].str.replace('cc','')

In [29]:
df['Displacement'].isnull().sum()

12

In [30]:
df['Displacement'] = df['Displacement'].astype(float)

In [31]:
print('Displacement Mean', df['Displacement'].mean())
print('Displacement Median', df['Displacement'].median())
print('Displacement Mode', df['Displacement'].mode())

Displacement Mean 1856.5083665338645
Displacement Median 1497.0
Displacement Mode 0    1197.0
Name: Displacement, dtype: float64


In [32]:
df["Displacement"].fillna(1497.0, inplace = True)

In [33]:
print(df['Displacement'].isnull().sum())
print()
print(df['Displacement'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Displacement
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Cylinders

In [34]:
print('Cylinder Information')
print('Nulls:', df['Cylinders'].isnull().sum())
print('Data Type:', df['Cylinders'].dtype)

Cylinder Information
Nulls: 66
Data Type: float64


In [35]:
print('Cylinders Mean', df['Cylinders'].mean())
print('Cylinders Median', df['Cylinders'].median())
print('Cylinders Mode', df['Cylinders'].mode())

Cylinders Mean 4.3838467943380515
Cylinders Median 4.0
Cylinders Mode 0    4.0
Name: Cylinders, dtype: float64


In [36]:
df["Cylinders"].fillna(4.0, inplace = True)


print(df['Cylinders'].isnull().sum())
print()
print(df['Cylinders'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Cylinders
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


In [37]:
## Valves_Per_Cylinder

In [38]:
print('Valves_Per_Cylinder Information')
print('Nulls:', df['Valves_Per_Cylinder'].isnull().sum())
print('Data Type:', df['Valves_Per_Cylinder'].dtype)

Valves_Per_Cylinder Information
Nulls: 102
Data Type: float64


In [39]:
print('Valves_Per_Cylinder Mean', df['Valves_Per_Cylinder'].mean())
print('Valves_Per_Cylinders Median', df['Valves_Per_Cylinder'].median())
print('Valves_Per_Cylinder Mode', df['Valves_Per_Cylinder'].mode())

Valves_Per_Cylinder Mean 3.9776824034334766
Valves_Per_Cylinders Median 4.0
Valves_Per_Cylinder Mode 0    4.0
Name: Valves_Per_Cylinder, dtype: float64


In [40]:
df["Valves_Per_Cylinder"].fillna(4.0, inplace = True)


print(df['Valves_Per_Cylinder'].isnull().sum())
print()
print(df['Valves_Per_Cylinder'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Valves_Per_Cylinder
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Drivetrain

In [41]:
df['Drivetrain'].value_counts()

Drivetrain
FWD (Front Wheel Drive)    878
RWD (Rear Wheel Drive)     170
AWD (All Wheel Drive)      153
4WD                         59
Name: count, dtype: int64

In [42]:
df['Drivetrain'].isnull().sum()

7

In [43]:
df["Drivetrain"].fillna(df["Drivetrain"].mode()[0], inplace=True)

In [44]:
print(df['Drivetrain'].isnull().sum())
print()
print(df['Drivetrain'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Drivetrain
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Cylinder_Configuration

In [45]:
df['Cylinder_Configuration'].value_counts()

Cylinder_Configuration
In-line    1068
V           176
W             6
Flat          4
Name: count, dtype: int64

In [46]:
df['Cylinder_Configuration'].isnull().sum()

13

In [47]:
df["Cylinder_Configuration"].fillna(df["Cylinder_Configuration"].mode()[0], inplace=True)

print(df['Cylinder_Configuration'].isnull().sum())
print()
print(df['Cylinder_Configuration'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Cylinder_Configuration
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Emission_Norm

In [48]:
print('Emission_Norm Value Counts', df['Emission_Norm'].value_counts())
print()
print('Emission_Norm Nulls', df['Emission_Norm'].isnull().sum())

Emission_Norm Value Counts Emission_Norm
BS IV     870
BS 6      358
BS VI      19
BS III      9
Name: count, dtype: int64

Emission_Norm Nulls 11


In [49]:
df["Emission_Norm"].fillna(df["Emission_Norm"].mode()[0], inplace=True)

print(df['Emission_Norm'].isnull().sum())
print()
print(df['Emission_Norm'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Emission_Norm
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Engine_Location

In [50]:
print('Engine_Location Value Counts', df['Engine_Location'].value_counts())
print()
print('Engine_Location Nulls', df['Engine_Location'].isnull().sum())

Engine_Location Value Counts Engine_Location
Front, Transverse       796
Front, Longitudinal     404
Rear, Transverse         16
Rear Mid, Transverse      3
Mid, Longitudinal         2
Mid, Transverse           2
Rear, Longitudinal        1
Name: count, dtype: int64

Engine_Location Nulls 43


In [51]:
df["Engine_Location"].fillna(df["Engine_Location"].mode()[0], inplace=True)

print(df['Engine_Location'].isnull().sum())
print()
print(df['Engine_Location'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Engine_Location
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Fuel_System

In [52]:
print('Fuel_System Value Counts', df['Fuel_System'].value_counts())
print()
print('Fuel_Systemn Nulls', df['Fuel_System'].isnull().sum())

Fuel_System Value Counts Fuel_System
Injection    1255
PGM - Fi        4
Name: count, dtype: int64

Fuel_Systemn Nulls 8


In [53]:
df["Fuel_System"].fillna(df["Fuel_System"].mode()[0], inplace=True)

print(df['Fuel_System'].isnull().sum())
print()
print(df['Fuel_System'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Fuel_System
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Fuel_Tank_Capacity

In [54]:
print('Fuel_Tank_Capacity Information')
print('Nulls:', df['Fuel_Tank_Capacity'].isnull().sum())
print('Data Type:', df['Fuel_Tank_Capacity'].dtype)

Fuel_Tank_Capacity Information
Nulls: 69
Data Type: object


In [55]:
df['Fuel_Tank_Capacity'] = df['Fuel_Tank_Capacity'].str.replace('litres','')
df['Fuel_Tank_Capacity'] = df['Fuel_Tank_Capacity'].astype(float)

print('Data Type:', df['Fuel_Tank_Capacity'].dtype)

Data Type: float64


In [56]:
print('Fuel_Tank_Capacity Mean', df['Fuel_Tank_Capacity'].mean())
print('Fuel_Tank_Capacity Median', df['Fuel_Tank_Capacity'].median())
print('Fuel_Tank_Capacity Mode', df['Fuel_Tank_Capacity'].mode())

Fuel_Tank_Capacity Mean 51.98489148580968
Fuel_Tank_Capacity Median 46.0
Fuel_Tank_Capacity Mode 0    45.0
Name: Fuel_Tank_Capacity, dtype: float64


In [57]:
df["Fuel_Tank_Capacity"].fillna(df["Fuel_Tank_Capacity"].median(), inplace=True)

print(df['Fuel_Tank_Capacity'].isnull().sum())
print()
print(df['Fuel_Tank_Capacity'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Fuel_Tank_Capacity
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Fuel_Type

In [58]:
print('Fuel_Type Value Counts', df['Fuel_Type'].value_counts())
print()
print('Fuel_Type Nulls', df['Fuel_Type'].isnull().sum())
print()
print(df['Fuel_Type'].info())

Fuel_Type Value Counts Fuel_Type
Petrol          642
Diesel          574
CNG              16
Hybrid           15
Electric         14
CNG + Petrol      6
Name: count, dtype: int64

Fuel_Type Nulls 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Fuel_Type
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Height

In [59]:
print('Height Information')
print('Nulls:', df['Height'].isnull().sum())
print('Data Type:', df['Height'].dtype)

Height Information
Nulls: 1
Data Type: object


In [60]:
df['Height'] = df['Height'].str.replace('mm','')
df['Height'] = df['Height'].astype(float)

print('Height Data Type:', df['Height'].dtype)

Height Data Type: float64


In [61]:
print('Height Mean', df['Height'].mean())
print('Height Median', df['Height'].median())
print('Height Mode', df['Height'].mode())

Height Mean 1589.715517377567
Height Median 1555.0
Height Mode 0    1520.0
Name: Height, dtype: float64


In [62]:
df["Height"].fillna(df["Height"].median(), inplace=True)

print(df['Height'].isnull().sum())
print()
print(df['Height'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Height
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Length

In [63]:
print('Length Information')
print('Nulls:', df['Length'].isnull().sum())
print('Data Type:', df['Length'].dtype)
print()
print('Info', df['Height'].info())

Length Information
Nulls: 0
Data Type: object

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Height
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
Info None


In [64]:
df['Length'] = df['Length'].str.replace('mm','')
df['Length'] = df['Length'].astype(float)

print('Data Type:', df['Length'].dtype)

Data Type: float64


## Width

In [65]:
print('Width Information')
print('Nulls:', df['Width'].isnull().sum())
print('Data Type:', df['Width'].dtype)

Width Information
Nulls: 12
Data Type: object


In [66]:
df['Width'] = df['Width'].str.replace('mm','')
df['Width'] = df['Width'].astype(float)

print('Data Type:', df['Width'].dtype)

Data Type: float64


In [67]:
print('Width Mean', df['Width'].mean())
print('Width Median', df['Width'].median())
print('Width Mode', df['Width'].mode())

Width Mean 1787.8229840637448
Width Median 1770.0
Width Mode 0    1695.0
Name: Width, dtype: float64


In [68]:
df["Height"].fillna(df["Height"].median(), inplace=True)

print(df['Height'].isnull().sum())
print()
print(df['Height'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Height
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Body_Type

In [69]:
print('Body_Type Value Counts', df['Body_Type'].value_counts())
print()
print('Body_Type Nulls', df['Body_Type'].isnull().sum())
print()
print(df['Body_Type'].info())

Body_Type Value Counts Body_Type
SUV                    438
Sedan                  333
Hatchback              316
Coupe                   41
MUV                     39
MPV                     39
Convertible             20
Crossover               18
Sports                   3
Pick-up                  3
Sports, Convertible      2
Sedan, Coupe             2
Crossover, SUV           2
SUV, Crossover           2
Sports, Hatchback        1
Sedan, Crossover         1
Coupe, Convertible       1
Name: count, dtype: int64

Body_Type Nulls 6

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Body_Type
Non-Null Count  Dtype 
--------------  ----- 
1261 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


In [70]:
df["Body_Type"].fillna(df["Body_Type"].mode()[0], inplace=True)

print(df['Body_Type'].isnull().sum())
print()
print(df['Body_Type'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Body_Type
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Doors

In [71]:
print('Doors Information')
print('Nulls:', df['Doors'].isnull().sum())
print('Data Type:', df['Doors'].dtype)
print()
print('Info', df['Doors'].info())

Doors Information
Nulls: 4
Data Type: float64

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Doors
Non-Null Count  Dtype  
--------------  -----  
1263 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
Info None


In [72]:
print('Doors Mean', df['Doors'].mean())
print('Doors Median', df['Doors'].median())
print('Doors Mode', df['Doors'].mode())

Doors Mean 4.547110055423595
Doors Median 5.0
Doors Mode 0    5.0
Name: Doors, dtype: float64


In [73]:
df["Doors"].fillna(df["Doors"].median(), inplace=True)

print(df['Doors'].isnull().sum())
print()
print(df['Doors'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Doors
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## City_Mileage

In [74]:
print('City_Mileage Information')
print('Nulls:', df['City_Mileage'].isnull().sum())
print('Data Type:', df['City_Mileage'].dtype)

City_Mileage Information
Nulls: 554
Data Type: object


In [75]:
df['City_Mileage'] = df['City_Mileage'].str.replace('?','')
df['City_Mileage'] = df['City_Mileage'].str.replace('km/litre','')
df['City_Mileage'] = df['City_Mileage'].str.replace(',','.')

In [76]:
df['City_Mileage'] = df['City_Mileage'].str.replace('12.5-12.7','12.6')

In [77]:
df['City_Mileage'] = df['City_Mileage'].astype(float)

print('Data Type:', df['City_Mileage'].dtype)

Data Type: float64


In [78]:
print('City_Mileage Mean', df['City_Mileage'].mean())
print('City_Mileage Median', df['City_Mileage'].median())
print('City_Mileage Mode', df['City_Mileage'].mode())

City_Mileage Mean 52.28705469845722
City_Mileage Median 15.1
City_Mileage Mode 0    12.0
Name: City_Mileage, dtype: float64


In [79]:
df["City_Mileage"].fillna(df["City_Mileage"].median(), inplace=True)

print(df['City_Mileage'].isnull().sum())
print()
print(df['City_Mileage'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: City_Mileage
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Highway_Mileage

In [80]:
print('Highway_Mileage Information')
print('Nulls:', df['Highway_Mileage'].isnull().sum())
print('Data Type:', df['Highway_Mileage'].dtype)

Highway_Mileage Information
Nulls: 792
Data Type: object


In [81]:
df['Highway_Mileage'] = df['Highway_Mileage'].str.replace('km/litre','')

In [82]:
df['Highway_Mileage'] = df['Highway_Mileage'].str.replace('8.3-8.4 ','8.3')

In [83]:
df['Highway_Mileage'] = df['Highway_Mileage'].astype(float)

print('Data Type:', df['Highway_Mileage'].dtype)

Data Type: float64


In [84]:
print('Highway_Mileage Mean', df['Highway_Mileage'].mean())
print('Highway_Mileage Median', df['Highway_Mileage'].median())
print('Highway_Mileage Mode', df['Highway_Mileage'].mode())

Highway_Mileage Mean 16.88701052631579
Highway_Mileage Median 17.19
Highway_Mileage Mode 0    22.0
Name: Highway_Mileage, dtype: float64


In [85]:
df["Highway_Mileage"].fillna(df["Highway_Mileage"].median(), inplace=True)

print(df['Highway_Mileage'].isnull().sum())
print()
print(df['Highway_Mileage'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Highway_Mileage
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## ARAI Certificate Mileage + CNG
Upon visual inspection, the ARAI Certificate Mileage column calculates the values in km/litres whereas the ARAI Certificate Mileage for CNG calculates in km/kg.

Furthermore, if the CNG column is filled, the regular mileage column is null and wise versa.

Therefore, according to the needs of these two columns, we will remove the extra texts from both columns and combine them into one to leave no null values.

The original columns will be deleted and the new column will be placed at their index value.

The null values (cells which were empty in both the original columns) will be then filled as per the procedures followed above.

In [86]:
# Removing extra texts
df['ARAI_Certified_Mileage'] = df['ARAI_Certified_Mileage'].str.replace(' km/litre', '').str.replace(' km/kg', '')
df['ARAI_Certified_Mileage_for_CNG'] = df['ARAI_Certified_Mileage_for_CNG'].str.replace(' km/litre', '').str.replace(' km/kg', '')

In [87]:
# Combining the two columns and making a new column at col. index 22
df.insert(22, 'Combined_ARIA_Mileage', df['ARAI_Certified_Mileage'].fillna('') 
          + df['ARAI_Certified_Mileage_for_CNG'].fillna(''))

#Deleting original columns
df.drop(['ARAI_Certified_Mileage', 'ARAI_Certified_Mileage_for_CNG'], axis=1, inplace=True)

In [88]:
df['Combined_ARIA_Mileage']

0        23.6
1        23.6
2          36
3        21.9
4        23.6
        ...  
1271     25.1
1272     25.1
1273     22.6
1274     17.8
1275    11.56
Name: Combined_ARIA_Mileage, Length: 1267, dtype: object

In [89]:
print('Combined_ARIA_Mileage Information')
print('Nulls:', df['Combined_ARIA_Mileage'].isnull().sum())
print('Data Type:', df['Combined_ARIA_Mileage'].dtype)

Combined_ARIA_Mileage Information
Nulls: 0
Data Type: object


In [90]:
df['Combined_ARIA_Mileage'] = pd.to_numeric(df['Combined_ARIA_Mileage'], errors='coerce')
df['Combined_ARIA_Mileage'] = df['Combined_ARIA_Mileage'].astype(float)

In [91]:
print('Combined_ARIA_Mileage Information')
print('Nulls:', df['Combined_ARIA_Mileage'].isnull().sum())
print('Data Type:', df['Combined_ARIA_Mileage'].dtype)

Combined_ARIA_Mileage Information
Nulls: 92
Data Type: float64


In [92]:
print('Combined_ARIA_Mileage Mean', df['Combined_ARIA_Mileage'].mean())
print('Combined_ARIA_Mileage Median', df['Combined_ARIA_Mileage'].median())
print('Combined_ARIA_Mileage Mode', df['Combined_ARIA_Mileage'].mode())

Combined_ARIA_Mileage Mean 20.03491914893617
Combined_ARIA_Mileage Median 18.27
Combined_ARIA_Mileage Mode 0    23.0
Name: Combined_ARIA_Mileage, dtype: float64


In [93]:
df["Combined_ARIA_Mileage"].fillna(df["Combined_ARIA_Mileage"].mean(), inplace=True)

print(df['Combined_ARIA_Mileage'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Combined_ARIA_Mileage
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Kerb_Weight

In [94]:
print('Kerb_Weight Information')
print('Nulls:', df['Kerb_Weight'].isnull().sum())
print('Data Type:', df['Kerb_Weight'].dtype)
print()
print('Info', df['Kerb_Weight'].info())

Kerb_Weight Information
Nulls: 365
Data Type: object

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Kerb_Weight
Non-Null Count  Dtype 
--------------  ----- 
902 non-null    object
dtypes: object(1)
memory usage: 19.8+ KB
Info None


In [95]:
df['Kerb_Weight'] = df['Kerb_Weight'].str.replace('kg','')
df['Kerb_Weight'] = df['Kerb_Weight'].str.replace('1016-1043','')

df['Kerb_Weight'] = pd.to_numeric(df['Kerb_Weight'], errors='coerce')
df['Kerb_Weight'].dtype

dtype('float64')

In [96]:
print('Kerb_Weight Mean', df['Kerb_Weight'].mean())
print('Kerb_Weight Median', df['Kerb_Weight'].median())
print('Kerb_Weight Mode', df['Kerb_Weight'].mode())

Kerb_Weight Mean 1387.8539325842696
Kerb_Weight Median 1228.0
Kerb_Weight Mode 0    935.0
Name: Kerb_Weight, dtype: float64


In [97]:
df["Kerb_Weight"].fillna(df["Kerb_Weight"].median(), inplace=True)

print(df['Kerb_Weight'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Kerb_Weight
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Gears

In [98]:
print('Gears Information')
print('Nulls:', df['Gears'].isnull().sum())
print('Data Type:', df['Gears'].dtype)
print()
print('Info', df['Gears'].info())

Gears Information
Nulls: 105
Data Type: object

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Gears
Non-Null Count  Dtype 
--------------  ----- 
1162 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
Info None


In [99]:
df['Gears'] = df['Gears'].str.replace('7 Dual Clutch','7')

In [100]:
df['Gears'] = df['Gears'].str.replace('Single Speed Reduction Gear','1')

In [101]:
df['Gears'] = df['Gears'].astype(float)

In [102]:
print('Gears Mean', df['Gears'].mean())
print('Gears Median', df['Gears'].median())
print('Gears Mode', df['Gears'].mode())

Gears Mean 5.8752151462994835
Gears Median 5.0
Gears Mode 0    5.0
Name: Gears, dtype: float64


In [103]:
df["Gears"].fillna(df["Gears"].median(), inplace=True)

print(df['Gears'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Gears
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Ground_Clearance

In [104]:
print('Ground_Clearance Information')
print('Nulls:', df['Ground_Clearance'].isnull().sum())
print('Data Type:', df['Ground_Clearance'].dtype)

Ground_Clearance Information
Nulls: 289
Data Type: object


In [105]:
df['Ground_Clearance'] = df['Ground_Clearance'].str.replace('mm','')
df['Ground_Clearance'] = df['Ground_Clearance'].astype(float)

print('Data Type:', df['Ground_Clearance'].dtype)

Data Type: float64


In [106]:
print('Ground_Clearance Mean', df['Ground_Clearance'].mean())
print('Ground_Clearance Median', df['Ground_Clearance'].median())
print('Ground_Clearance Mode', df['Ground_Clearance'].mode())

Ground_Clearance Mean 179.42126789366054
Ground_Clearance Median 170.0
Ground_Clearance Mode 0    165.0
Name: Ground_Clearance, dtype: float64


In [107]:
df["Ground_Clearance"].fillna(df["Ground_Clearance"].median(), inplace=True)

print(df['Ground_Clearance'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Ground_Clearance
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Front_Brakes

In [108]:
print('Front_Brakes Value Counts', df['Front_Brakes'].value_counts())
print()
print('Front_Brakes Nulls', df['Front_Brakes'].isnull().sum())

Front_Brakes Value Counts Front_Brakes
Ventilated Disc    1159
Solid Disc           73
Drum                 10
Name: count, dtype: int64

Front_Brakes Nulls 25


In [109]:
df["Front_Brakes"].fillna(df["Front_Brakes"].mode()[0], inplace=True)

print(df['Front_Brakes'].isnull().sum())
print()
print(df['Front_Brakes'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Front_Brakes
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Rear_Brakes

In [110]:
print('Rear_Brakes Value Counts', df['Rear_Brakes'].value_counts())
print()
print('Rear_Brakes Nulls', df['Rear_Brakes'].isnull().sum())

Rear_Brakes Value Counts Rear_Brakes
Drum               782
Ventilated Disc    417
Solid Disc          43
Name: count, dtype: int64

Rear_Brakes Nulls 25


In [111]:
df["Rear_Brakes"].fillna(df["Rear_Brakes"].mode()[0], inplace=True)

print(df['Rear_Brakes'].isnull().sum())
print()
print(df['Rear_Brakes'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Rear_Brakes
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Front_Suspension

In [112]:
print('Front_Suspension Value Counts', df['Front_Suspension'].value_counts())
print()
print('Front_Suspension Nulls', df['Front_Suspension'].isnull().sum())

Front_Suspension Value Counts Front_Suspension
MacPherson Strut                                            119
McPherson Strut                                              89
McPherson Strut, Coil Spring                                 54
McPherson strut with coil spring                             53
Independent McPherson                                        53
                                                           ... 
Twin wishbones                                                1
MacPherson strut type (Front)                                 1
adaptive air suspension                                       1
3-link with McPherson struts, Torsion bar & coil springs      1
McPherson strut,coil spring                                   1
Name: count, Length: 140, dtype: int64

Front_Suspension Nulls 59


In [113]:
df["Front_Suspension"].fillna(df["Front_Suspension"].mode()[0], inplace=True)

print(df['Front_Suspension'].isnull().sum())
print()
print(df['Front_Suspension'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Front_Suspension
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Rear_Suspension

In [114]:
print('Rear_Suspension Value Counts', df['Rear_Suspension'].value_counts())
print()
print('Rear_Suspension Nulls', df['Rear_Suspension'].isnull().sum())

Rear_Suspension Value Counts Rear_Suspension
Torsion Beam                                                       123
Coupled Torsion Beam Axle                                           57
Twist Beam with Coil spring and Shock Absorber                      53
Coupled Torsion Beam Axle with coil spring                          50
Torsion Beam Coil Spring                                            37
                                                                  ... 
Independent rear suspensions with coil-over springs                  1
multi-link set-up                                                    1
rear multi link,Coil spring, anti roll bar and adaptive dampers      1
adaptive air suspension                                              1
Multi-link coil springs with stabilizer bar                          1
Name: count, Length: 173, dtype: int64

Rear_Suspension Nulls 46


In [115]:
df["Rear_Suspension"].fillna(df["Rear_Suspension"].mode()[0], inplace=True)

print(df['Rear_Suspension'].isnull().sum())
print()
print(df['Rear_Suspension'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Rear_Suspension
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Front_Track and Rear_Track

In [116]:
print('Front_Track Information')
print('Nulls:', df['Front_Track'].isnull().sum())
print('Data Type:', df['Front_Track'].dtype)
print()
print('Rear_Track Information')
print('Nulls:', df['Rear_Track'].isnull().sum())
print('Data Type:', df['Rear_Track'].dtype)

Front_Track Information
Nulls: 667
Data Type: object

Rear_Track Information
Nulls: 676
Data Type: object


In [117]:
df['Front_Track'] = df['Front_Track'].str.replace('mm','')
df['Front_Track'] = df['Front_Track'].str.replace(',','')
df['Front_Track'] = df['Front_Track'].str.replace('1476 (R13) ','1476')
df['Front_Track'] = df['Front_Track'].astype(float)

print('Front_Track Data Type:', df['Front_Track'].dtype)

df['Rear_Track'] = df['Rear_Track'].str.replace('mm','')
df['Rear_Track'] = df['Rear_Track'].str.replace('1,494 (R13) ','')
df['Rear_Track'] = pd.to_numeric(df['Rear_Track'], errors='coerce')
df['Rear_Track'] = df['Rear_Track'].astype(float)

print('Rear_Track Data Type:', df['Rear_Track'].dtype)

Front_Track Data Type: float64
Rear_Track Data Type: float64


In [118]:
print('Front_Track Mean', df['Front_Track'].mean())
print('Front_Track Median', df['Front_Track'].median())
print('Front_Track Mode', df['Front_Track'].mode())
print()
print('Rear_Track Mean', df['Rear_Track'].mean())
print('Rear_Track Median', df['Rear_Track'].median())
print('Rear_Track Mode', df['Rear_Track'].mode())

Front_Track Mean 1527.0446666666667
Front_Track Median 1520.0
Front_Track Mode 0    1490.0
Name: Front_Track, dtype: float64

Rear_Track Mean 1526.4322033898304
Rear_Track Median 1525.0
Rear_Track Mode 0    1540.0
Name: Rear_Track, dtype: float64


In [119]:
df["Front_Track"].fillna(df["Front_Track"].median(), inplace=True)
print(df['Front_Track'].info())

df["Rear_Track"].fillna(df["Rear_Track"].median(), inplace=True)
print(df['Rear_Track'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Front_Track
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None
<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Rear_Track
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Front and Rear Tyre & Rim

In [120]:
print('Front_Tyre_&_Rim Value Counts', df['Front_Tyre_&_Rim'].value_counts())
print()
print('Front_Tyre_&_Rim Nulls', df['Front_Tyre_&_Rim'].isnull().sum())
print()
print('Rear_Tyre_&_Rim Value Counts', df['Rear_Tyre_&_Rim'].value_counts())
print()
print('Rear_Tyre_&_Rim Nulls', df['Rear_Tyre_&_Rim'].isnull().sum())

Front_Tyre_&_Rim Value Counts Front_Tyre_&_Rim
185/60R15     66
185/65R15     43
175/65R14     38
165/70R14     32
205/65R16     28
              ..
255/35         1
245/30 R20     1
275/35 R19     1
265/35ZR19     1
115/90R17      1
Name: count, Length: 234, dtype: int64

Front_Tyre_&_Rim Nulls 49

Rear_Tyre_&_Rim Value Counts Rear_Tyre_&_Rim
185/60R15        66
185/65R15        43
175/65R14        38
165/70R14        32
205/65R16        28
                 ..
285/35 R20        1
275/40 R22        1
275/35R20         1
315 / 35 ZR21     1
255/35 R18        1
Name: count, Length: 241, dtype: int64

Rear_Tyre_&_Rim Nulls 48


In [121]:
df["Front_Tyre_&_Rim"].fillna(df["Front_Tyre_&_Rim"].mode()[0], inplace=True)

print('Front_Tyre_&_Rim Nulls:',df['Front_Tyre_&_Rim'].isnull().sum())
print()
print(df['Front_Tyre_&_Rim'].info())
print('_______________')


df["Rear_Tyre_&_Rim"].fillna(df["Rear_Tyre_&_Rim"].mode()[0], inplace=True)

print('Rear_Tyre_&_Rim Nulls:',df['Rear_Tyre_&_Rim'].isnull().sum())
print()
print(df['Rear_Tyre_&_Rim'].info())

Front_Tyre_&_Rim Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Front_Tyre_&_Rim
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None
_______________
Rear_Tyre_&_Rim Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Rear_Tyre_&_Rim
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Power_Steering

In [122]:
print(df['Power_Steering'].value_counts())
print()
print('Power_Steering Nulls', df['Power_Steering'].isnull().sum())

Power_Steering
Electric Power                     916
Electro-Hydraulic                  137
Yes                                 85
Hydraulic Power                     71
Electric Power, Hydraulic Power      1
Name: count, dtype: int64

Power_Steering Nulls 57


Along with 57 nulls, we have a unique value named 'Yes' with 85 counts. 

Furthermore, small cars may not have power steering at all and the presence of yes suggests the existence of a 'no' category. Therefore, instead of mode, the nulls will be replaced by 'No'

In [123]:
df["Power_Steering"].fillna('No', inplace=True)
print(df['Power_Steering'].isnull().sum())
print()
print(df['Power_Steering'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Power_Steering
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Power_Windows

In [124]:
print(df['Power_Windows'].value_counts())
print()
print('Power_Windows Nulls', df['Power_Windows'].isnull().sum())

Power_Windows
All Windows           1026
Only Front Windows     144
Name: count, dtype: int64

Power_Windows Nulls 97


Many cars, even today do not have power windows in India. Therefore, instead of mode, the nulls will be replaced by 'No'

In [125]:
df["Power_Windows"].fillna('No', inplace=True)

print(df['Power_Windows'].isnull().sum())
print()
print(df['Power_Windows'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Power_Windows
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Power_Seats

In [126]:
print(df['Power_Seats'].value_counts())
print()
print('Power_Seats Nulls', df['Power_Seats'].isnull().sum())

Power_Seats
Yes                 208
Yes, with memory    108
Power seats          66
Name: count, dtype: int64

Power_Seats Nulls 885


Many cars do not have power seats in India. Therefore, instead of mode, the nulls will be replaced by 'No'

In [127]:
df["Power_Seats"].fillna('No', inplace=True)

print(df['Power_Seats'].isnull().sum())
print()
print(df['Power_Seats'].info())

0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Power_Seats
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Keyless_Entry

In [128]:
print(df['Keyless_Entry'].value_counts())
print()
print('Keyless_Entry Nulls', df['Keyless_Entry'].isnull().sum())

Keyless_Entry
Smart Key            449
Remote               421
Yes                  101
Remote, Smart Key     21
Smart Key, Remote      1
Name: count, dtype: int64

Keyless_Entry Nulls 274


Many cars do not have keyless entry. Therefore, instead of mode, the nulls will be replaced by 'No'


In [129]:
df['Keyless_Entry'] = df['Keyless_Entry'].str.replace('Smart Key, Remote','Remote, Smart Key')

df["Keyless_Entry"].fillna('No', inplace=True)
print('Value counts' ,df['Keyless_Entry'].value_counts())
print('Nulls:',df['Keyless_Entry'].isnull().sum())
print()
print(df['Keyless_Entry'].info())

Value counts Keyless_Entry
Smart Key            449
Remote               421
No                   274
Yes                  101
Remote, Smart Key     22
Name: count, dtype: int64
Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Keyless_Entry
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Power

In [130]:
print(df['Power'].value_counts())
print()
print('Power Nulls', df['Power'].isnull().sum())
print()
print(df['Power'].info())

Power
83PS@6000rpm           52
75PS@4000rpm           37
90PS@4000rpm           29
100PS@3750RPM          28
68PS@6000rpm           25
                       ..
248bhp@5200rpm          1
268PS@6350rpm           1
258bhp@5800-6100rpm     1
197hp@4500-6500rpm      1
202ps@3800rpm           1
Name: count, Length: 372, dtype: int64

Power Nulls 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Power
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Torque

In [131]:
print(df['Torque'].value_counts())
print()
print('Torque Nulls', df['Torque'].isnull().sum())

Torque
200Nm@1750rpm         42
90Nm@3500rpm          42
114Nm@4000rpm         27
350Nm@1750-2500rpm    26
104Nm@4000rpm         22
                      ..
340 Nm @ 1750 rpm      1
280Nm@1350-4600rpm     1
400 Nm @ 1750 RPM      1
400NM@1550rpm          1
441Nm@2000rpm          1
Name: count, Length: 342, dtype: int64

Torque Nulls 2


In [132]:
df["Torque"].fillna(df["Torque"].mode()[0], inplace=True)

print(df['Torque'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Torque
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Odometer

In [133]:
print(df['Odometer'].value_counts())
print()
print('Odometer Nulls', df['Odometer'].isnull().sum())

Odometer
Digital            1051
Analog               90
Yes                  83
Digital, Analog       1
Name: count, dtype: int64

Odometer Nulls 42


All cars do have an Odometer, hence we will replace the nulls with the mode

In [134]:
df["Odometer"].fillna(df["Odometer"].mode()[0], inplace=True)

print(df['Odometer'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Odometer
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Speedometer

In [135]:
print(df['Speedometer'].value_counts())
print()
print('Speedometer Nulls', df['Speedometer'].isnull().sum())

Speedometer
Analog             974
Digital            111
Yes                 79
Analog, Digital     57
Digital, Analog      2
Name: count, dtype: int64

Speedometer Nulls 44


All cars do have an Speedometer, hence we will replace the nulls with the mode

In [136]:
df['Speedometer'] = df['Speedometer'].str.replace('Digital, Analog','Analog, Digital')

df["Speedometer"].fillna(df["Speedometer"].mode()[0], inplace=True)

print('Value Counts',df['Speedometer'].value_counts())
print('_____________________')
print(df['Speedometer'].info())

Value Counts Speedometer
Analog             1018
Digital             111
Yes                  79
Analog, Digital      59
Name: count, dtype: int64
_____________________
<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Speedometer
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Tachometer

In [137]:
print(df['Tachometer'].value_counts())
print()
print('Tachometer Nulls', df['Tachometer'].isnull().sum())

Tachometer
Analog                  933
Digital                 112
Yes                     109
Not on offer             72
Analog, Digital          22
Digital, Analog           5
Analog, Not on offer      1
Name: count, dtype: int64

Tachometer Nulls 13


All cars do have an Tachometer, hence we will replace the nulls with the mode

In [138]:
df['Tachometer'] = df['Tachometer'].str.replace('Digital, Analog','Analog, Digital')

df['Tachometer'] = df['Tachometer'].str.replace('Analog, Not on offer','Not on offer')

df["Tachometer"].fillna(df["Tachometer"].mode()[0], inplace=True)

print('Value Counts',df['Tachometer'].value_counts())
print('_____________________')
print(df['Tachometer'].info())

Value Counts Tachometer
Analog             946
Digital            112
Yes                109
Not on offer        73
Analog, Digital     27
Name: count, dtype: int64
_____________________
<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Tachometer
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Tripmeter

In [139]:
print(df['Tripmeter'].value_counts())
print()
print('Tripmeter Nulls', df['Tripmeter'].isnull().sum())

Tripmeter
Yes     978
2       197
1        26
1, 2      7
Name: count, dtype: int64

Tripmeter Nulls 59


A Tripmeter can record short trip distances and there can be upto two installed. Not all cars have it, therefore, nulls will be replaced with No


In [140]:
df['Tripmeter'] = df['Tripmeter'].str.replace('1, 2','2')

df["Tripmeter"].fillna('No', inplace=True)
print('Value counts' ,df['Tripmeter'].value_counts())
print()
print('Nulls:',df['Tripmeter'].isnull().sum())
print()
print(df['Tripmeter'].info())

Value counts Tripmeter
Yes    978
2      204
No      59
1       26
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Tripmeter
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Seating_Capacity

In [141]:
print(df['Seating_Capacity'].value_counts())
print()
print('Seating_Capacity Nulls', df['Seating_Capacity'].isnull().sum())

Seating_Capacity
5.0     915
7.0     174
4.0      70
2.0      39
6.0      26
9.0      19
8.0      17
16.0      1
Name: count, dtype: int64

Seating_Capacity Nulls 6


In [142]:
df["Seating_Capacity"].fillna(df["Seating_Capacity"].mode()[0], inplace=True)
print(df['Seating_Capacity'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Seating_Capacity
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Seats_Material

In [143]:
print(df['Seats_Material'].value_counts())
print()
print('Seats_Material Nulls', df['Seats_Material'].isnull().sum())

Seats_Material
Fabric          737
Leather         505
Vinyl             9
Polyurethene      4
Name: count, dtype: int64

Seats_Material Nulls 12


In [144]:
df["Seats_Material"].fillna(df["Seats_Material"].mode()[0], inplace=True)
print(df['Seats_Material'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Seats_Material
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Type

In [145]:
print(df['Type'].value_counts())
print()
print('Type Nulls', df['Type'].isnull().sum())

Type
Manual       721
Automatic    517
AMT           18
DCT            7
CVT            3
Name: count, dtype: int64

Type Nulls 1


In [146]:
df["Type"].fillna(df["Type"].mode()[0], inplace=True)
print(df['Type'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Type
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Wheelbase

In [147]:
print('Wheelbase Information')
print('Nulls:', df['Wheelbase'].isnull().sum())
print('Data Type:', df['Wheelbase'].dtype)


Wheelbase Information
Nulls: 20
Data Type: object


In [148]:
df['Wheelbase'] = df['Wheelbase'].str.replace('mm','')
df['Wheelbase'] = df['Wheelbase'].astype(float)
print('Wheelbase Data Type:', df['Wheelbase'].dtype)

Wheelbase Data Type: float64


In [149]:
print('Wheelbase Mean', df['Wheelbase'].mean())
print('Wheelbase Median', df['Wheelbase'].median())
print('Wheelbase Mode', df['Wheelbase'].mode())

Wheelbase Mean 2631.0
Wheelbase Median 2600.0
Wheelbase Mode 0    2450.0
Name: Wheelbase, dtype: float64


In [150]:
df["Wheelbase"].fillna(df["Wheelbase"].median(), inplace=True)
print(df['Wheelbase'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Wheelbase
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Wheels_Size

In [151]:
print(df['Wheels_Size'].value_counts())
print()
print('Wheels_Size Nulls', df['Wheels_Size'].isnull().sum())

Wheels_Size
185/60R15     67
185/65R15     43
175/65R14     38
165/70R14     32
205/65R16     28
              ..
115/90R17      1
275/35 R20     1
275/50 R20     1
285/40ZR19     1
245/45R18      1
Name: count, Length: 209, dtype: int64

Wheels_Size Nulls 56


In [152]:
df["Wheels_Size"].fillna(df["Wheels_Size"].mode()[0], inplace=True)
print(df['Wheels_Size'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Wheels_Size
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## 12v_Power_Outlet

In [153]:
print(df['12v_Power_Outlet'].value_counts())
print()
print('12v_Power_Outlet Nulls', df['12v_Power_Outlet'].isnull().sum())

12v_Power_Outlet
Yes    883
2      133
1       58
3       34
4        6
Name: count, dtype: int64

12v_Power_Outlet Nulls 153


Many cars do not have a 12v_Power_Outlet. Therefore, instead of mode, the nulls will be replaced by 'No'

In [154]:
df["12v_Power_Outlet"].fillna('No', inplace=True)
print('Value counts' ,df['12v_Power_Outlet'].value_counts())
print('Nulls:',df['12v_Power_Outlet'].isnull().sum())
print()
print(df['12v_Power_Outlet'].info())

Value counts 12v_Power_Outlet
Yes    883
No     153
2      133
1       58
3       34
4        6
Name: count, dtype: int64
Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: 12v_Power_Outlet
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Audiosystem

In [155]:
print(df['Audiosystem'].value_counts())
print()
print('Audiosystem Nulls', df['Audiosystem'].isnull().sum())

Audiosystem
CD Player with USB & Aux-in            435
CD/MP3/DVD Player with USB & Aux-in    367
Not on offer                           213
DVD Player with USB & Aux-in           101
USB & Aux-in                            84
CD/MP3 Player                           20
CD Player with Aux-in                    2
CD Player with USB Only                  1
Name: count, dtype: int64

Audiosystem Nulls 44


In [156]:
df["Audiosystem"].fillna(df["Audiosystem"].mode()[0], inplace=True)
print(df['Audiosystem'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Audiosystem
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Basic_Warranty

In [157]:
print(df['Basic_Warranty'].value_counts())
print()
print('Basic_Warranty Nulls', df['Basic_Warranty'].isnull().sum())

Basic_Warranty
2 years / Unlimited Kms                                            132
3 years / Unlimited Kms                                            118
2 years /40000 Kms (years/distance whichever comes first)          110
3 Years/1,00,000 Kms (Whichever comes earlier)                      77
3 years /100000 Kms (years/distance whichever comes first)          76
2 Years / 40,000 Kms (Whichever comes earlier)                      54
2 Years / 100,000 Kms (whichever comes first)                       51
2 years /75000 Kms (years/distance whichever comes first)           31
2 years /1,00,000 Kms (years/distance whichever comes first)        30
24 months /50000 Kms (whichever comes first)                        20
2 years /50000 Kms (years/distance whichever comes first)           18
2 Years / 75,000 KM (whichever is earlier)                          17
2 Years / 50,000 Kms (Whichever comes earlier)                      16
3 Years / 100000 km (whichever comes first)                   

Due to the highly varied nature of basic warranties, we will fill the nulls with empty strings.

In [158]:
df["Basic_Warranty"].fillna('', inplace=True)
print(df['Basic_Warranty'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Basic_Warranty
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Boot-lid_Opener

In [159]:
print(df['Boot-lid_Opener'].value_counts())
print()
print('Boot-lid_Opener Nulls', df['Boot-lid_Opener'].isnull().sum())

Boot-lid_Opener
Internal                         535
With Remote, Internal            212
Internal, With Remote            204
Manual                           171
With Remote                       63
Manual, With Remote               26
Manual, Internal                  22
With Remote, Manual               12
With Remote, Manual, Internal      3
Internal, Manual                   3
Manual, Internal, With Remote      1
Manual, With Remote, Internal      1
Name: count, dtype: int64

Boot-lid_Opener Nulls 14


In [160]:
df["Boot-lid_Opener"].fillna(df["Boot-lid_Opener"].mode()[0], inplace=True)
print(df['Boot-lid_Opener'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Boot-lid_Opener
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Boot_Space

In [161]:
print('Boot_Space Information')
print('Nulls:', df['Boot_Space'].isnull().sum())
print('Data Type:', df['Boot_Space'].dtype)


Boot_Space Information
Nulls: 248
Data Type: object


In [162]:
df['Boot_Space'] = df['Boot_Space'].str.replace('litres','')
df['Boot_Space'] = df['Boot_Space'].str.replace('209(All3RowsUp).550(3rdRowFolded)&803(2ndRowand3rdRowFolded) ','')
df['Boot_Space'] = pd.to_numeric(df['Boot_Space'], errors='coerce')

df['Boot_Space'] = df['Boot_Space'].astype(float)
print('Boot_Space Data Type:', df['Boot_Space'].dtype)

Boot_Space Data Type: float64


In [163]:
print('Boot_Space Mean', df['Boot_Space'].mean())
print('Boot_Space Median', df['Boot_Space'].median())
print('Boot_Space Mode', df['Boot_Space'].mode())

Boot_Space Mean 391.4866336633663
Boot_Space Median 378.0
Boot_Space Mode 0    350.0
Name: Boot_Space, dtype: float64


In [164]:
df["Boot_Space"].fillna(df["Boot_Space"].median(), inplace=True)
print(df['Boot_Space'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Boot_Space
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Clock

In [165]:
print(df['Clock'].value_counts())
print()
print('Clock Nulls', df['Clock'].isnull().sum())

Clock
Digital            1049
Analog               41
Yes                  39
Digital, Analog       4
Name: count, dtype: int64

Clock Nulls 134


Since all modern cars have clocks, we will replace the nulls with the mode.

In [166]:
df["Clock"].fillna(df["Clock"].mode()[0], inplace=True)
print(df['Clock'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Clock
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Cup_Holders

In [167]:
print(df['Cup_Holders'].value_counts())
print()
print('Cup_Holders nulls', df['Cup_Holders'].isnull().sum())

Cup_Holders
Front & Rear                  583
Front                         459
Yes                           111
Not on offer                   51
Centre                         14
Front, Front & Rear             4
Front & Rear, Centre            2
Centre, Front & Rear            1
Not on offer, Front & Rear      1
Name: count, dtype: int64

Cup_Holders nulls 41


There are numerous modern cars, especially budget ones that forego cupholders. Therefore, we will replace nulls with 'No'.

In [168]:
df['Cup_Holders'] = df['Cup_Holders'].str.replace('Front, Front & Rear','Front & Rear')
df['Cup_Holders'] = df['Cup_Holders'].str.replace('Centre, Front & Rear','Front & Rear, Centre')

df["Cup_Holders"].fillna('No', inplace=True)

print('Value counts' ,df['Cup_Holders'].value_counts())
print()
print('Nulls:',df['Cup_Holders'].isnull().sum())
print()
print(df['Cup_Holders'].info())

Value counts Cup_Holders
Front & Rear                  587
Front                         459
Yes                           111
Not on offer                   51
No                             41
Centre                         14
Front & Rear, Centre            3
Not on offer, Front & Rear      1
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Cup_Holders
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Door_Pockets

In [169]:
print(df['Door_Pockets'].value_counts())
print()
print('Door_Pockets nulls', df['Door_Pockets'].isnull().sum())

Door_Pockets
Front & Rear           689
Front                  309
Yes                    144
Front, Front & Rear      8
Name: count, dtype: int64

Door_Pockets nulls 117


Cars may not have door pockets, hence the nulls will be replaced by No

In [170]:
df['Door_Pockets'] = df['Door_Pockets'].str.replace('Front, Front & Rear','Front & Rear')
df["Door_Pockets"].fillna('No', inplace=True)

print('Value counts' ,df['Door_Pockets'].value_counts())
print()
print('Nulls:',df['Door_Pockets'].isnull().sum())
print()
print(df['Door_Pockets'].info())

Value counts Door_Pockets
Front & Rear    697
Front           309
Yes             144
No              117
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Door_Pockets
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Extended_Warranty

In [171]:
print(df['Extended_Warranty'].value_counts())
print()
print('Extended_Warranty Nulls', df['Extended_Warranty'].isnull().sum())

Extended_Warranty
3rd/4th year/up to 80,000 Kms (whichever comes first)                                                                                         110
2 years /100000 Kms (years/distance whichever comes first)                                                                                     40
4th year /with (absolutely no mileage restriction)                                                                                             39
1 years /20000 Kms (years/distance whichever comes first)                                                                                      35
2 Years/Unlimited Kms (Whichever comes earlier)                                                                                                31
Up to 5 Year /                                                                                                                                 30
3 Years/Unlimited km Warranty*                                                                            

Due to the highly varied nature of extended warranties and no landslace majority, we will fill the nulls with empty strings.

In [172]:
df["Extended_Warranty"].fillna('', inplace=True)
print(df['Extended_Warranty'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Extended_Warranty
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Fuel-lid_Opener

In [173]:
print(df['Fuel-lid_Opener'].value_counts())
print()
print('Fuel-lid_Opener Nulls', df['Fuel-lid_Opener'].isnull().sum())

Fuel-lid_Opener
Internal                               915
Internal, With Remote                   92
With Remote                             77
Manual                                  62
With Remote, Internal                   51
Push to Open                            27
Internal, Manual                        15
With Remote, Manual                      7
Manual, Internal                         3
With Remote, Push to Open, Internal      2
Push to Open, Internal                   1
With Remote, Push to Open                1
Name: count, dtype: int64

Fuel-lid_Opener Nulls 14


In [174]:
df['Fuel-lid_Opener'] = df['Fuel-lid_Opener'].str.replace('With Remote, Internal','Internal, With Remote')
df['Fuel-lid_Opener'] = df['Fuel-lid_Opener'].str.replace('Manual, Internal','Internal, Manual')
df['Fuel-lid_Opener'] = df['Fuel-lid_Opener'].str.replace('Manual, Internal','Internal, Manual')

df["Fuel-lid_Opener"].fillna(df["Fuel-lid_Opener"].mode()[0], inplace=True)

print('Value counts' ,df['Fuel-lid_Opener'].value_counts())
print()
print('Nulls:',df['Fuel-lid_Opener'].isnull().sum())
print()
print(df['Fuel-lid_Opener'].info())

Value counts Fuel-lid_Opener
Internal                               929
Internal, With Remote                  143
With Remote                             77
Manual                                  62
Push to Open                            27
Internal, Manual                        18
With Remote, Manual                      7
With Remote, Push to Open, Internal      2
Push to Open, Internal                   1
With Remote, Push to Open                1
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Fuel-lid_Opener
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Fuel_Gauge

In [175]:
print(df['Fuel_Gauge'].value_counts())
print()
print('Fuel_Gauge Nulls', df['Fuel_Gauge'].isnull().sum())

Fuel_Gauge
Digital            701
Analog             443
Yes                 81
Digital, Analog      3
Analog, Digital      1
Name: count, dtype: int64

Fuel_Gauge Nulls 38


In [176]:
df['Fuel_Gauge'] = df['Fuel_Gauge'].str.replace('Analog, Digital','Digital, Analog')

df["Fuel_Gauge"].fillna(df["Fuel_Gauge"].mode()[0], inplace=True)

print('Value counts' ,df['Fuel_Gauge'].value_counts())
print()
print('Nulls:',df['Fuel_Gauge'].isnull().sum())
print()
print(df['Fuel_Gauge'].info())

Value counts Fuel_Gauge
Digital            739
Analog             443
Yes                 81
Digital, Analog      4
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Fuel_Gauge
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Handbrake

In [177]:
print(df['Handbrake'].value_counts())
print()
print('Handbrake Nulls', df['Handbrake'].isnull().sum())

Handbrake
Manual       904
Automatic    306
Name: count, dtype: int64

Handbrake Nulls 57


In [178]:
df["Handbrake"].fillna(df["Handbrake"].mode()[0], inplace=True)
print(df['Handbrake'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Handbrake
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Instrument_Console

In [179]:
print(df['Instrument_Console'].value_counts())
print()
print('Instrument_Console Nulls', df['Instrument_Console'].isnull().sum())

Instrument_Console
Analog + Digital    1010
Analog               167
Digital               76
Name: count, dtype: int64

Instrument_Console Nulls 14


In [180]:
df["Instrument_Console"].fillna(df["Instrument_Console"].mode()[0], inplace=True)
print(df['Instrument_Console'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Instrument_Console
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Minimum_Turning_Radius

In [181]:
print('Minimum_Turning_Radius Information')
print('Nulls:', df['Minimum_Turning_Radius'].isnull().sum())
print('Data Type:', df['Minimum_Turning_Radius'].dtype)


Minimum_Turning_Radius Information
Nulls: 381
Data Type: object


In [182]:
df['Minimum_Turning_Radius'] = df['Minimum_Turning_Radius'].str.replace('meter','')
df['Minimum_Turning_Radius'] = df['Minimum_Turning_Radius'].astype(float)
print('Minimum_Turning_Radius Data Type:', df['Minimum_Turning_Radius'].dtype)

Minimum_Turning_Radius Data Type: float64


In [183]:
print('Minimum_Turning_Radius Mean', df['Minimum_Turning_Radius'].mean())
print('Minimum_Turning_Radius Median', df['Minimum_Turning_Radius'].median())
print('Minimum_Turning_Radius Mode', df['Minimum_Turning_Radius'].mode())

Minimum_Turning_Radius Mean 13.217279909706546
Minimum_Turning_Radius Median 5.25
Minimum_Turning_Radius Mode 0    5.2
Name: Minimum_Turning_Radius, dtype: float64


In [184]:
df["Minimum_Turning_Radius"].fillna(df["Minimum_Turning_Radius"].median(), inplace=True)
print(df['Minimum_Turning_Radius'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Minimum_Turning_Radius
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Sun_Visor

In [185]:
print(df['Sun_Visor'].value_counts())
print()
print('Sun_Visor Nulls', df['Sun_Visor'].isnull().sum())

Sun_Visor
Driver & Front Passenger    1163
Driver Only                   27
Co-Driver Only                15
Name: count, dtype: int64

Sun_Visor Nulls 62


In [186]:
df["Sun_Visor"].fillna(df["Sun_Visor"].mode()[0], inplace=True)
print(df['Sun_Visor'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Sun_Visor
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Third_Row_AC_Vents

In [187]:
print(df['Third_Row_AC_Vents'].value_counts())
print()
print('Third_Row_AC_Vents Nulls', df['Third_Row_AC_Vents'].isnull().sum())

Third_Row_AC_Vents
Not Applicable    787
Yes               114
Name: count, dtype: int64

Third_Row_AC_Vents Nulls 366


In [188]:
df["Third_Row_AC_Vents"].fillna(df["Third_Row_AC_Vents"].mode()[0], inplace=True)
print(df['Third_Row_AC_Vents'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Third_Row_AC_Vents
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Ventilation_System

In [189]:
print(df['Ventilation_System'].value_counts())
print()
print('Ventilation_System Nulls', df['Ventilation_System'].isnull().sum())

Ventilation_System
Fully automatic climate control                                                      478
Manual Air conditioning with cooling and heating                                     447
2 Zone Climate Control                                                               148
4 Zone climate control                                                                59
3 Zone climate control                                                                35
Air Conditioning with cooling only                                                    22
Fully automatic climate control, 2 Zone Climate Control                               10
Yes                                                                                    8
Heater, Manual Air conditioning with cooling and heating                               6
Fully automatic climate control, 4 Zone climate control                                6
Heater                                                                                 4
He

In [190]:
df["Ventilation_System"].fillna(df["Ventilation_System"].mode()[0], inplace=True)
print(df['Ventilation_System'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Ventilation_System
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Auto-Dimming_Rear-View_Mirror

In [191]:
print(df['Drive_Modes'].value_counts())
print()
print('Drive_Modes Nulls', df['Drive_Modes'].isnull().sum())

Drive_Modes
Normal                                     92
Normal, Comfort, Eco, Sport                34
Sport                                      18
Normal, Eco                                11
Normal, Eco, Sport                          9
Normal, Comfort, Sport                      6
Normal, Sport                               6
Comfort, Eco, Sport                         4
Comfort, Eco, Sport, Normal                 4
Eco, Sport, Normal, Comfort                 3
Eco, Sport                                  2
Normal, Comfort, Eco, Sport, Power Mode     2
Eco                                         1
Eco, Sport, Normal                          1
Sport, Normal, Eco                          1
Sport, Normal, Comfort                      1
Name: count, dtype: int64

Drive_Modes Nulls 1072


In [192]:
df['Drive_Modes'] = df['Drive_Modes'].str.replace('Comfort, Eco, Sport, Normal','Normal, Comfort, Eco, Sport')
df['Drive_Modes'] = df['Drive_Modes'].str.replace('Eco, Sport, Normal, Comfort','Normal, Comfort, Eco, Sport')
df['Drive_Modes'] = df['Drive_Modes'].str.replace('Sport, Normal, Eco','Normal, Eco, Sport')
df['Drive_Modes'] = df['Drive_Modes'].str.replace('Eco, Sport, Normal','Normal, Eco, Sport')
df['Drive_Modes'] = df['Drive_Modes'].str.replace('Sport, Normal, Comfort','Normal, Comfort, Sport')

df["Drive_Modes"].fillna(df["Drive_Modes"].mode()[0], inplace=True)

print('Value counts' ,df['Drive_Modes'].value_counts())
print()
print('Nulls:',df['Drive_Modes'].isnull().sum())
print()
print(df['Drive_Modes'].info())

Value counts Drive_Modes
Normal                                     1164
Normal, Comfort, Eco, Sport                  41
Sport                                        18
Normal, Eco, Sport                           11
Normal, Eco                                  11
Normal, Comfort, Sport                        7
Normal, Sport                                 6
Comfort, Eco, Sport                           4
Eco, Sport                                    2
Normal, Comfort, Eco, Sport, Power Mode       2
Eco                                           1
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Drive_Modes
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Headlight_Reminder

In [193]:
print(df['Headlight_Reminder'].value_counts())
print()
print('Headlight_Reminder Nulls', df['Headlight_Reminder'].isnull().sum())

Headlight_Reminder
Yes          1006
Automatic       4
Name: count, dtype: int64

Headlight_Reminder Nulls 257


Headlight reminders are, in essence, automatic. Therefore, the 'Automatic' values will be replaced by 'Yes' and Nulls will be replaced by 'No'.

In [194]:
df['Headlight_Reminder'] = df['Headlight_Reminder'].str.replace('Automatic','Yes')
df["Headlight_Reminder"].fillna('No', inplace=True)

print('Value counts' ,df['Headlight_Reminder'].value_counts())
print()
print('Nulls:',df['Headlight_Reminder'].isnull().sum())
print()
print(df['Headlight_Reminder'].info())

Value counts Headlight_Reminder
Yes    1010
No      257
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Headlight_Reminder
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Adjustable_Headrests

In [195]:
print(df['Adjustable_Headrests'].value_counts())
print()
print('Adjustable_Headrests Nulls', df['Adjustable_Headrests'].isnull().sum())

Adjustable_Headrests
All Rows                 693
Front Row                198
Yes                       83
Second Row                36
Front Row, Second Row      6
Second Row, Front Row      1
Name: count, dtype: int64

Adjustable_Headrests Nulls 250


Since all cars, especially low budget ones may not have adjustable headrests, we will replace the nulls with 'No'.

In [196]:
df['Adjustable_Headrests'] = df['Adjustable_Headrests'].str.replace('Second Row, Front Row','Front Row, Second Row')

df["Adjustable_Headrests"].fillna('No', inplace=True)

print('Value counts' ,df['Adjustable_Headrests'].value_counts())
print()
print('Nulls:',df['Adjustable_Headrests'].isnull().sum())
print()
print(df['Adjustable_Headrests'].info())

Value counts Adjustable_Headrests
All Rows                 693
No                       250
Front Row                198
Yes                       83
Second Row                36
Front Row, Second Row      7
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Adjustable_Headrests
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Gross_Vehicle_Weight

In [197]:
print('Gross_Vehicle_Weight')
print('Nulls:', df['Gross_Vehicle_Weight'].isnull().sum())
print('Data Type:', df['Gross_Vehicle_Weight'].dtype)


Gross_Vehicle_Weight
Nulls: 595
Data Type: object


In [198]:
df['Gross_Vehicle_Weight'] = df['Gross_Vehicle_Weight'].str.replace('kg','')
df['Gross_Vehicle_Weight'].replace('NA ', pd.NA, inplace=True)

df['Gross_Vehicle_Weight'] = pd.to_numeric(df['Gross_Vehicle_Weight'], errors='coerce')

print('Gross_Vehicle_Weight Data Type:', df['Gross_Vehicle_Weight'].dtype)

Gross_Vehicle_Weight Data Type: float64


In [199]:
print('Gross_Vehicle_Weight Mean', df['Gross_Vehicle_Weight'].mean())
print('Gross_Vehicle_Weight Median', df['Gross_Vehicle_Weight'].median())
print('Gross_Vehicle_Weight Mode', df['Gross_Vehicle_Weight'].mode())

Gross_Vehicle_Weight Mean 1897.8432835820895
Gross_Vehicle_Weight Median 1750.0
Gross_Vehicle_Weight Mode 0    1170.0
Name: Gross_Vehicle_Weight, dtype: float64


In [200]:
df["Gross_Vehicle_Weight"].fillna(df["Gross_Vehicle_Weight"].median(), inplace=True)
print(df['Gross_Vehicle_Weight'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Gross_Vehicle_Weight
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Airbags

In [201]:
print(df['Airbags'].value_counts())
print()
print('Airbags Nulls', df['Airbags'].isnull().sum())

Airbags
Driver frontal airbag, Front passenger frontal airbag                                                                                                                                                                                                          575
Driver frontal airbag                                                                                                                                                                                                                                           61
Front passenger frontal airbag, Driver frontal airbag                                                                                                                                                                                                           39
Curtain airbags, Driver frontal airbag, Front passenger frontal airbag, Front passenger side airbag, Drive side airbag                                                                                                 

While airbags have been made compulsory as of Oct 2023, many cars in India have lacked the airbag feature. Therefore, the nulls will be replaced by 'No'.

In [202]:
df["Airbags"].fillna('No', inplace=True)

print('Nulls:',df['Airbags'].isnull().sum())
print()
print(df['Airbags'].info())

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Airbags
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Number_of_Airbags

In [203]:
print(df['Number_of_Airbags'].value_counts())
print()
print('Number_of_Airbags Nulls', df['Number_of_Airbags'].isnull().sum())

Number_of_Airbags
2.0     623
6.0     197
8.0      92
7.0      65
1.0      57
4.0      40
9.0      21
3.0      17
10.0     16
14.0      3
5.0       1
Name: count, dtype: int64

Number_of_Airbags Nulls 135


While airbags have been made compulsory as of Oct 2023, many cars in India have lacked the airbag feature. Therefore, the nulls will be replaced by 0.

In [204]:
df["Number_of_Airbags"].fillna(0, inplace=True)

print('Nulls:',df['Number_of_Airbags'].isnull().sum())
print()
print(df['Number_of_Airbags'].info())

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Number_of_Airbags
Non-Null Count  Dtype  
--------------  -----  
1267 non-null   float64
dtypes: float64(1)
memory usage: 19.8 KB
None


## Compression_Ratio

In [205]:
print(df['Compression_Ratio'].value_counts())
print()
print('Compression_Ratio Nulls', df['Compression_Ratio'].isnull().sum())

Compression_Ratio
16.5:1     35
11.0:1     33
10.5:1     27
16.7:1     21
17.6:1     20
           ..
9.3:1       1
11:25:1     1
9.7:1       1
15.8        1
18.5        1
Name: count, Length: 66, dtype: int64

Compression_Ratio Nulls 917


Due to the highly varied nature of Compression_Ratios and no large majority, we will fill the nulls with empty strings.

In [206]:
df["Compression_Ratio"].fillna('', inplace=True)
print(df['Compression_Ratio'].info())

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Compression_Ratio
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Adjustable_Steering_Column

In [207]:
print(df['Adjustable_Steering_Column'].value_counts())
print()
print('Adjustable_Steering_Column Nulls', df['Adjustable_Steering_Column'].isnull().sum())

Adjustable_Steering_Column
Rake, Reach    538
Rake           477
Reach, Rake     45
Yes             16
Name: count, dtype: int64

Adjustable_Steering_Column Nulls 191


In [208]:
df['Adjustable_Steering_Column'] = df['Adjustable_Steering_Column'].str.replace('Reach, Rake','Rake, Reach')

df["Adjustable_Steering_Column"].fillna('No', inplace=True)

print(df['Adjustable_Steering_Column'].value_counts())
print()
print('Nulls:',df['Adjustable_Steering_Column'].isnull().sum())
print()
print(df['Adjustable_Steering_Column'].info())

Adjustable_Steering_Column
Rake, Reach    583
Rake           477
No             191
Yes             16
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Adjustable_Steering_Column
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Other_Specs

In [209]:
print(df['Other_Specs'].value_counts())
print()
print('Other_Specs Nulls', df['Other_Specs'].isnull().sum())

Other_Specs
2-Tone Beige & Black Interior Key Colour Blue Interior Illumination Front & Rear Door Mapping Pockets Front Room Lamp                                                                                                                                         4
tubeless tyre                                                                                                                                                                                                                                                 3
SportShift Selector – Jet Leather, 3-spoke leather steering wheel with Ignis paddles, Dark Hex Aluminium Centre Console, Sports Seats, Instrument Cluster - Twin dials with 5" colour TFT display, Configurable Ambient Interior Lighting – with selectabl    2
Jaguar Sequential ShiftTM, Active Sports Exhaust, Limited Slip Differential, Dynamic Stability Control (with Trac DSC mode), Jaguar High Performance Braking System with Brake Disc diameter, Front: 380mm, Rear: 325mm with

There are only 10 cells with values filled in this column while the rest (1257) are empty. In this case, it makes sense to delete this column since the cell values are descriptive and not exactly categorical in nature.

In [210]:
del df['Other_Specs']

## Other_specs

In [211]:
print(df['Other_specs'].value_counts())
print()
print('Other_specs Nulls', df['Other_specs'].isnull().sum())

Other_specs
Air Conditioner : Standard                                                                                                                                                                                                                                   46
Chrome Radiator Grille Wraparound Clear Lens Headlamps & Tail Lamps, Bumpers Tubeless Tyres                                                                                                                                                                   4
Electric Power Steering Manual AC With Heater Tinted Glass Front Power Outlet Front Power Windows Internally Adjustable ORVM Front Washer & Wiper                                                                                                             3
Internally Adjustable ORVM Semi Drive Computer Adjustable Rear Headrest Assist Grip 12 V Power Outlet                                                                                                                       

Once again, this column is more than 90% null and the values offered are descriptive with no usecases in our case. Therefore, this column shall also be deleted.

In [212]:
del df['Other_specs']

## Parking_Assistance

In [213]:
print(df['Parking_Assistance'].value_counts())
print()
print('Parking_Assistance Nulls', df['Parking_Assistance'].isnull().sum())

Parking_Assistance
Rear sensors with camera                                                         360
Rear sensors                                                                     276
Front and rear sensors with camera                                               181
Front & rear sensors with 360 degree view                                        120
Rear sensors, Rear sensors with camera                                            17
Yes                                                                               15
Front sensors, Rear sensors                                                        5
Front and rear sensors with camera, Front & rear sensors with 360 degree view      2
Rear sensors with camera, Rear sensors                                             1
Rear sensors, Front sensors                                                        1
Name: count, dtype: int64

Parking_Assistance Nulls 289


In [214]:
df['Parking_Assistance'] = df['Parking_Assistance'].str.replace('Rear sensors, Rear sensors with camera','Rear sensors with camera')
df['Parking_Assistance'] = df['Parking_Assistance'].str.replace('Rear sensors with camera, Rear sensors','Rear sensors with camera')
df['Parking_Assistance'] = df['Parking_Assistance'].str.replace('Front sensors, Rear sensors','Rear sensors, Front sensors')


df["Parking_Assistance"].fillna('No', inplace=True)

print(df['Parking_Assistance'].value_counts())
print()
print('Nulls:',df['Parking_Assistance'].isnull().sum())
print()
print(df['Parking_Assistance'].info())

Parking_Assistance
Rear sensors with camera                                                         378
No                                                                               289
Rear sensors                                                                     276
Front and rear sensors with camera                                               181
Front & rear sensors with 360 degree view                                        120
Yes                                                                               15
Rear sensors, Front sensors                                                        6
Front and rear sensors with camera, Front & rear sensors with 360 degree view      2
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Parking_Assistance
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Infotainment_Screen

In [215]:
print(df['Infotainment_Screen'].value_counts())
print()
print('Infotainment_Screen Nulls', df['Infotainment_Screen'].isnull().sum())

Infotainment_Screen
Touch Sensitive    728
Yes                 77
Name: count, dtype: int64

Infotainment_Screen Nulls 462


In [216]:
df["Infotainment_Screen"].fillna('No', inplace=True)

print(df['Infotainment_Screen'].value_counts())
print()
print('Nulls:',df['Infotainment_Screen'].isnull().sum())
print()
print(df['Infotainment_Screen'].info())

Infotainment_Screen
Touch Sensitive    728
No                 462
Yes                 77
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Infotainment_Screen
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Multifunction_Steering_Wheel

In [217]:
print(df['Multifunction_Steering_Wheel'].value_counts())
print()
print('Multifunction_Steering_Wheel Nulls', df['Multifunction_Steering_Wheel'].isnull().sum())

Multifunction_Steering_Wheel
Multifunction Steering Wheel                             715
Yes                                                      113
Multifunction Steering Wheel, With gear shift paddles     74
With gear shift paddles, Multifunction Steering Wheel      9
With gear shift paddles                                    5
Name: count, dtype: int64

Multifunction_Steering_Wheel Nulls 351


In [218]:
df['Multifunction_Steering_Wheel'] = df['Multifunction_Steering_Wheel'].str.replace('Multifunction Steering Wheel, With gear shift paddles', 
                                                                                    'With gear shift paddles')
df['Multifunction_Steering_Wheel'] = df['Multifunction_Steering_Wheel'].str.replace('With gear shift paddles, Multifunction Steering Wheel',
                                                                                   'With gear shift paddles')

df["Multifunction_Steering_Wheel"].fillna('No', inplace=True)

print(df['Multifunction_Steering_Wheel'].value_counts())
print()
print('Nulls:',df['Multifunction_Steering_Wheel'].isnull().sum())
print()
print(df['Multifunction_Steering_Wheel'].info())

Multifunction_Steering_Wheel
Multifunction Steering Wheel    715
No                              351
Yes                             113
With gear shift paddles          88
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Multifunction_Steering_Wheel
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Seat_Height_Adjustment

In [219]:
print(df['Seat_Height_Adjustment'].value_counts())
print()
print('Seat_Height_Adjustment Nulls', df['Seat_Height_Adjustment'].isnull().sum())

Seat_Height_Adjustment
Manual Adjustment                  529
Electric Adjustment with Memory    239
Electric Adjustment                159
Semi Automatic Adjustment           14
Name: count, dtype: int64

Seat_Height_Adjustment Nulls 326


In [220]:
df["Seat_Height_Adjustment"].fillna('No', inplace=True)

print(df['Seat_Height_Adjustment'].value_counts())
print()
print('Nulls:',df['Seat_Height_Adjustment'].isnull().sum())
print()
print(df['Seat_Height_Adjustment'].info())

Seat_Height_Adjustment
Manual Adjustment                  529
No                                 326
Electric Adjustment with Memory    239
Electric Adjustment                159
Semi Automatic Adjustment           14
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Seat_Height_Adjustment
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Rear_Center_Armrest

In [221]:
print(df['Rear_Center_Armrest'].value_counts())
print()
print('Rear_Center_Armrest Nulls', df['Rear_Center_Armrest'].isnull().sum())

Rear_Center_Armrest
Cup Holders                    489
Yes                            193
Audio Controls                  13
Audio Controls, Cup Holders     11
Cup Holders, Audio Controls      3
Name: count, dtype: int64

Rear_Center_Armrest Nulls 558


In [222]:
df['Rear_Center_Armrest'] = df['Rear_Center_Armrest'].str.replace('Cup Holders, Audio Controls', 'Audio Controls, Cup Holders')

df["Rear_Center_Armrest"].fillna('No', inplace=True)

print(df['Rear_Center_Armrest'].value_counts())
print()
print('Nulls:',df['Rear_Center_Armrest'].isnull().sum())
print()
print(df['Rear_Center_Armrest'].info())

Rear_Center_Armrest
No                             558
Cup Holders                    489
Yes                            193
Audio Controls, Cup Holders     14
Audio Controls                  13
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Rear_Center_Armrest
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Recommended_Tyre_Pressure

In [223]:
print(df['Recommended_Tyre_Pressure'].value_counts())
print()
print('Recommended_Tyre_Pressure Nulls', df['Recommended_Tyre_Pressure'].isnull().sum())

Recommended_Tyre_Pressure
2.4 PSI          9
175/60R15 psi    1
Name: count, dtype: int64

Recommended_Tyre_Pressure Nulls 1257


Since 99% of the column is null, we will delete it.

In [224]:
del df['Recommended_Tyre_Pressure']

## Heated_Seats

In [225]:
print(df['Heated_Seats'].value_counts())
print()
print('Heated_Seats Nulls', df['Heated_Seats'].isnull().sum())

Heated_Seats
Only front    104
Yes            86
All            73
Driver          3
Name: count, dtype: int64

Heated_Seats Nulls 1001


In [226]:
df["Heated_Seats"].fillna('No', inplace=True)

print(df['Heated_Seats'].value_counts())
print()
print('Nulls:',df['Heated_Seats'].isnull().sum())
print()
print(df['Heated_Seats'].info())

Heated_Seats
No            1001
Only front     104
Yes             86
All             73
Driver           3
Name: count, dtype: int64

Nulls: 0

<class 'pandas.core.series.Series'>
Index: 1267 entries, 0 to 1275
Series name: Heated_Seats
Non-Null Count  Dtype 
--------------  ----- 
1267 non-null   object
dtypes: object(1)
memory usage: 19.8+ KB
None


## Engine_Type

In [227]:
print(df['Engine_Type'].value_counts())
print()
print('Engine_Type Nulls', df['Engine_Type'].isnull().sum())

Engine_Type
Revotron 1.2 L, BS6 Engine                                                                                                                           6
1.2 L Revotron (BS VI)                                                                                                                               5
1.5L Turbocharged Revotorq (BS VI)                                                                                                                   5
CRDi                                                                                                                                                 5
M1 Category                                                                                                                                          3
2.0 Diesel                                                                                                                                           2
D180                                                                              

Since 97% of the column is null, we will delete it.

In [228]:
del df['Engine_Type']

## Battery

In [229]:
print(df['Battery'].value_counts())
print()
print('Battery Nulls', df['Battery'].isnull().sum())

Battery
21.5 kWh,Battery Placed Under Rear Seats                                                 3
200 ampere-hour                                                                          3
30.2 kWh* Advanced Li-ion polymer, liquid cooled battery pack with IP67 certification    3
210 ampere-hour                                                                          2
44.5 (kWh), Lithium Ion                                                                  2
Name: count, dtype: int64

Battery Nulls 1254


Since 98% of the column is null, we will delete it.

In [230]:
del df['Battery']

## Electric_Range

In [231]:
print(df['Electric_Range'].value_counts())
print()
print('Electric_Range Nulls', df['Electric_Range'].isnull().sum())

Electric_Range
110 km/full charge      5
213 km/full charge      3
340 km/full charge      2
11.68 km/full charge    2
312 km/full charge      2
39.53 km/full charge    1
300 km/full charge      1
462 km/full charge      1
Name: count, dtype: int64

Electric_Range Nulls 1250


In [232]:
df[~df['Electric_Range'].isna()].head()

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Paddle_Shifters,Leather_Wrapped_Steering,Automatic_Headlamps,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,Electric_Range,carname
319,Bmw,7-Series,745Le Xdrive,16500000,2998.0,6.0,4.0,AWD (All Wheel Drive),V,BS 6,...,Yes,Yes,Yes,Yes,Yes,3.0,Yes,Yes,39.53 km/full charge,Bmw-7-Series
615,Mahindra,E2O Plus,P4,881425,1497.0,4.0,4.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,110 km/full charge,Mahindra-E2O Plus
616,Mahindra,E2O Plus,P6,957177,1497.0,4.0,4.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,110 km/full charge,Mahindra-E2O Plus
617,Tata,Tigor Ev,Xm+,960868,1497.0,4.0,4.0,FWD (Front Wheel Drive),In-line,BS VI,...,No,No,No,No,No,No,No,No,213 km/full charge,Tata-Tigor Ev
618,Tata,Tigor Ev,Xt+,975868,1497.0,4.0,4.0,FWD (Front Wheel Drive),In-line,BS VI,...,No,No,No,No,No,No,No,No,213 km/full charge,Tata-Tigor Ev


The Electric_Range of all electric/hybrid cars is not covered in the column that contains only 17 values as compared to the 29 entries of electric and hybrid cars. Furthermore, 2 of the entries in the electric_range column are for petrol cars. 

Since 98% of the column is null and only about 50% of electric/hybrid cars are covered, we may not be able to draw conclusive results from this column. 

Therefore, it shall be deleted.

In [233]:
del df['Electric_Range']

# Exporting the cleaned dataset

In [234]:
df.head()

Unnamed: 0,Make,Model,Variant,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Drivetrain,Cylinder_Configuration,Emission_Norm,...,Rain_Sensing_Wipers,Paddle_Shifters,Leather_Wrapped_Steering,Automatic_Headlamps,ASR_/_Traction_Control,Cruise_Control,USB_Ports,Heads-Up_Display,Welcome_Lights,carname
0,Tata,Nano Genx,Xt,292667,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,No,Tata-Nano Genx
1,Tata,Nano Genx,Xe,236447,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,No,Tata-Nano Genx
2,Tata,Nano Genx,Emax Xm,296661,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,No,Tata-Nano Genx
3,Tata,Nano Genx,Xta,334768,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,No,Tata-Nano Genx
4,Tata,Nano Genx,Xm,272223,624.0,2.0,2.0,RWD (Rear Wheel Drive),In-line,BS IV,...,No,No,No,No,No,No,No,No,No,Tata-Nano Genx


In [235]:
df.shape

(1267, 134)

In [236]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1267 entries, 0 to 1275
Data columns (total 134 columns):
 #    Column                                     Non-Null Count  Dtype  
---   ------                                     --------------  -----  
 0    Make                                       1267 non-null   object 
 1    Model                                      1267 non-null   object 
 2    Variant                                    1267 non-null   object 
 3    Ex-Showroom_Price                          1267 non-null   int32  
 4    Displacement                               1267 non-null   float64
 5    Cylinders                                  1267 non-null   float64
 6    Valves_Per_Cylinder                        1267 non-null   float64
 7    Drivetrain                                 1267 non-null   object 
 8    Cylinder_Configuration                     1267 non-null   object 
 9    Emission_Norm                              1267 non-null   object 
 10   Engine_Location

In [238]:
df.to_csv('indian_cars_cleaned.csv', index=False)