# 1. Load & Inspect Dataset

In [5]:
import pandas as pd

# loading the file
dataset = pd.read_csv("data.csv")

# checking size of data
print("Total rows:", dataset.shape[0])
print("Total columns:", dataset.shape[1])

print("\n------------------ Missing Values ------------")
print(dataset.isnull().sum())

print("\n------------------ Data Summary --------------")
print(dataset.describe())

Total rows: 11914
Total columns: 16

------------------ Missing Values ------------
Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

------------------ Data Summary --------------
               Year    Engine HP  Engine Cylinders  Number of Doors  \
count  11914.000000  11845.00000      11884.000000     11908.000000   
mean    2010.384338    249.38607          5.628829         3.436093   
std        7.579740    109.19187          1.780559         0.881315   
min     1990.000000     55.00000          0.000000         2.000000   
25%     2007.000000    170.00000          4.000000         2.000000   
50%     2015.

# 2 Handle Missing Data

In [7]:
# numeric columns alag karne ke liye
num_cols = dataset.select_dtypes(include=['number']).columns
print("\nNumeric columns found:", num_cols)

print("\nChecking Null Percentage:")
for c in num_cols:
    # calculation ko thoda simple aur break karke likha hai
    null_count = dataset[c].isnull().sum()
    total_rows = len(dataset)
    percent = (null_count / total_rows) * 100
    print(c, ":", round(percent, 2), "%")

print("\n--- Handling Null Values ---\n")
for c in num_cols:
    # Agar 1.5% se kam missing hai toh mean se bhar do
    p = (dataset[c].isnull().sum() / len(dataset)) * 100
    
    if p < 1.5:
        m = dataset[c].mean()
        dataset[c].fillna(m, inplace=True)
        print("Filled", c, "with mean:", round(m, 2))

# check results
print("\nFinal Null Count:")
print(dataset.isnull().sum())


Numeric columns found: Index(['Year', 'Engine HP', 'Engine Cylinders', 'Number of Doors',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

Checking Null Percentage:
Year : 0.0 %
Engine HP : 0.0 %
Engine Cylinders : 0.0 %
Number of Doors : 0.0 %
highway MPG : 0.0 %
city mpg : 0.0 %
Popularity : 0.0 %
MSRP : 0.0 %

--- Handling Null Values ---

Filled Year with mean: 2010.38
Filled Engine HP with mean: 249.39
Filled Engine Cylinders with mean: 5.63
Filled Number of Doors with mean: 3.44
Filled highway MPG with mean: 26.64
Filled city mpg with mean: 19.73
Filled Popularity with mean: 1554.91
Filled MSRP with mean: 40594.74

Final Null Count:
Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP               0
Engine Cylinders        0
Transmission Type       0
Driven_Wheels           0
Number of Doors         0
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway M

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[c].fillna(m, inplace=True)


In [None]:
# categorical columns check karne ke liye
cat_cols = dataset.select_dtypes(include=['object']).columns
print("\nCategorical columns list:", cat_cols)

print("\n--- Missing percentage ---")
for c in cat_cols:
    count = dataset[c].isnull().sum()
    # len(dataset) use karna zyada natural hai
    p = (count / len(dataset)) * 100
    print(c, "missing:", round(p, 2), "%")

print("\n--- Cleaning Categorical Data ---")
for c in cat_cols:
    miss_p = (dataset[c].isnull().sum() / len(dataset)) * 100
    
    if miss_p > 1.5:
        # mode se fill karna
        val_mode = dataset[c].mode()[0]
        dataset[c].fillna(val_mode, inplace=True)
        print("Filled", c, "with mode:", val_mode)
        
    elif miss_p > 0:
        # kam missing values hain toh delete kar diya
        dataset.dropna(subset=[c], inplace=True)
        print("Dropped rows for column:", c)

# final check
print("\nChecking nulls again:")
print(dataset.isnull().sum())

[92m 
 Categorical columns 
[0m Index(['Make', 'Model', 'Engine Fuel Type', 'Transmission Type',
       'Driven_Wheels', 'Market Category', 'Vehicle Size', 'Vehicle Style'],
      dtype='object')
[92m null  value in % [0m
Make: 0.00%
Model: 0.00%
Engine Fuel Type: 0.03%
Transmission Type: 0.00%
Driven_Wheels: 0.00%
Market Category: 31.41%
Vehicle Size: 0.00%
Vehicle Style: 0.00%
[92m 
 Null value Handel [0m
Dropped rows with null values in column 'Engine Fuel Type'
Filled null values in column 'Market Category' with mode value 'Crossover'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(mode_value,inplace=True)   # fill null value with mode


Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Market Category      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64

# Remove Duplicates

In [8]:
# checking rows and columns
print("Rows:", dataset.shape[0])
print("Columns:", dataset.shape[1])
print("\nMissing values in each column:")
print(dataset.isnull().sum())

# duplicate rows check
dup_count = dataset.duplicated().sum()
total = len(dataset)
percent = (dup_count / total) * 100

print("\nDuplicate rows percentage:", round(percent, 2), "%")
print("Total duplicate rows found:", dup_count, "out of", total)

# removing duplicates
print("Dropping duplicate rows...")
dataset.drop_duplicates(inplace=True)

print("Rows count after cleaning:", dataset.shape[0])

Rows: 11914
Columns: 16

Missing values in each column:
Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP               0
Engine Cylinders        0
Transmission Type       0
Driven_Wheels           0
Number of Doors         0
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

Duplicate rows percentage: 6.0 %
Total duplicate rows found: 715 out of 11914
Dropping duplicate rows...
Rows count after cleaning: 11199


# Fix Data Inconsistencies

In [11]:
# making string values uniform (lowercase)
cat_cols = dataset.select_dtypes(include=['object']).columns
for c in cat_cols:
    dataset[c] = dataset[c].str.lower()
    print("Changed to lowercase for column:", c)

Changed to lowercase for column: Make
Changed to lowercase for column: Model
Changed to lowercase for column: Engine Fuel Type
Changed to lowercase for column: Transmission Type
Changed to lowercase for column: Driven_Wheels
Changed to lowercase for column: Market Category
Changed to lowercase for column: Vehicle Size
Changed to lowercase for column: Vehicle Style


In [12]:
# Cleaning text data in columns
title_cols = ["Make", "Model", "Vehicle Style", "Market Category"]

for col in title_cols:
    # simple way to clean: strip spaces and convert to title case
    dataset[col] = dataset[col].astype(str).str.strip()
    dataset[col] = dataset[col].str.title()

lower_cols = ["Engine Fuel Type", "Transmission Type", "Driven_Wheels", "Vehicle Size"]

for col in lower_cols:
    # simple way to clean: strip spaces and convert to lowercase
    dataset[col] = dataset[col].astype(str).str.strip()
    dataset[col] = dataset[col].str.lower()

# fixing specific car names manually
dataset["Make"] = dataset["Make"].replace("Bmw", "BMW")
dataset["Make"] = dataset["Make"].replace("Gmc", "GMC")
dataset["Make"] = dataset["Make"].replace("Vw", "Volkswagen")
dataset["Make"] = dataset["Make"].replace("Mclaren", "McLaren")

print("Done: String cleaning and standardization finished.")
dataset.info()

Done: String cleaning and standardization finished.
<class 'pandas.core.frame.DataFrame'>
Index: 11199 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11199 non-null  object 
 1   Model              11199 non-null  object 
 2   Year               11199 non-null  int64  
 3   Engine Fuel Type   11199 non-null  object 
 4   Engine HP          11199 non-null  float64
 5   Engine Cylinders   11199 non-null  float64
 6   Transmission Type  11199 non-null  object 
 7   Driven_Wheels      11199 non-null  object 
 8   Number of Doors    11199 non-null  float64
 9   Market Category    11199 non-null  object 
 10  Vehicle Size       11199 non-null  object 
 11  Vehicle Style      11199 non-null  object 
 12  highway MPG        11199 non-null  int64  
 13  city mpg           11199 non-null  int64  
 14  Popularity         11199 non-null  int64  
 15  MSRP               1119

# Outlier Detection & Treatment

In [13]:
# Outlier checking using IQR method
num_cols = dataset.select_dtypes(include=['number']).columns
grand_total_outliers = 0

for c in num_cols:
    # calculating bounds
    q1 = dataset[c].quantile(0.25)
    q3 = dataset[c].quantile(0.75)
    iqr = q3 - q1
    
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    
    # finding outliers for this column
    outliers_data = dataset[(dataset[c] < lower) | (dataset[c] > upper)]
    count = len(outliers_data)
    
    # percentage calculation
    percent = (count / len(dataset)) * 100
    grand_total_outliers += count
    
    print("Column:", c, "| Outliers:", count, "|", round(percent, 2), "%")

# showing final result
total_percent = (grand_total_outliers / len(dataset)) * 100
print("\nTotal outliers found in all columns:", grand_total_outliers)
print("Total Outlier Percentage:", round(total_percent, 2), "%")

Column: Year | Outliers: 467 | 4.17 %
Column: Engine HP | Outliers: 502 | 4.48 %
Column: Engine Cylinders | Outliers: 352 | 3.14 %
Column: Number of Doors | Outliers: 0 | 0.0 %
Column: highway MPG | Outliers: 182 | 1.63 %
Column: city mpg | Outliers: 305 | 2.72 %
Column: Popularity | Outliers: 825 | 7.37 %
Column: MSRP | Outliers: 960 | 8.57 %

Total outliers found in all columns: 3593
Total Outlier Percentage: 32.08 %


In [None]:
# replacing outliers with median
num_cols = dataset.select_dtypes(include=['number']).columns

for c in num_cols:
    # 1. Finding bounds again
    q1 = dataset[c].quantile(0.25)
    q3 = dataset[c].quantile(0.75)
    iqr = q3 - q1
    
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    
    # 2. Calculating median
    med = dataset[c].median()
    
    # 3. Replacing values
    # simple condition use karke replace kiya hai
    dataset.loc[dataset[c] < lower, c] = med
    dataset.loc[dataset[c] > upper, c] = med
    
    print("Replaced outliers in", c, "with median:", med)

print("\nAll outliers have been handled.")

[92m Replaced outliers in column 'Year' with median value 2015.0.[0m
[92m Replaced outliers in column 'Engine HP' with median value 240.0.[0m
[92m Replaced outliers in column 'Engine Cylinders' with median value 6.0.[0m
[92m Replaced outliers in column 'Number of Doors' with median value 4.0.[0m
[92m Replaced outliers in column 'highway MPG' with median value 25.0.[0m
[92m Replaced outliers in column 'city mpg' with median value 18.0.[0m
[92m Replaced outliers in column 'Popularity' with median value 1385.0.[0m
[92m Replaced outliers in column 'MSRP' with median value 30695.0.[0m


# Feature Engineering

In [14]:
import datetime

# 1. Calculating car age
this_year = datetime.datetime.now().year
dataset["car_age"] = this_year - dataset["Year"]

# 2. Creating a weighted MPG score
# highway 60% and city 40% weightage
dataset["fuel_score"] = (dataset["highway MPG"] * 0.6) + (dataset["city mpg"] * 0.4)

# 3. HP per cylinder logic
dataset["hp_per_cylinder"] = dataset["Engine HP"] / dataset["Engine Cylinders"]

# 4. Simple average MPG
dataset["avg_mpg"] = (dataset["highway MPG"] + dataset["city mpg"]) / 2

# 5. checking for luxury cars
# if market category has 'luxury' then 1 else 0
dataset["is_luxury"] = dataset["Market Category"].str.contains("luxury", case=False, na=False).astype(int)

# 6. Hybrid and Electric markers
dataset["hybrid_flag"] = dataset["Engine Fuel Type"].str.contains("hybrid", case=False, na=False).astype(int)
dataset["electric_flag"] = dataset["Engine Fuel Type"].str.contains("electric", case=False, na=False).astype(int)

# checking new columns
print(dataset.head())

  Make       Model  Year             Engine Fuel Type  Engine HP  \
0  BMW  1 Series M  2011  premium unleaded (required)      335.0   
1  BMW    1 Series  2011  premium unleaded (required)      300.0   
2  BMW    1 Series  2011  premium unleaded (required)      300.0   
3  BMW    1 Series  2011  premium unleaded (required)      230.0   
4  BMW    1 Series  2011  premium unleaded (required)      230.0   

   Engine Cylinders Transmission Type     Driven_Wheels  Number of Doors  \
0               6.0            manual  rear wheel drive              2.0   
1               6.0            manual  rear wheel drive              2.0   
2               6.0            manual  rear wheel drive              2.0   
3               6.0            manual  rear wheel drive              2.0   
4               6.0            manual  rear wheel drive              2.0   

                         Market Category  ... city mpg Popularity   MSRP  \
0  Factory Tuner,Luxury,High-Performance  ...       19    

In [10]:
# Dropping 'Model' isn’t necessary because it contains useful information and can be encoded instead of removed.
dataset.drop('Model', axis=1, inplace=True)

# Encode Categorical Variables

In [15]:
from sklearn.preprocessing import LabelEncoder

# 1. Handling 'Market Category' using Label Encoding
# (Kyuki isme categories bahut zyada hain)
encoder = LabelEncoder()
df = dataset.copy() # original data safe rakhne ke liye
df['Market Category'] = encoder.fit_transform(df['Market Category'])

# 2. Converting other categorical columns to dummies
# In columns ko 0 aur 1 mein convert kar rahe hain
cols_to_convert = ['Make', 'Engine Fuel Type', 'Transmission Type', 
                   'Driven_Wheels', 'Vehicle Size', 'Vehicle Style']

df = pd.get_dummies(df, columns=cols_to_convert)

# final check
print("New Data Shape:", df.shape)
df.head()

New Data Shape: (11199, 104)


Unnamed: 0,Model,Year,Engine HP,Engine Cylinders,Number of Doors,Market Category,highway MPG,city mpg,Popularity,MSRP,...,Vehicle Style_Convertible,Vehicle Style_Convertible Suv,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,1 Series M,2011,335.0,6.0,2.0,38,26,19,3916,46135,...,False,False,True,False,False,False,False,False,False,False
1,1 Series,2011,300.0,6.0,2.0,67,28,19,3916,40650,...,True,False,False,False,False,False,False,False,False,False
2,1 Series,2011,300.0,6.0,2.0,64,28,20,3916,36350,...,False,False,True,False,False,False,False,False,False,False
3,1 Series,2011,230.0,6.0,2.0,67,28,18,3916,29450,...,False,False,True,False,False,False,False,False,False,False
4,1 Series,2011,230.0,6.0,2.0,63,28,18,3916,34500,...,True,False,False,False,False,False,False,False,False,False


In [16]:
# checking the columns we have now
print("New columns after encoding:")
print(df.columns)

# note: get_dummies already removes the old categorical columns, 
# so we don't need to manually drop them again.

# just verifying the shape
print("\nFinal shape of the processed data:", df.shape)

New columns after encoding:
Index(['Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Number of Doors',
       'Market Category', 'highway MPG', 'city mpg', 'Popularity', 'MSRP',
       ...
       'Vehicle Style_Convertible', 'Vehicle Style_Convertible Suv',
       'Vehicle Style_Coupe', 'Vehicle Style_Crew Cab Pickup',
       'Vehicle Style_Extended Cab Pickup', 'Vehicle Style_Passenger Minivan',
       'Vehicle Style_Passenger Van', 'Vehicle Style_Regular Cab Pickup',
       'Vehicle Style_Sedan', 'Vehicle Style_Wagon'],
      dtype='object', length=104)

Final shape of the processed data: (11199, 104)


In [23]:
# Final merging of original data and encoded columns
# axis=1 ka matlab hai side-by-side join karna
final_data = df.copy()

print("\n--- Final Dataset Columns ---")
print(final_data.columns)

print("\nFinal shape:", final_data.shape)


--- Final Dataset Columns ---
Index(['Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Number of Doors',
       'Market Category', 'highway MPG', 'city mpg', 'Popularity', 'MSRP',
       ...
       'Vehicle Style_Convertible', 'Vehicle Style_Convertible Suv',
       'Vehicle Style_Coupe', 'Vehicle Style_Crew Cab Pickup',
       'Vehicle Style_Extended Cab Pickup', 'Vehicle Style_Passenger Minivan',
       'Vehicle Style_Passenger Van', 'Vehicle Style_Regular Cab Pickup',
       'Vehicle Style_Sedan', 'Vehicle Style_Wagon'],
      dtype='object', length=104)

Final shape: (11199, 104)


In [24]:
final_data.head()

Unnamed: 0,Model,Year,Engine HP,Engine Cylinders,Number of Doors,Market Category,highway MPG,city mpg,Popularity,MSRP,...,Vehicle Style_Convertible,Vehicle Style_Convertible Suv,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,1 Series M,2011,335.0,6.0,2.0,38,26,19,3916,46135,...,False,False,True,False,False,False,False,False,False,False
1,1 Series,2011,300.0,6.0,2.0,67,28,19,3916,40650,...,True,False,False,False,False,False,False,False,False,False
2,1 Series,2011,300.0,6.0,2.0,64,28,20,3916,36350,...,False,False,True,False,False,False,False,False,False,False
3,1 Series,2011,230.0,6.0,2.0,67,28,18,3916,29450,...,False,False,True,False,False,False,False,False,False,False
4,1 Series,2011,230.0,6.0,2.0,63,28,18,3916,34500,...,True,False,False,False,False,False,False,False,False,False


# save file 

In [25]:
final_data.to_csv("new_clean.csv", index=False, encoding="utf-8-sig")


In [26]:
da=pd.read_csv("new_clean.csv")

In [27]:
da.head()

Unnamed: 0,Model,Year,Engine HP,Engine Cylinders,Number of Doors,Market Category,highway MPG,city mpg,Popularity,MSRP,...,Vehicle Style_Convertible,Vehicle Style_Convertible Suv,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,1 Series M,2011,335.0,6.0,2.0,38,26,19,3916,46135,...,False,False,True,False,False,False,False,False,False,False
1,1 Series,2011,300.0,6.0,2.0,67,28,19,3916,40650,...,True,False,False,False,False,False,False,False,False,False
2,1 Series,2011,300.0,6.0,2.0,64,28,20,3916,36350,...,False,False,True,False,False,False,False,False,False,False
3,1 Series,2011,230.0,6.0,2.0,67,28,18,3916,29450,...,False,False,True,False,False,False,False,False,False,False
4,1 Series,2011,230.0,6.0,2.0,63,28,18,3916,34500,...,True,False,False,False,False,False,False,False,False,False
