In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer


In [37]:
df = pd.read_csv('car data.csv')


In [50]:
df.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Car_Age,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3.35,-0.236215,0.083876,0,-0.128897,False,True,False,True
1,4.75,0.221505,0.54384,0,0.217514,True,False,False,True
2,7.25,0.257427,-1.264545,0,-1.168129,False,True,False,True
3,2.85,-0.403079,-1.544088,0,0.910335,False,True,False,True
4,4.6,-0.08789,0.531116,0,-0.128897,True,False,False,True


In [38]:
# handle Missing Value
print(df.isnull().sum())


Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64


In [39]:
# Filling numerical missing values with median
num_cols = ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner']
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Filling categorical missing values with mode
cat_cols = ['Fuel_Type', 'Seller_Type', 'Transmission']
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])


In [40]:
df['Car_Age'] = 2025 - df['Year']
df.drop(['Year'], axis=1, inplace=True)


In [41]:
df['Kms_Driven'] = np.log1p(df['Kms_Driven'])  


In [42]:
df.drop(['Car_Name'], axis=1, inplace=True)


In [43]:
# encoding categorical value...
df = pd.get_dummies(df, columns=['Fuel_Type', 'Seller_Type', 'Transmission'], drop_first=True)


In [44]:
le = LabelEncoder()
df['Owner'] = le.fit_transform(df['Owner'])


In [45]:
# Standardization
scaler = StandardScaler()
num_features = ['Present_Price', 'Kms_Driven', 'Car_Age']
df[num_features] = scaler.fit_transform(df[num_features])


In [46]:
X = df.drop('Selling_Price', axis=1)  # Features
y = df['Selling_Price']  # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [47]:
df.to_csv('processed_car_data.csv', index=False)


In [48]:
newdf = pd.read_csv('processed_car_data.csv')

In [49]:
newdf.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Car_Age,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3.35,-0.236215,0.083876,0,-0.128897,False,True,False,True
1,4.75,0.221505,0.54384,0,0.217514,True,False,False,True
2,7.25,0.257427,-1.264545,0,-1.168129,False,True,False,True
3,2.85,-0.403079,-1.544088,0,0.910335,False,True,False,True
4,4.6,-0.08789,0.531116,0,-0.128897,True,False,False,True
