In [1]:
import os
import pandas as pd
# Directory that holds the dataset. Set the CAR_PRICE_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset the
# original author's local folder is used, so existing behaviour is unchanged.
path = os.environ.get(
    'CAR_PRICE_DATA_DIR',
    '/Users/mohdsafeenkhan/Desktop/Machine Learning/Project/Car Price Prediction/Data',
)

# List the directory contents (sorted so the output is the same on every run).
print("Files in directory:")
for f in sorted(os.listdir(path)):
    print(f)

# Build the full path of the CSV, echo it, and load it into a DataFrame.
data_file = os.path.join(path, 'car details v4.csv')
print(data_file)
df = pd.read_csv(data_file)

# Preview the first rows as the cell output.
df.head()

Files in directory:
car details v4.csv
/Users/mohdsafeenkhan/Desktop/Machine Learning/Project/Car Price Prediction/Data/car details v4.csv


Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0


Before building a model and letting it learn from this data, we need to do some feature engineering. We first analyse the data to figure out which columns are insignificant and not essential for the learning algorithm. Once we have identified them, we can simply drop those columns from our pandas DataFrame.

In [2]:
# Remove the columns judged not useful for predicting the price.
df = df.drop(columns=['Location', 'Color', 'Model'])


Since we are going to use linear regression here, and some columns such as Fuel Type, Make, etc. are categorical, we need to one-hot encode them so that the model can learn from them.

In [3]:
# Replace each categorical column with one indicator column per category.
df = pd.get_dummies(
    df,
    columns=[
        'Make',
        'Fuel Type',
        'Transmission',
        'Owner',
        'Seller Type',
        'Drivetrain',
    ],
)

# Preview the encoded frame as the cell output.
df.head()

Unnamed: 0,Price,Year,Kilometer,Engine,Max Power,Max Torque,Length,Width,Height,Seating Capacity,...,Owner_Fourth,Owner_Second,Owner_Third,Owner_UnRegistered Car,Seller Type_Commercial Registration,Seller Type_Corporate,Seller Type_Individual,Drivetrain_AWD,Drivetrain_FWD,Drivetrain_RWD
0,505000,2017,87150,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,3990.0,1680.0,1505.0,5.0,...,False,False,False,False,False,True,False,False,True,False
1,450000,2014,75000,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,3995.0,1695.0,1555.0,5.0,...,False,True,False,False,False,False,True,False,True,False
2,220000,2011,67000,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,3585.0,1595.0,1550.0,5.0,...,False,False,False,False,False,False,True,False,True,False
3,799000,2019,37500,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,3995.0,1745.0,1510.0,5.0,...,False,False,False,False,False,False,True,False,True,False
4,1950000,2018,69000,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,4735.0,1830.0,1795.0,7.0,...,False,False,False,False,False,False,True,False,False,True


If we look at the values of Engine, Max Power and Max Torque, we can see that they are strings containing units rather than plain numbers, so we have to extract the numeric value from each of them.

In [4]:
# Engine: strip the ' cc' unit text and convert what is left to float.
df['Engine'] = df['Engine'].str.replace(' cc', '', regex=False).astype('float')

# Max Power / Max Torque: keep only the first number found in the string.
# expand=False makes extract() return a Series rather than a one-column frame.
for spec_col in ('Max Power', 'Max Torque'):
    df[spec_col] = df[spec_col].str.extract(r'(\d+\.?\d*)', expand=False).astype('float')

We have a column named Year, which indicates the year the car was manufactured, but a more useful feature is the age of the car. So we subtract Year from the current year to get the age of each car, store it in a new column, and drop the Year column.

In [5]:
# Fixed reference year used to turn the manufacturing year into an age.
currect_year = 2025

# Age = reference year minus manufacturing year, stored as a new column.
df['Age'] = currect_year - df['Year']

# Year is now redundant, so remove it from the frame in place.
del df['Year']

Now that we have done all the steps we think can improve our algorithm, let's check whether we have any null values. If not, we can move ahead and start with our algorithm.

In [6]:
# Let pandas print every column instead of truncating wide output
# (this changes a global display option for the rest of the session).
pd.set_option('display.max_columns', None)

# Count the missing values per column once, then print only the columns
# that actually contain missing values.
null_counts = df.isna().sum()
print(null_counts[null_counts > 0])


Engine                 80
Max Power              80
Max Torque             80
Length                 64
Width                  64
Height                 64
Seating Capacity       64
Fuel Tank Capacity    113
dtype: int64


We see that the above columns have null values, so what we will do is fill them with the median of each column.

In [7]:
# Columns that contain missing values.
missing_values = [
    'Engine',
    'Max Power',
    'Max Torque',
    'Length',
    'Width',
    'Height',
    'Seating Capacity',
    'Fuel Tank Capacity',
]

# Fill each column's gaps with that column's median.
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train / validation / test split that follows, so they include rows that end
# up in the held-out sets.
column_medians = df[missing_values].median()
df[missing_values] = df[missing_values].fillna(column_medians)

Now that our data is preprocessed, let's separate it into 3 parts: train (60%), cross-validation (20%) and test (20%).

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
train_val, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# 25% of the remaining 80% is 20% of the full data, which leaves 60% to train on.
train, val = train_test_split(
    train_val,
    test_size=0.25,
    random_state=42,
)

As we are working on a regression problem (supervised learning) and have to predict the price, X will be the input containing all the columns except Price, and y will be the output containing only the Price column.

In [9]:
# Name of the column the model has to predict.
target_col = 'Price'

# For each split: features are every column except the target,
# and the label is the target column on its own.
x_train, y_train = train.drop(columns=target_col), train[target_col]
x_val, y_val = val.drop(columns=target_col), val[target_col]
x_test, y_test = test.drop(columns=target_col), test[target_col]

Just one last thing before we start: we need to scale our data. The values of our columns span very different ranges, which makes the algorithm reach the minimum slowly and may give a poor solution, so we are going to use the StandardScaler from scikit-learn. Also note that I have used fit_transform on the train data but only transform on the cross-validation and test data. The reason is that fit computes the mean and the standard deviation, and transform then uses those values to scale the data; if we also called fit on the cross-validation and test sets, that would cause a data leak: the scores on those sets would look better than they really are, and overfitting could go unnoticed.

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Numeric columns to standardise; the one-hot indicator columns are left as they are.
num_feature = [
    'Kilometer', 'Engine', 'Max Power', 'Max Torque', 'Length',
    'Width', 'Age', 'Height', 'Seating Capacity', 'Fuel Tank Capacity',
]

# Learn the scaling statistics from the training split only ...
scaler.fit(x_train[num_feature])

# ... then apply those same statistics to all three splits.
x_train[num_feature] = scaler.transform(x_train[num_feature])
x_val[num_feature] = scaler.transform(x_val[num_feature])
x_test[num_feature] = scaler.transform(x_test[num_feature])

Now let's start learning: we will use LinearRegression from scikit-learn.

In [11]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split
# (fit() returns the fitted estimator itself).
linear_regression = LinearRegression().fit(x_train, y_train)

# Predictions on the same training rows, used for scoring afterwards.
y_hat = linear_regression.predict(x_train)


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


Using the R² score we can see how well our model is performing.

In [12]:
from sklearn.metrics import r2_score

# R² between the training targets and the predictions held in y_hat.
r2 = r2_score(y_true=y_train, y_pred=y_hat)
print(f"R2 score for algorithm is {r2}")

R2 score for algorithm is 0.7483736076442061


We can see the model is not performing well: its R² is only 0.748, i.e. it explains about 74.8% of the variance in price on the training data. So we will now use ridge regression, which penalizes large parameter values.

In [13]:
from sklearn.linear_model import Ridge

# Ridge regression (L2-penalised linear model) with regularisation strength 0.995,
# fitted on the training split.
ridge = Ridge(alpha=0.995).fit(x_train, y_train)

# Predictions on the training rows, used for scoring afterwards.
y_hat = ridge.predict(x_train)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


As the R² score computed below shows, there is still no improvement, so we will now try adding polynomial features.

In [14]:
# Training-set R² for the predictions currently stored in y_hat.
r2 = r2_score(y_true=y_train, y_pred=y_hat)
print(f"R2 score for algorithm is {r2}")

R2 score for algorithm is 0.742177607734087


In [15]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 expansion (squares and pairwise products) without a constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training features, then apply it to every split.
# The x_* names are rebound to the expanded arrays, so this cell should be
# run only once per kernel session.
x_train, x_val, x_test = (
    poly.fit_transform(x_train),
    poly.transform(x_val),
    poly.transform(x_test),
)


We train the model again, but this time with a high alpha; if we don't do this the model is going to overfit. We need to find a value that gives a good bias-variance tradeoff.

In [16]:
# Refit ridge regression on the current training features with a much
# stronger penalty (alpha=550).
ridge = Ridge(alpha=550).fit(x_train, y_train)

# Predictions on the training rows, used for scoring afterwards.
y_hat = ridge.predict(x_train)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  intercept_ = y_offset - X_offset @ coef_
  intercept_ = y_offset - X_offset @ coef_
  intercept_ = y_offset - X_offset @ coef_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


As the score below shows, the model's performance has improved and is now within acceptable limits.

In [17]:
# Training-set R² for the predictions currently stored in y_hat.
r2 = r2_score(y_true=y_train, y_pred=y_hat)
print(f"R2 score for algorithm is {r2}")

R2 score for algorithm is 0.8772187611967448


Let's test this on our cross-validation and test data as well.

In [18]:
# Predict on the cross-validation features and score against their true prices.
y_val_hat = ridge.predict(x_val)
r2 = r2_score(y_true=y_val, y_pred=y_val_hat)
print(f"R2 score for algorithm is {r2}")

R2 score for algorithm is 0.8698873339610372


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [19]:
# Predict on the held-out test features and score against their true prices.
y_test_hat = ridge.predict(x_test)
r2 = r2_score(y_true=y_test, y_pred=y_test_hat)
print(f"R2 score for algorithm is {r2}")

R2 score for algorithm is 0.8638302556477162


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


As we can see, the model achieves almost the same R² on all three of our data sets. If we want to improve the model further, we can use a random forest or XGBoost, since the data is not linear and non-linear algorithms work best for this type of data.