# Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

# Preprocessing of Data

In [8]:
# Load the raw listings and drop the columns this model does not use.
file_path = 'car_price_prediction.csv'
data = pd.read_csv(file_path)
columns_irrelevant = [
    'ID', 'Model', 'Levy', 'Wheel', 'Color', 'Leather interior',
    'Engine volume', 'Category', 'Manufacturer', 'Fuel type', 'Doors',
]
data_filtered = data.drop(columns=columns_irrelevant)

# Integer codes for the two categorical features that are kept.
# NOTE(review): these codes impose an arbitrary order on unordered categories;
# one-hot encoding (pd.get_dummies) may suit a linear model better - confirm.
mapping1 = {
    'Automatic': 1,
    'Tiptronic': 2,
    'Variator': 3,
    'Manual': 4,
}
mapping2 = {
    '4x4': 1,
    'Front': 2,
    'Rear': 3,
}
data_filtered['Gear box type'] = data_filtered['Gear box type'].map(mapping1)
data_filtered['Drive wheels'] = data_filtered['Drive wheels'].map(mapping2)

# Series.map yields NaN for every value that is missing from the mapping.
# Those NaNs would flow through scaling into the weights, so fail fast here.
encoded_columns = ['Gear box type', 'Drive wheels']
unmapped_counts = data_filtered[encoded_columns].isna().sum()
if unmapped_counts.any():
    raise ValueError(
        "Missing or unmapped category values after encoding: "
        f"{unmapped_counts[unmapped_counts > 0].to_dict()}"
    )

# Mileage is stored as text with a ' km' suffix; strip it literally
# (regex=False) and convert the remainder to a numeric column.
data_filtered['Mileage'] = data_filtered['Mileage'].str.replace(' km', '', regex=False)
data_filtered['Mileage'] = pd.to_numeric(data_filtered['Mileage'])

# Separate the target from the features and hold out 20% for testing.
y = data_filtered['Price']
x = data_filtered.drop(columns=['Price'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training features only, then reuse it for the test
# features so no test-set statistics leak into training. The target is left
# unscaled.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Plain NumPy targets for the from-scratch training code.
y_train_numpy = y_train.values
y_test_numpy = y_test.values
num_rows, num_cols = x.shape

In [7]:
# Preview the first rows of the filtered frame to sanity-check the encoded
# and numeric columns.
# NOTE(review): this cell's execution count (7) is lower than the preprocessing
# cell's (8); re-run the notebook top to bottom so the shown output is current.
data_filtered.head()

Unnamed: 0,Price,Prod. year,Mileage,Cylinders,Gear box type,Drive wheels,Airbags
0,13328,2010,186005,6.0,1,1,12
1,16621,2011,192000,6.0,2,1,8
2,8467,2006,200000,4.0,3,2,2
3,3607,2011,168966,4.0,1,1,0
4,11726,2014,91901,4.0,1,2,4


## Building a linear regression model from scratch

In [22]:
def predict(x):
    """Return the linear model output for ``x``.

    Reads the module-level weight vector ``w`` and bias ``b`` and evaluates
    ``np.dot(x, w) + b``.
    """
    weighted_sum = np.dot(x, w)
    return weighted_sum + b

def cost_function(y_target, y_pred):
    """Return the halved mean squared error between targets and predictions.

    Computes ``(1 / (2 * m)) * sum((y_pred - y_target) ** 2)`` where ``m`` is
    the number of samples (length of the first axis). The sum runs over the
    first axis only.

    Parameters
    ----------
    y_target : array-like
        Observed values. Converted with ``np.asarray`` so that a pandas
        Series is handled positionally regardless of its index labels.
    y_pred : array-like
        Predicted values; must have the same shape as ``y_target``.

    Returns
    -------
    The cost as a NumPy float for 1-D inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs contain no samples.
    """
    targets = np.asarray(y_target, dtype=float)
    predictions = np.asarray(y_pred, dtype=float)
    if targets.shape != predictions.shape:
        raise ValueError(
            f"Shape mismatch: y_target {targets.shape} vs y_pred {predictions.shape}"
        )
    m = targets.shape[0]
    # Vectorized replacement for the per-sample Python loop.
    squared_error_sum = np.sum(np.square(predictions - targets), axis=0)
    return (1 / (2 * m)) * squared_error_sum

def grad_decsend(x, y_target, y_pred, learn_rate):
    """Apply one gradient-descent step to the module-level ``w`` and ``b``.

    The weight gradient is ``(1 / m) * x.T @ (y_pred - y_target)`` and the
    bias gradient is the mean residual, where ``m`` is the length of
    ``y_pred``. ``w`` is updated in place and ``b`` is rebound; nothing is
    returned.
    """
    global w, b
    residual = y_pred - y_target
    n_samples = y_pred.shape[0]
    weight_grad = (1 / n_samples) * np.dot(x.T, residual)
    bias_grad = np.mean(residual)
    w -= learn_rate * weight_grad
    b -= learn_rate * bias_grad

# Hyperparameters for batch gradient descent.
epochs = 1000
learn_rate = 0.01

# Start from an all-zero weight vector (one entry per feature column)
# and a zero bias.
b = 0
w = np.zeros(num_cols)

for epoch in range(epochs):
    # Predict with the current parameters, record the loss, then take one
    # update step; the reported loss is therefore the pre-update value.
    y_pred = predict(x_train_scaled)
    loss = cost_function(y_train_numpy, y_pred)
    grad_decsend(x_train_scaled, y_train_numpy, y_pred, learn_rate)
    # Report progress on every 100th epoch (0, 100, 200, ...).
    if not epoch % 100:
        print(f'Epoch {epoch}, Loss {loss}')


Epoch 0, Loss 22840634070.093735
Epoch 100, Loss 22666774719.39118
Epoch 200, Loss 22639140601.162807
Epoch 300, Loss 22633589231.92864
Epoch 400, Loss 22632073839.97071
Epoch 500, Loss 22631549242.46998
Epoch 600, Loss 22631344940.74606
Epoch 700, Loss 22631261709.30351
Epoch 800, Loss 22631227273.604454
Epoch 900, Loss 22631212953.575687


In [23]:
# Show the learned weight vector and bias, one per line.
print(w, b, sep='\n')

[ 6748.64959544  -203.33692987  2897.08200786  7335.64272674
  -715.38362199 -4136.537484  ]
18946.854433703556


## Prediction and evaluation on test data

In [24]:
# Score the trained parameters on the held-out test split.
y_pr = predict(x_test_scaled)
mse = mean_squared_error(y_test_numpy, y_pr)
r2 = r2_score(y_test_numpy, y_pr)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

Mean Squared Error: 299371807.6515068
R-squared Score: 0.03923279796583112
