This is Basic Linear Regression model that predicts car price


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
df = pd.read_csv('car_price_dataset.csv')
df

Unnamed: 0,Model,Year,Transmission,Fuel,Body,Price
0,Audi A3,2014,Automatic,Petrol,Coupe,12344
1,Audi A3,2007,Manual,Hybrid,Convertible,6674
2,Nissan Altima,2007,Manual,Petrol,Coupe,8305
3,Tesla Model 3,2019,Manual,Hybrid,SUV,20007
4,Mazda CX-5,2005,Manual,Electric,Coupe,7075
...,...,...,...,...,...,...
995,Toyota Corolla,2005,Manual,Hybrid,Convertible,6485
996,Honda Civic,2020,Manual,Diesel,Truck,21508
997,Nissan Altima,2009,Manual,Hybrid,Wagon,10993
998,Chevrolet Malibu,2010,Automatic,Electric,Sedan,10971


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Model         1000 non-null   object
 1   Year          1000 non-null   int64 
 2   Transmission  1000 non-null   object
 3   Fuel          1000 non-null   object
 4   Body          1000 non-null   object
 5   Price         1000 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 47.0+ KB


In [4]:
df.isnull().sum()

Model           0
Year            0
Transmission    0
Fuel            0
Body            0
Price           0
dtype: int64

In [5]:
duplicates = df.duplicated().sum()
print(f"\n number of duplicates: {duplicates}")
df = df.drop_duplicates()
print(f"\n no. duplicates remaining: {df.duplicated().sum()}")


 number of duplicates: 0

 no. duplicates remaining: 0


In [6]:
ohe  = OneHotEncoder(handle_unknown='ignore',  sparse_output= False).set_output(transform='pandas')
ohetransform = ohe.fit_transform(df[['Transmission', 'Fuel', 'Body', 'Model']])
encoded_df = pd.DataFrame(ohetransform, columns = ohe.get_feature_names_out())


In [7]:
df = pd.concat([df, ohetransform], axis = 1).drop(['Transmission', 'Fuel', 'Body', 'Model'], axis = 1)
df.head()

Unnamed: 0,Year,Price,Transmission_Automatic,Transmission_Manual,Fuel_Diesel,Fuel_Electric,Fuel_Hybrid,Fuel_Petrol,Body_Convertible,Body_Coupe,...,Model_Hyundai Tucson,Model_Kia Sportage,Model_Lexus RX,Model_Mazda CX-5,Model_Mercedes-Benz C-Class,Model_Nissan Altima,Model_Subaru Outback,Model_Tesla Model 3,Model_Toyota Corolla,Model_Volkswagen Golf
0,2014,12344,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2007,6674,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2007,8305,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2019,20007,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2005,7075,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
x = df.drop('Price', axis = 1)
y = df['Price']

In [9]:
X_train , X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2,random_state= 42)

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [11]:
predictions = lr.predict(X_test)
predictions

array([19910.89262139,  9922.39599277, 18756.00656866, 16962.65088522,
       19961.93038835, 17821.40431645, 12167.78890258, 17915.00240142,
        8736.01315826,  9916.75848696, 12024.51131295, 10867.85764642,
        4679.34657433,  5310.50465602, 10943.09744021,  7138.59925422,
        5401.37235931, 11271.82506393, 22067.75580367,  9754.17889673,
       23059.16300189, 25588.84535154, 13756.72761789,  4858.7734012 ,
       25013.7488978 , 14629.7964457 , 18004.47196108, 22224.69720048,
        5100.33484427, 23163.75755932,  6174.37345684, 16814.07287625,
       18022.78359183, 11925.84062151, 17363.93223506, 23360.48837024,
       16135.19379578, 23923.08356785, 21073.52658467, 14040.91663131,
       22036.21794132, 12930.72074679, 16262.67961856, 17217.57463753,
        6794.68614153, 10890.83634017, 18043.96309135, 17601.16671242,
       18866.3986152 ,  8765.15617103, 18978.45187828,  9093.97194747,
       24742.23174316, 16014.17788665, 19774.46588476,  9165.82471487,
      

In [12]:
print("Mean Squared Error: ", mean_squared_error(y_test,  predictions))
print("Root Mean Squared Error: ", r2_score(y_test,  predictions))

Mean Squared Error:  1278049.2058490152
Root Mean Squared Error:  0.9686051250643625
