importing the dependencies


In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


data collection and processing

In [2]:
#loading dataset to pandas dataframe
car_dataset = pd.read_csv('car data.csv')

In [3]:
#inspect first 5 row of data frame
car_dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
#checking the number of rows and coloumns
car_dataset.shape

(301, 9)

In [5]:
#getting info from dataset
car_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
#checking number of missing value
car_dataset.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [7]:
#checking the distribution of categorical type
print(car_dataset.Fuel_Type.value_counts())
print(car_dataset.Seller_Type.value_counts())
print(car_dataset.Transmission.value_counts())

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64
Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64
Transmission
Manual       261
Automatic     40
Name: count, dtype: int64


encoding the categorical data

In [8]:
car_dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [9]:
# #encoding "Fuel_type"
# car_dataset.replace({'Fuel_Type': {'Petrol':0, 'Diesel':1, 'CNG':2}}, inplace=True)

# #encoding "Seller_type"
# car_dataset.replace({'Seller_Type': {'Dealer':0, 'Individual':1}}, inplace=True)

# #encoding "Transmission"
# car_dataset.replace({'Transmission': {'Manual':0, 'Automatic':1}}, inplace=True)

categorical_features = ['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission']
numeric_features = ['Year', 'Present_Price', 'Kms_Driven','Owner']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough'  # Keep other numeric features as-is
)


In [10]:
car_dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


splitting data and target

In [11]:
X = car_dataset.drop(['Selling_Price'], axis=1)
y = car_dataset['Selling_Price']

In [12]:
print(X)

    Car_Name  Year  Present_Price  ...  Seller_Type Transmission Owner
0       ritz  2014           5.59  ...       Dealer       Manual     0
1        sx4  2013           9.54  ...       Dealer       Manual     0
2       ciaz  2017           9.85  ...       Dealer       Manual     0
3    wagon r  2011           4.15  ...       Dealer       Manual     0
4      swift  2014           6.87  ...       Dealer       Manual     0
..       ...   ...            ...  ...          ...          ...   ...
296     city  2016          11.60  ...       Dealer       Manual     0
297     brio  2015           5.90  ...       Dealer       Manual     0
298     city  2009          11.00  ...       Dealer       Manual     0
299     city  2017          12.50  ...       Dealer       Manual     0
300     brio  2016           5.90  ...       Dealer       Manual     0

[301 rows x 8 columns]


In [13]:
print(y)

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 301, dtype: float64


splitting training and test data 


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1, random_state=2)

model training

In [15]:
#linear regression
# lin_reg_model = LinearRegression()
# Create pipeline
lin_reg_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [16]:
lin_reg_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [17]:
#predict data 
prediction = lin_reg_model.predict(X_train)

In [18]:
#R square error
error_score = metrics.r2_score(y_train, prediction)
print("r square error:", error_score)

r square error: 0.912358564199832


In [19]:
#prediction in training data
test_data_pre = lin_reg_model.predict(X_test)

In [20]:
#R square error
error_score = metrics.r2_score(y_test, test_data_pre)
print("r square error:", error_score)

r square error: 0.8849928809775544


In [21]:
car_dataset.head(10)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,9.25,9.83,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
7,s cross,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,8.75,8.89,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,7.45,8.92,42367,Diesel,Dealer,Manual,0


In [22]:
# output = lin_reg_model.predict([['Ciaz',2022,12,20000,0,1,1,0]])
# print(output)

input_data = pd.DataFrame([[
    'fortuner', 2022, 40, 30000, 'Diesel','Dealer', 'Automatic',1
]], columns=['Car_Name', 'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission','Owner'])

output = lin_reg_model.predict(input_data)
print(output)

[25.33193243]


In [23]:
import pickle

with open('car_price_model.pkl', 'wb') as f:
    pickle.dump(lin_reg_model, f)