
Automobile Data Set
This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars. The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with its price. Then, if it is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuarians call this process "symboling". A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.

The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the average loss per car per year.

Note: Several of the attributes in the database could be used as a "class" attribute.


https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

In [138]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.io import arff
#import scikit-learn as sklearn
from sklearn.linear_model import LinearRegression

In [139]:
raw_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",header=None)


In [140]:
raw_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [141]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
0     205 non-null int64
1     205 non-null object
2     205 non-null object
3     205 non-null object
4     205 non-null object
5     205 non-null object
6     205 non-null object
7     205 non-null object
8     205 non-null object
9     205 non-null float64
10    205 non-null float64
11    205 non-null float64
12    205 non-null float64
13    205 non-null int64
14    205 non-null object
15    205 non-null object
16    205 non-null int64
17    205 non-null object
18    205 non-null object
19    205 non-null object
20    205 non-null float64
21    205 non-null object
22    205 non-null object
23    205 non-null int64
24    205 non-null int64
25    205 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB


In [142]:
raw_df.describe()

Unnamed: 0,0,9,10,11,12,13,16,20,23,24
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [143]:
col_names= ["symboling","normalized-losses", "make", "fuel-type", "aspiration" , "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower","peak-rpm","city-mpg", "highway-mpg","price"]
raw_df.columns= col_names

In [144]:
raw_df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [145]:
raw_df.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [146]:
raw_df.info

<bound method DataFrame.info of      symboling normalized-losses         make fuel-type aspiration  \
0            3                 ?  alfa-romero       gas        std   
1            3                 ?  alfa-romero       gas        std   
2            1                 ?  alfa-romero       gas        std   
3            2               164         audi       gas        std   
4            2               164         audi       gas        std   
..         ...               ...          ...       ...        ...   
200         -1                95        volvo       gas        std   
201         -1                95        volvo       gas      turbo   
202         -1                95        volvo       gas        std   
203         -1                95        volvo    diesel      turbo   
204         -1                95        volvo       gas      turbo   

    num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
0            two  convertible          rwd      

In [147]:
#Find all unique values for each col and print
print(raw_df.nunique())

symboling              6
normalized-losses     52
make                  22
fuel-type              2
aspiration             2
num-of-doors           3
body-style             5
drive-wheels           3
engine-location        2
wheel-base            53
length                75
width                 44
height                49
curb-weight          171
engine-type            7
num-of-cylinders       7
engine-size           44
fuel-system            8
bore                  39
stroke                37
compression-ratio     32
horsepower            60
peak-rpm              24
city-mpg              29
highway-mpg           30
price                187
dtype: int64


In [148]:
for col in raw_df:
  print(f"Unique values in {col}: {raw_df[col].unique()}")

Unique values in symboling: [ 3  1  2  0 -1 -2]
Unique values in normalized-losses: ['?' '164' '158' '192' '188' '121' '98' '81' '118' '148' '110' '145' '137'
 '101' '78' '106' '85' '107' '104' '113' '150' '129' '115' '93' '142'
 '161' '153' '125' '128' '122' '103' '168' '108' '194' '231' '119' '154'
 '74' '186' '83' '102' '89' '87' '77' '91' '134' '65' '197' '90' '94'
 '256' '95']
Unique values in make: ['alfa-romero' 'audi' 'bmw' 'chevrolet' 'dodge' 'honda' 'isuzu' 'jaguar'
 'mazda' 'mercedes-benz' 'mercury' 'mitsubishi' 'nissan' 'peugot'
 'plymouth' 'porsche' 'renault' 'saab' 'subaru' 'toyota' 'volkswagen'
 'volvo']
Unique values in fuel-type: ['gas' 'diesel']
Unique values in aspiration: ['std' 'turbo']
Unique values in num-of-doors: ['two' 'four' '?']
Unique values in body-style: ['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
Unique values in drive-wheels: ['rwd' 'fwd' '4wd']
Unique values in engine-location: ['front' 'rear']
Unique values in wheel-base: [ 88.6  94.5  99.8 

In [149]:
#Replace with NaN
raw_df = raw_df.replace("?",np.nan)

In [150]:
raw_df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [151]:
nan_counts = raw_df.isna().sum()
print(nan_counts)

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64


In [152]:
for col in raw_df:
  print(f"Number of NaN in {col}: {raw_df[col].isna().sum()}")


Number of NaN in symboling: 0
Number of NaN in normalized-losses: 41
Number of NaN in make: 0
Number of NaN in fuel-type: 0
Number of NaN in aspiration: 0
Number of NaN in num-of-doors: 2
Number of NaN in body-style: 0
Number of NaN in drive-wheels: 0
Number of NaN in engine-location: 0
Number of NaN in wheel-base: 0
Number of NaN in length: 0
Number of NaN in width: 0
Number of NaN in height: 0
Number of NaN in curb-weight: 0
Number of NaN in engine-type: 0
Number of NaN in num-of-cylinders: 0
Number of NaN in engine-size: 0
Number of NaN in fuel-system: 0
Number of NaN in bore: 4
Number of NaN in stroke: 4
Number of NaN in compression-ratio: 0
Number of NaN in horsepower: 2
Number of NaN in peak-rpm: 2
Number of NaN in city-mpg: 0
Number of NaN in highway-mpg: 0
Number of NaN in price: 4


In [153]:
#find missing value percentage and decide if it can be used in the process
mis_val_percent = 100 * raw_df.isnull().sum() / len(raw_df)
print (mis_val_percent)

symboling             0.00000
normalized-losses    20.00000
make                  0.00000
fuel-type             0.00000
aspiration            0.00000
num-of-doors          0.97561
body-style            0.00000
drive-wheels          0.00000
engine-location       0.00000
wheel-base            0.00000
length                0.00000
width                 0.00000
height                0.00000
curb-weight           0.00000
engine-type           0.00000
num-of-cylinders      0.00000
engine-size           0.00000
fuel-system           0.00000
bore                  1.95122
stroke                1.95122
compression-ratio     0.00000
horsepower            0.97561
peak-rpm              0.97561
city-mpg              0.00000
highway-mpg           0.00000
price                 1.95122
dtype: float64


In [157]:
#convert object datatype to int64
#raw_df['price'] = raw_df.price.astype(int)
raw_df['price'] = pd.to_numeric(raw_df['price'], errors='coerce')
raw_df = raw_df.dropna(subset=['price'])
raw_df = raw_df.astype({'price': 'int'})

In [159]:
X=raw_df[['symboling']]
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
print("Outlier threshold of Appliances ",IQR)

Outlier threshold of Appliances  symboling    2.0
dtype: float64


In [160]:
#Create a list for feature and target column
All_ColumnNames = raw_df.columns
Target_ColumnNames = ['price']
Feature_ColumnNames = ['symboling','normalized-losses', 'make', 'fuel-type', 'num-of-doors', 'body-style', 
                      'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
                      'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 
                       'horsepower','peak-rpm','city-mpg', 'highway-mpg']
Misc_ColumnNames = ['aspiration']

In [161]:
print(All_ColumnNames)
print(Target_ColumnNames)
print(Feature_ColumnNames)
print(Misc_ColumnNames)

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')
['price']
['symboling', 'normalized-losses', 'make', 'fuel-type', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']
['aspiration']


In [162]:
#feature Data set
feature_df = raw_df[Feature_ColumnNames]
feature_df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,3,,alfa-romero,gas,two,convertible,rwd,front,88.6,168.8,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
1,3,,alfa-romero,gas,two,convertible,rwd,front,88.6,168.8,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
2,1,,alfa-romero,gas,two,hatchback,rwd,front,94.5,171.2,...,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26
3,2,164.0,audi,gas,four,sedan,fwd,front,99.8,176.6,...,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30
4,2,164.0,audi,gas,four,sedan,4wd,front,99.4,176.6,...,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22


In [163]:
#split raw_df data btw training and testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feature_df, raw_df[Target_ColumnNames], test_size = 0.20, random_state = 0 )

In [165]:
#Feature Engineering
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold

# Load the data
#df = pd.read_csv('data.csv')

# Split the data into features and target
#X = df.drop('target', axis=1)
#y = df['target']

# Remove highly correlated variables
# Set the threshold for correlation coefficient
threshold = 0.9

# Calculate the correlation matrix
corr = x_train.corr()
print(corr)
# Find the pairs of features with correlation coefficient greater than the threshold
correlated_features = np.where(corr > threshold)[0]
print("correlated_features",correlated_features)
# Remove the correlated features
#x_train = x_train.drop(correlated_features, axis=1)

# Stepwise regression
# Set the number of features to select
#k = 5
# Create a logistic regression model
model = LinearRegression()

# Perform stepwise regression

#rfe = RFE(model, k)
#rfe.fit(x_train, y_train)


from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import chi2, f_regression

selector = SelectKBest(f_regression, k=3)
X_new = selector.fit_transform(x_train, y_train)

# Fit a linear regression model on the selected features
model = LinearRegression()
model.fit(X_new, y_train)

# Print the coefficients of the model
print(model.coef_)
print(model.get_params())


# Get the selected features
#selected_features = rfe.support_selected_features
#X = X.iloc[:, selected_features]
#print("selected_features", selected_features)

# Remove near zero variance variables
# Set the threshold for variance
threshold = 0.01

# Calculate the variance of each feature
var = x_train.var()

# Find the features with variance less than the threshold
low_variance_features = np.where(var < threshold)[0]
print("low_variance_features", low_variance_features)
# Remove the low-variance features
#X = X.drop(low_variance_features, axis=1)

# Print the selected features
#print(x_train.columns)

                   symboling  wheel-base    length     width    height  \
symboling           1.000000   -0.543575 -0.368855 -0.251311 -0.603418   
wheel-base         -0.543575    1.000000  0.870601  0.822393  0.604144   
length             -0.368855    0.870601  1.000000  0.856819  0.485351   
width              -0.251311    0.822393  0.856819  1.000000  0.307861   
height             -0.603418    0.604144  0.485351  0.307861  1.000000   
curb-weight        -0.232741    0.774199  0.879430  0.877313  0.323351   
engine-size        -0.109762    0.587734  0.700520  0.753466  0.102715   
compression-ratio  -0.194170    0.278372  0.175641  0.239405  0.230058   
city-mpg           -0.010118   -0.506544 -0.702229 -0.651588 -0.088726   
highway-mpg         0.040955   -0.559141 -0.723215 -0.699834 -0.126377   

                   curb-weight  engine-size  compression-ratio  city-mpg  \
symboling            -0.232741    -0.109762          -0.194170 -0.010118   
wheel-base            0.774199   

ValueError: could not convert string to float: 'mercedes-benz'

In [166]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

num_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='median'))
    ])

char_transformer = Pipeline([
    ('char_imputer', SimpleImputer(strategy='most_frequent')),
    ('OHE',OneHotEncoder(handle_unknown='error',drop='first', sparse =False)),
    ])

scale_transformer = Pipeline([
    ('scaler', StandardScaler())
    ])
#OHE_transformer = OneHotEncoder(handle_unknown='error',drop='first', sparse =False)


#from sklearn.preprocessing import OneHotEncoder
#OH_encoder = OneHotEncoder(handle_unknown='error',drop='first', sparse=False)
#OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(df_train[final_char_feat_names]))
#OH_col_names = OH_encoder.get_feature_names(final_char_feat_names)
#OH_col_names

preprocessor = ColumnTransformer(
    transformers=[
         ("num", num_transformer, Feature_ColumnNames),
        ("scaler",scale_transformer,Feature_ColumnNames)
    ]
)

In [167]:
x_train

Unnamed: 0,symboling,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
69,0,93,mercedes-benz,diesel,two,hardtop,rwd,front,106.7,187.5,...,five,183,idi,3.58,3.64,21.5,123,4350,22,25
27,1,148,dodge,gas,,sedan,fwd,front,93.7,157.3,...,four,98,mpfi,3.03,3.39,7.6,102,5500,24,30
116,0,161,peugot,diesel,four,sedan,rwd,front,107.9,186.7,...,four,152,idi,3.70,3.52,21.0,95,4150,28,33
172,2,134,toyota,gas,two,convertible,rwd,front,98.4,176.2,...,four,146,mpfi,3.62,3.50,9.3,116,4800,24,30
66,0,,mazda,diesel,four,sedan,rwd,front,104.9,175.0,...,four,134,idi,3.43,3.64,22.0,72,4200,31,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,-1,93,mercedes-benz,diesel,four,sedan,rwd,front,115.6,202.6,...,five,183,idi,3.58,3.64,21.5,123,4350,22,25
196,-2,103,volvo,gas,four,sedan,rwd,front,104.3,188.8,...,four,141,mpfi,3.78,3.15,9.5,114,5400,24,28
120,1,154,plymouth,gas,four,hatchback,fwd,front,93.7,157.3,...,four,90,2bbl,2.97,3.23,9.4,68,5500,31,38
50,1,104,mazda,gas,two,hatchback,fwd,front,93.1,159.1,...,four,91,2bbl,3.03,3.15,9.0,68,5000,30,31


In [168]:
x_test

Unnamed: 0,symboling,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
19,1,98.0,chevrolet,gas,two,hatchback,fwd,front,94.5,155.9,...,four,90,2bbl,3.03,3.11,9.6,70,5400,38,43
174,-1,65.0,toyota,diesel,four,sedan,fwd,front,102.4,175.6,...,four,110,idi,3.27,3.35,22.5,73,4500,30,33
110,0,,peugot,diesel,four,wagon,rwd,front,114.2,198.9,...,four,152,idi,3.7,3.52,21.0,95,4150,25,25
101,0,128.0,nissan,gas,four,sedan,fwd,front,100.4,181.7,...,six,181,mpfi,3.43,3.27,9.0,152,5200,17,22
181,-1,,toyota,gas,four,wagon,rwd,front,104.5,187.8,...,six,161,mpfi,3.27,3.35,9.2,156,5200,19,24
186,2,94.0,volkswagen,gas,four,sedan,fwd,front,97.3,171.7,...,four,109,mpfi,3.19,3.4,9.0,85,5250,27,34
5,2,,audi,gas,two,sedan,fwd,front,99.8,177.3,...,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25
150,1,87.0,toyota,gas,two,hatchback,fwd,front,95.7,158.7,...,four,92,2bbl,3.05,3.03,9.0,62,4800,35,39
13,0,188.0,bmw,gas,four,sedan,rwd,front,101.2,176.8,...,six,164,mpfi,3.31,3.19,9.0,121,4250,21,28
156,0,91.0,toyota,gas,four,sedan,fwd,front,95.7,166.3,...,four,98,2bbl,3.19,3.03,9.0,70,4800,30,37


In [169]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
# Create a logistic regression model
linreg = LinearRegression()

# Fit the model to the training data
linreg.fit(x_train, y_train)

# Predict the labels of the test data
y_pred = linreg.predict(x_test)

# Evaluate the model's performance on the test data
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

plt.scatter(range(len(y_test)), y_test, color='blue')
plt.scatter(range(len(y_pred)), y_pred, color='red')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression Model')
plt.show()


ValueError: could not convert string to float: 'mercedes-benz'