
Automobile Data Set
This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars. The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with its price. Then, if it is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuarians call this process "symboling". A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.

The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the average loss per car per year.

Note: Several of the attributes in the database could be used as a "class" attribute.


https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.io import arff
#import scikit-learn as sklearn
from sklearn.linear_model import LinearRegression

In [70]:
raw_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",header=None)


In [71]:
raw_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [100]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-ratio    205 non-null float64
horsepower           205 non-nul

In [72]:
raw_df.describe()

Unnamed: 0,0,9,10,11,12,13,16,20,23,24
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [75]:
col_names= ["symboling","normalized-losses", "make", "fuel-type", "aspiration" , "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower","peak-rpm","city-mpg", "highway-mpg","price"]
raw_df.columns= col_names

In [76]:
raw_df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [77]:
raw_df.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [101]:
#Find all unique values for each col and print
print(raw_df.nunique())

symboling              6
normalized-losses     52
make                  22
fuel-type              2
aspiration             2
num-of-doors           3
body-style             5
drive-wheels           3
engine-location        2
wheel-base            53
length                75
width                 44
height                49
curb-weight          171
engine-type            7
num-of-cylinders       7
engine-size           44
fuel-system            8
bore                  39
stroke                37
compression-ratio     32
horsepower            60
peak-rpm              24
city-mpg              29
highway-mpg           30
price                187
dtype: int64


In [102]:
#Replace with NaN
raw_df.replace("?","NaN")

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [104]:
#convert object datatype to int64
raw_df['price'] = raw_df.price.astype(int)

ValueError: invalid literal for int() with base 10: '?'

In [None]:
#find missing value percentage and decide if it can be used in the process

In [78]:
#Null values`
raw_df.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [79]:
raw_df.isnull().values.any()

False

In [80]:
raw_df.isnull().sum().sum()

0

In [86]:
X=raw_df[['symboling']]
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
print("Outlier threshold of Appliances ",IQR)

Outlier threshold of Appliances  symboling    2.0
dtype: float64


In [87]:
#Create a list for feature and target column
All_ColumnNames = raw_df.columns
Target_ColumnNames = ['price']
Feature_ColumnNames = ['symboling','normalized-losses', 'make', 'fuel-type', 'num-of-doors', 'body-style', 
                      'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
                      'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 
                       'horsepower','peak-rpm','city-mpg', 'highway-mpg']
Misc_ColumnNames = ['aspiration']

In [88]:
print(All_ColumnNames)
print(Target_ColumnNames)
print(Feature_ColumnNames)
print(Misc_ColumnNames)

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')
['symboling']
['normalized-losses', 'make', 'fuel-type', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
['aspiration']


In [89]:
#feature Data set
feature_df = raw_df[Feature_ColumnNames]
feature_df.head()

Unnamed: 0,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,alfa-romero,gas,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,?,alfa-romero,gas,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,?,alfa-romero,gas,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,164,audi,gas,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,164,audi,gas,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [93]:
#split raw_df data btw training and testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feature_df, raw_df[Target_ColumnNames], test_size = 0.20, random_state = 0 )

In [94]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

num_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='median'))
    ])

char_transformer = Pipeline([
    ('char_imputer', SimpleImputer(strategy='most_frequent')),
    ('OHE',OneHotEncoder(handle_unknown='error',drop='first', sparse =False)),
    ])

scale_transformer = Pipeline([
    ('scaler', StandardScaler())
    ])
#OHE_transformer = OneHotEncoder(handle_unknown='error',drop='first', sparse =False)


#from sklearn.preprocessing import OneHotEncoder
#OH_encoder = OneHotEncoder(handle_unknown='error',drop='first', sparse=False)
#OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(df_train[final_char_feat_names]))
#OH_col_names = OH_encoder.get_feature_names(final_char_feat_names)
#OH_col_names

preprocessor = ColumnTransformer(
    transformers=[
         ("num", num_transformer, Feature_ColumnNames),
        ("scaler",scale_transformer,Feature_ColumnNames)
    ]
)

In [96]:
x_train

Unnamed: 0,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
4,164,audi,gas,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
71,?,mercedes-benz,gas,four,sedan,rwd,front,115.6,202.6,71.7,...,234,mpfi,3.46,3.10,8.3,155,4750,16,18,34184
134,150,saab,gas,two,hatchback,fwd,front,99.1,186.6,66.5,...,121,mpfi,2.54,2.07,9.3,110,5250,21,28,15040
145,102,subaru,gas,four,sedan,4wd,front,97.0,172.0,65.4,...,108,mpfi,3.62,2.64,7.7,111,4800,24,29,11259
122,154,plymouth,gas,four,sedan,fwd,front,93.7,167.3,63.8,...,98,2bbl,2.97,3.23,9.4,68,5500,31,38,7609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,93,mercedes-benz,diesel,four,sedan,rwd,front,110.0,190.9,70.3,...,183,idi,3.58,3.64,21.5,123,4350,22,25,25552
192,?,volkswagen,diesel,four,sedan,fwd,front,100.4,180.2,66.9,...,97,idi,3.01,3.40,23.0,68,4500,33,38,13845
117,161,peugot,gas,four,sedan,rwd,front,108.0,186.7,68.3,...,134,mpfi,3.61,3.21,7.0,142,5600,18,24,18150
47,145,jaguar,gas,four,sedan,rwd,front,113.0,199.6,69.6,...,258,mpfi,3.63,4.17,8.1,176,4750,15,19,32250


In [97]:
x_test

Unnamed: 0,normalized-losses,make,fuel-type,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
52,104,mazda,gas,two,hatchback,fwd,front,93.1,159.1,64.2,...,91,2bbl,3.03,3.15,9.0,68,5000,31,38,6795
181,?,toyota,gas,four,wagon,rwd,front,104.5,187.8,66.5,...,161,mpfi,3.27,3.35,9.2,156,5200,19,24,15750
5,?,audi,gas,two,sedan,fwd,front,99.8,177.3,66.3,...,136,mpfi,3.19,3.40,8.5,110,5500,19,25,15250
18,121,chevrolet,gas,two,hatchback,fwd,front,88.4,141.1,60.3,...,61,2bbl,2.91,3.03,9.5,48,5100,47,53,5151
188,94,volkswagen,gas,four,sedan,fwd,front,97.3,171.7,65.5,...,109,mpfi,3.19,3.40,10.0,100,5500,26,32,9995
170,134,toyota,gas,two,hardtop,rwd,front,98.4,176.2,65.6,...,146,mpfi,3.62,3.50,9.3,116,4800,24,30,11199
76,161,mitsubishi,gas,two,hatchback,fwd,front,93.7,157.3,64.4,...,92,2bbl,2.97,3.23,9.4,68,5500,37,41,5389
154,81,toyota,gas,four,wagon,4wd,front,95.7,169.7,63.6,...,92,2bbl,3.05,3.03,9.0,62,4800,27,32,7898
104,194,nissan,gas,two,hatchback,rwd,front,91.3,170.7,67.9,...,181,mpfi,3.43,3.27,9.0,160,5200,19,25,17199
33,101,honda,gas,two,hatchback,fwd,front,93.7,150.0,64.0,...,92,1bbl,2.91,3.41,9.2,76,6000,30,34,6529


In [99]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
# Create a logistic regression model
linreg = LinearRegression()

# Fit the model to the training data
linreg.fit(x_train, y_train)

# Predict the labels of the test data
y_pred = linreg.predict(x_test)

# Evaluate the model's performance on the test data
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

plt.scatter(range(len(y_test)), y_test, color='blue')
plt.scatter(range(len(y_pred)), y_pred, color='red')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression Model')
plt.show()


ValueError: could not convert string to float: '?'