### Importing Necessary libraries

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import SGD
from matplotlib import pyplot

### Connecting to Google drive and loading the dataset

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
datasetPath = "/content/drive/MyDrive/Colab Notebooks/data_set.data" #path to the dataset file in google drive

### Loading Dataset
Dataset is loaded using pandas with headers mentioned below

In [52]:
headers = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", 
         "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", 
         "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower", 
         "peak-rpm", "city-mpg", "highway-mpg", "price"]

df = pd.read_csv(datasetPath, names = headers)
df.shape

(204, 26)

### Handling Missing Values
The "?" in the above dataset represents missing value. To handle them we do the following.
*First Convert "?" to NaN*

In [53]:
df.replace("?", np.nan, inplace = True)

### Dealing with missing values
We can either 
* Drop row / column
* Replace data 

But dropping rows is not effiecient since we have very less number of rows. So instead replace those missing values with meaningful replacements like average or mode

*In the below columns we replace missing values (NaN) with mean value of the column*

In [54]:
columns = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]

for column in columns:
    average = df[column].astype("float").mean(axis = 0)
    df[column].replace(np.nan, average, inplace = True)

*In the below columns we replace values (NaN) with the most frequent occurance in the columns*

In [55]:
df["num-of-doors"].value_counts()
mostFreq = df["num-of-doors"].value_counts().idxmax()
df["num-of-doors"].replace(np.nan, mostFreq, inplace = True)

*In the below columns we replace string 'four', 'five' etc with the corresponding numerical values in the columns 'num-of-doors' and 'num-of-cylinders'* 

In [56]:
df["num-of-doors"] = df["num-of-doors"].apply(lambda x: 4 if x == 'four' else 2)
df.replace({'four': 4,'six': 6, 'five': 5, 'three': 3, 'twelve': 12, 'two': 2, 'eight': 8},inplace=True)

### Checking datatypes of columns and correcting if necessary 

In [57]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors           int64
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders       int64
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [58]:
df[["bore"]] = df[["bore"]].astype("float")
df[["stroke"]] = df[["stroke"]].astype("float")
df[["normalized-losses"]] = df[["normalized-losses"]].astype("int")
df[["price"]] = df[["price"]].astype("float")
df[["peak-rpm"]] = df[["peak-rpm"]].astype("float")
df[["horsepower"]] = df[["horsepower"]].astype("float")

*For the below columns we change the string datatype into integer number using factorize*

In [59]:
for i in ['make', 'fuel-type', 'aspiration', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'fuel-system']:
  codes = None
  unique = None
  codes, uniques = pd.factorize(df[i])
  df[i] = codes

In [60]:
df.dtypes
display(df)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122,0,0,0,2,0,0,0,88.6,168.8,64.1,48.8,2548,0,4,130,0,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122,0,0,0,2,0,0,0,88.6,168.8,64.1,48.8,2548,0,4,130,0,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122,0,0,0,2,1,0,0,94.5,171.2,65.5,52.4,2823,1,6,152,0,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164,1,0,0,4,2,1,0,99.8,176.6,66.2,54.3,2337,2,4,109,0,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164,1,0,0,4,2,2,0,99.4,176.6,66.4,54.3,2824,2,5,136,0,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,-1,95,21,0,0,4,2,0,0,109.1,188.8,68.9,55.5,2952,2,4,141,0,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
200,-1,95,21,0,1,4,2,0,0,109.1,188.8,68.8,55.5,3049,2,4,141,0,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
201,-1,95,21,0,0,4,2,0,0,109.1,188.8,68.9,55.5,3012,1,6,173,0,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
202,-1,95,21,1,1,4,2,0,0,109.1,188.8,68.9,55.5,3217,2,6,145,6,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


### Building the model (without L1 regularizer)

In [61]:
df = df.astype("float")
y = df['symboling'].copy()
X = df.drop('symboling', axis = 1).copy()
scaler = StandardScaler()
X = scaler.fit_transform(X)
trainX, testX, trainy, testy = train_test_split(X, y, train_size = 0.75, random_state = 100)

In [62]:
# Defining the model
model = Sequential()
model.add(Dense(100, input_dim = 25, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(1, activation='softmax'))

# Compiling the model
opt = SGD(learning_rate = 0.1, momentum = 0.9)
model.compile(loss='categorical_crossentropy', optimizer = opt, metrics=['accuracy'])

history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs = 200, verbose = 0)

# Evaluating the model
_, train_acc = model.evaluate(trainX, trainy, verbose=0)
_, test_acc = model.evaluate(testX, testy, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

Train: 0.333, Test: 0.314


### Building the model (with L1 regularizer)

In [63]:
yl = df['symboling'].copy()
Xl = df.drop('symboling', axis = 1).copy()
Xl = scaler.fit_transform(Xl)
trainXl, testXl, trainyl, testyl = train_test_split(Xl, yl, train_size = 0.75, random_state = 100)

In [64]:
# Defining the model
model = Sequential()
model.add(Dense(100, input_dim = 25, activation='relu', kernel_initializer='he_uniform', kernel_regularizer = tf.keras.regularizers.l1(0.001)))
model.add(Dense(1, activation='softmax', kernel_regularizer = tf.keras.regularizers.l1(0.001)))

# Compiling the model
opt = SGD(learning_rate = 0.1, momentum = 0.9)
model.compile(loss='categorical_crossentropy', optimizer = opt, metrics=['accuracy'])

history = model.fit(trainXl, trainyl, validation_data=(testXl, testyl), epochs = 200, verbose = 0)

# Evaluating the model
_, train_acc = model.evaluate(trainXl, trainyl, verbose = 0)
_, test_acc = model.evaluate(testXl, testyl, verbose = 0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

Train: 0.333, Test: 0.314
