In [1]:
# The libraries that I used for the part 2
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Bring out the dataset 
mpg = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original", sep='\s+', header = None)

In [3]:
# Rename the Column from the dataset, and delete the last column which is the name of car
mpg.columns = ['mpg', 'cylinders','displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
mpg = mpg.drop(['car_name'], axis = 1)
# Standarization of dataset
scaler=StandardScaler()
scaler.fit(mpg)
mpg = scaler.transform(mpg)
# Rename the Column
mpg = pd.DataFrame(mpg,columns=['mpg', 'cylinders','displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin'])

In [4]:
# Drop the not suitable columns(Discrete Variables)
mpg = mpg.drop(['origin', 'cylinders', 'model_year'], axis = 1)

In [5]:
# Check out the number of NAs
mpg.isna().sum()

mpg             8
displacement    0
horsepower      6
weight          0
acceleration    0
dtype: int64

In [6]:
# Remove null or NA values
mpg.dropna(inplace = True)
# Remove any redundant rows
mpg.drop_duplicates(inplace = True)

In [7]:
# Find the information about the dataset, and check the NULLs
mpg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 391 entries, 0 to 405
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           391 non-null    float64
 1   displacement  391 non-null    float64
 2   horsepower    391 non-null    float64
 3   weight        391 non-null    float64
 4   acceleration  391 non-null    float64
dtypes: float64(5)
memory usage: 18.3 KB


In [8]:
# The target of this dataset is MPG, so Y is MPG, and X is the others except for Y
Y = mpg['mpg']
X = mpg.drop(['mpg'], axis=1)

In [9]:
# Split the dataset 80(train) : 20(test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)

In [10]:
# Change the train and test set into dataframe
x_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(Y_train)
x_test = pd.DataFrame(X_test)
y_test = pd.DataFrame(Y_test)

In [11]:
# add the column names axis which is for constant
axis = [1] * len(x_train)
x_train.insert(0, 'axis', axis)
axis = [1] * len(x_test)
x_test.insert(0, 'axis', axis)

In [12]:
model = SGDRegressor(alpha=0.01, max_iter = 10000)
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.01, max_iter=10000)

In [13]:
# model evaluation for training set
y_train_predict = model.predict(x_train)
mse = (mean_squared_error(y_train, y_train_predict))
r2 = r2_score(y_train, y_train_predict)

print("--------------------------------------")
print("The model performance for training set")
print("--------------------------------------")
print('MSE is {}'.format(round(mse,5)))
print('R2 score is {}'.format(round(r2,5)))
print("--------------------------------------")
print("\n")

--------------------------------------
The model performance for training set
--------------------------------------
MSE is 0.27513
R2 score is 0.70664
--------------------------------------




In [14]:
# model evaluation for test set
y_test_predict = model.predict(x_test)
mse = (mean_squared_error(y_test, y_test_predict))
r2 = r2_score(y_test, y_test_predict)

print("--------------------------------------")
print("The model performance for test set")
print("--------------------------------------")
print('MSE is {}'.format(round(mse,5)))
print('R2 score is {}'.format(round(r2,5)))
print("--------------------------------------")
print("\n")

--------------------------------------
The model performance for test set
--------------------------------------
MSE is 0.39364
R2 score is 0.64176
--------------------------------------




In [15]:
# Doing the linear regression with LinearRegression()
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)

LinearRegression()

In [16]:
# model evaluation for training set
y_train_predict = linear_regression.predict(x_train)
mse = mean_squared_error(y_train, y_train_predict)
r2 = r2_score(y_train, y_train_predict)

print("--------------------------------------")
print("The model performance for training set")
print("--------------------------------------")
print('MSE is {}'.format(round(mse,5)))
print('R2 score is {}'.format(round(r2,5)))
print("--------------------------------------")
print("\n")


--------------------------------------
The model performance for training set
--------------------------------------
MSE is 0.27171
R2 score is 0.71029
--------------------------------------




In [17]:
# model evaluation for testing set
y_test_predict = linear_regression.predict(X_test)
mse = mean_squared_error(Y_test, y_test_predict)
r2 = r2_score(Y_test, y_test_predict)
print("--------------------------------------")
print("The model performance for testing set")
print("--------------------------------------")
print('MSE is {}'.format(round(mse,5)))
print('R2 score is {}'.format(round(r2,5)))
print("--------------------------------------")

--------------------------------------
The model performance for testing set
--------------------------------------
MSE is 0.38762
R2 score is 0.64724
--------------------------------------
