We implement linear regression with scikit-learn on the Air Quality dataset, and then evaluate the model's predictions on a held-out test set.

In [None]:
# Let's import all required libraries

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt

In [None]:
# Load the Air Quality CSV from its GitHub-hosted location and preview the first rows.

DATA_URL = 'https://github.com/rising-entropy/datasets/raw/main/AirQualityUCI.csv'
air_data = pd.read_csv(DATA_URL)
air_data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10-03-2004,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10-03-2004,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,10-03-2004,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10-03-2004,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10-03-2004,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


In [None]:
# Count the missing (NaN) entries in every column of the raw table.
air_data.isna().sum()

Date              114
Time              114
CO(GT)            114
PT08.S1(CO)       114
NMHC(GT)          114
C6H6(GT)          114
PT08.S2(NMHC)     114
NOx(GT)           114
PT08.S3(NOx)      114
NO2(GT)           114
PT08.S4(NO2)      114
PT08.S5(O3)       114
T                 114
RH                114
AH                114
Unnamed: 15      9471
Unnamed: 16      9471
dtype: int64

In [None]:
# Handling the missing values in two steps: drop the columns that are not used
# for modelling, then mean-impute whatever NaNs remain.

# Drop Date and Time, the two 'Unnamed' columns, and RH / AH / T in one call.
air_data = air_data.drop(
    columns=['Date', 'Time', 'Unnamed: 15', 'Unnamed: 16', 'RH', 'AH', 'T']
)

# Replace every remaining NaN with the mean of its own column.
# The result is assigned back instead of calling
# `air_data[col].fillna(..., inplace=True)` per column: that form is chained
# in-place assignment, which pandas deprecates and which does not update the
# parent frame when copy-on-write is enabled.
# NOTE(review): this also imputes the last column, which the splitting cell
# later uses as the regression target -- confirm that mean-filling the target
# rows is intended rather than dropping them.
air_data = air_data.fillna(air_data.mean(numeric_only=True))

# Sanity check: every remaining column should now report zero missing values.
air_data.isnull().sum()

CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
dtype: int64

In [None]:
# Preview the first five rows of the cleaned dataset.

air_data.head(n=5)

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3)
0,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0
1,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0
2,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0
3,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0
4,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0


In [None]:
# Separate predictors from the target: every column except the last is a
# feature, and the last column is the value to predict.
feature_frame = air_data.iloc[:, :-1]
target_frame = air_data.iloc[:, -1:]  # slicing (not -1) keeps the target 2-D
X = feature_frame.values
Y = target_frame.values

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Fit a linear regression model on the training split.
# `fit` returns the estimator itself, so construction and training are chained;
# the bare `model` on the last line displays the fitted estimator.
model = LinearRegression().fit(X_train, Y_train)
model

LinearRegression()

In [None]:
# We now test the model on the held-out test split.
#
# The score is the mean absolute percentage error expressed as a fraction:
# the average over all test rows of |actual - predicted| / |actual|.
# scikit-learn's metric (already imported at the top of the notebook) is used
# instead of a hand-written loop because it divides by the magnitude of the
# true value, so negative targets cannot produce negative terms that cancel
# out real error, and it guards against division by zero.

X_test_prediction = model.predict(X_test)
theValue = mean_absolute_percentage_error(Y_test, X_test_prediction)

print("The Error is", theValue, "fractions average.")


The Error is 0.10333011859351106 fractions average.


Thus, we performed pre-processing, training, and testing of a linear regression model on the Air Quality dataset.