In [35]:
import numpy as np
import pandas as pd
import math, copy

In [36]:
# Widen pandas' text rendering so wide frames are printed in full.
pd.options.display.max_columns = None         # never elide columns
pd.options.display.expand_frame_repr = False  # keep each row on a single line
pd.options.display.max_colwidth = None        # never truncate cell contents

Install the ucimlrepo package

In [37]:
pip install ucimlrepo



**Import the dataset**

In [38]:
from ucimlrepo import fetch_ucirepo

# Download the UCI dataset with repository id 360 (requires network access).
air_quality = fetch_ucirepo(id=360)

# Work on an independent copy of the feature frame so that later cleaning steps
# cannot modify the DataFrame that is still held inside `air_quality`.
air_quality_data = air_quality.data.features.copy()
#y = air_quality.data.targets

In [39]:
# Preview the first and last rows of the frame, separated by one blank line.
print(air_quality_data.head(), air_quality_data.tail(), sep="\n\n")

        Date      Time  CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH      AH
0  3/10/2004  18:00:00     2.6         1360       150      11.9           1046      166          1056      113          1692         1268  13.6  48.9  0.7578
1  3/10/2004  19:00:00     2.0         1292       112       9.4            955      103          1174       92          1559          972  13.3  47.7  0.7255
2  3/10/2004  20:00:00     2.2         1402        88       9.0            939      131          1140      114          1555         1074  11.9  54.0  0.7502
3  3/10/2004  21:00:00     2.2         1376        80       9.2            948      172          1092      122          1584         1203  11.0  60.0  0.7867
4  3/10/2004  22:00:00     1.6         1272        51       6.5            836      131          1205      116          1490         1110  11.2  59.6  0.7888

          Date      Time  CO(GT)  PT08.S1(CO)  NMHC

In [40]:
# (rows, columns) of the feature frame as loaded
print("Shape of the data:", air_quality_data.shape)

Shape of the data: (9357, 15)


In [41]:
# Column dtypes and non-null counts. DataFrame.info() prints its report itself and
# returns None, so it is called directly; wrapping it in print() emits a stray "None".
air_quality_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         9357 non-null   float64
 3   PT08.S1(CO)    9357 non-null   int64  
 4   NMHC(GT)       9357 non-null   int64  
 5   C6H6(GT)       9357 non-null   float64
 6   PT08.S2(NMHC)  9357 non-null   int64  
 7   NOx(GT)        9357 non-null   int64  
 8   PT08.S3(NOx)   9357 non-null   int64  
 9   NO2(GT)        9357 non-null   int64  
 10  PT08.S4(NO2)   9357 non-null   int64  
 11  PT08.S5(O3)    9357 non-null   int64  
 12  T              9357 non-null   float64
 13  RH             9357 non-null   float64
 14  AH             9357 non-null   float64
dtypes: float64(5), int64(8), object(2)
memory usage: 1.1+ MB
None


In [42]:
# Count the null (NaN/None) entries of each feature.
print("Number of missing values:", air_quality_data.isnull().sum(), sep="\n")

Number of missing values:
Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64


In [43]:
# Missing values are tagged with -200 value.
def find_missing_value(data):
  """Print how many entries of each column in `data` equal -200.

  One line is printed per column, in column order. The frame is not
  modified; the same object that was passed in is returned.
  """
  sentinel = -200
  for name in data.columns:
    is_sentinel = data[name] == sentinel
    print(f"{name:13} has {is_sentinel.sum():5} occurrences.")

  return data

# Report the -200 counts per column. The function returns the frame it was given,
# so this rebinding leaves `air_quality_data` pointing at the same data.
air_quality_data = find_missing_value(air_quality_data)

Date          has     0 occurrences.
Time          has     0 occurrences.
CO(GT)        has  1683 occurrences.
PT08.S1(CO)   has   366 occurrences.
NMHC(GT)      has  8443 occurrences.
C6H6(GT)      has   366 occurrences.
PT08.S2(NMHC) has   366 occurrences.
NOx(GT)       has  1639 occurrences.
PT08.S3(NOx)  has   366 occurrences.
NO2(GT)       has  1642 occurrences.
PT08.S4(NO2)  has   366 occurrences.
PT08.S5(O3)   has   366 occurrences.
T             has   366 occurrences.
RH            has   366 occurrences.
AH            has   366 occurrences.


In [44]:
# Replace the -200 missing-value marker with the mean of each feature.
def replace_with_mean(data, missing_marker=-200):
  """Return a copy of `data` with the missing-value marker mean-imputed.

  For every numeric column, the mean is computed over the entries that are
  not equal to `missing_marker`, and each marker entry is replaced by that
  mean. Non-numeric columns are left as they are. One line per processed
  column is printed with the mean that was used.

  Args:
    data: pandas DataFrame to clean. It is not modified.
    missing_marker: value that tags a missing measurement (default -200).

  Returns:
    A new DataFrame with the markers replaced.

  Notes:
    If a numeric column contains only the marker, its mean is NaN and the
    markers become NaN.
    NOTE(review): the mean is taken over all rows passed in; if this runs
    before a train/test split, test rows contribute to the imputed value.
  """
  # Copy first so the caller's frame (and any object sharing it) stays untouched.
  cleaned = data.copy()
  for column in cleaned.columns:
    if not pd.api.types.is_numeric_dtype(cleaned[column]):
      continue
    is_missing = cleaned[column] == missing_marker
    mean_value = cleaned.loc[~is_missing, column].mean()
    cleaned[column] = cleaned[column].replace(missing_marker, mean_value)
    print(f"Mean of {column:13} is: {mean_value:8.4f}")
  return cleaned

# Impute the -200 markers in every numeric column and keep the returned frame.
air_quality_data = replace_with_mean(air_quality_data)

Mean of CO(GT)        is:   2.1527
Mean of PT08.S1(CO)   is: 1099.8332
Mean of NMHC(GT)      is: 218.8118
Mean of C6H6(GT)      is:  10.0831
Mean of PT08.S2(NMHC) is: 939.1534
Mean of NOx(GT)       is: 246.8967
Mean of PT08.S3(NOx)  is: 835.4936
Mean of NO2(GT)       is: 113.0913
Mean of PT08.S4(NO2)  is: 1456.2646
Mean of PT08.S5(O3)   is: 1022.9061
Mean of T             is:  18.3178
Mean of RH            is:  49.2342
Mean of AH            is:   1.0255


In [45]:
# Re-run the -200 count after imputation to confirm that no markers remain.
air_quality_data = find_missing_value(air_quality_data)

Date          has     0 occurrences.
Time          has     0 occurrences.
CO(GT)        has     0 occurrences.
PT08.S1(CO)   has     0 occurrences.
NMHC(GT)      has     0 occurrences.
C6H6(GT)      has     0 occurrences.
PT08.S2(NMHC) has     0 occurrences.
NOx(GT)       has     0 occurrences.
PT08.S3(NOx)  has     0 occurrences.
NO2(GT)       has     0 occurrences.
PT08.S4(NO2)  has     0 occurrences.
PT08.S5(O3)   has     0 occurrences.
T             has     0 occurrences.
RH            has     0 occurrences.
AH            has     0 occurrences.


**To perform multivariate linear regression using gradient descent on the Air Quality dataset and report the Mean Squared Error (MSE) for the training and testing datasets, we don't need the date & time features.**

**Now splitting the dataset: 75% for training data and 25% for test data**

In [49]:
# NOTE(review): the recorded output of this cell (13 columns, no Date/Time) was
# produced after the column-drop cell below had already been executed. On a fresh
# top-to-bottom run this cell comes first and still shows all 15 columns.
print(air_quality_data.shape)
print(air_quality_data.head())

(9357, 13)
   CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH      AH
0     2.6       1360.0     150.0      11.9         1046.0    166.0        1056.0    113.0        1692.0       1268.0  13.6  48.9  0.7578
1     2.0       1292.0     112.0       9.4          955.0    103.0        1174.0     92.0        1559.0        972.0  13.3  47.7  0.7255
2     2.2       1402.0      88.0       9.0          939.0    131.0        1140.0    114.0        1555.0       1074.0  11.9  54.0  0.7502
3     2.2       1376.0      80.0       9.2          948.0    172.0        1092.0    122.0        1584.0       1203.0  11.0  60.0  0.7867
4     1.6       1272.0      51.0       6.5          836.0    131.0        1205.0    116.0        1490.0       1110.0  11.2  59.6  0.7888


In [47]:
# Fraction of rows, taken from the top of the frame, that go to the training split.
TRAIN_FRACTION = 0.75

# Remove the Date/Time features by name. Unlike the positional `.iloc[:, 2:]`, this
# is safe to re-run: once the columns are gone the call is a no-op instead of
# silently discarding two more features on every execution of the cell.
air_quality_data = air_quality_data.drop(columns=["Date", "Time"], errors="ignore")

# Row position where the training split ends (rows keep their order; no shuffling).
train_data_index = int(len(air_quality_data) * TRAIN_FRACTION)

train_data = air_quality_data.iloc[:train_data_index]  # train data
test_data = air_quality_data.iloc[train_data_index:]   # test data

# The two splits must partition the dataset exactly.
assert len(train_data) + len(test_data) == len(air_quality_data)

# Persist both splits next to the notebook, without the index column.
train_data.to_csv('./AirQualityTrainingData.csv', index=False)
test_data.to_csv('./AirQualityTestData.csv', index=False)

print(f"Shape of Whole Dataset: {air_quality_data.shape}, Length of dataset: {len(air_quality_data)}")
print(f"Shape of Training Dataset: {train_data.shape}, Length of dataset: {len(train_data)}")
print(f"Shape of Test Dataset: {test_data.shape}, Length of dataset: {len(test_data)}")

Shape of Whole Dataset: (9357, 13), Length of dataset: 9357
Shape of Training Dataset: (7017, 13), Length of dataset: 7017
Shape of Test Dataset: (2340, 13), Length of dataset: 2340


In [48]:
# Preview the first rows of the training split.
print(train_data.head())

   CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH      AH
0     2.6       1360.0     150.0      11.9         1046.0    166.0        1056.0    113.0        1692.0       1268.0  13.6  48.9  0.7578
1     2.0       1292.0     112.0       9.4          955.0    103.0        1174.0     92.0        1559.0        972.0  13.3  47.7  0.7255
2     2.2       1402.0      88.0       9.0          939.0    131.0        1140.0    114.0        1555.0       1074.0  11.9  54.0  0.7502
3     2.2       1376.0      80.0       9.2          948.0    172.0        1092.0    122.0        1584.0       1203.0  11.0  60.0  0.7867
4     1.6       1272.0      51.0       6.5          836.0    131.0        1205.0    116.0        1490.0       1110.0  11.2  59.6  0.7888
