# Air Quality

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the air-quality workbook into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
data = pd.read_excel('AirQualityUCI.xlsx')

# Preview the first rows to sanity-check the column layout.
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [2]:
#checking duplicate values
print("Number of duplicate values: ", data.duplicated().sum())
print('\n')

#checking the data types of the columns
data.info()
print('\n')

#checking for null values
# A bare expression is rendered only when it is the last line of a cell, so the
# counts are printed explicitly. Note that isnull() only counts real NaN values;
# readings encoded with the -200 sentinel are not reported here.
print(data.isnull().sum())
print('\n')

#dropping unnecessary features
data1 = data.drop(['Date','Time'], axis=1)

# `display` is the built-in provided by the Jupyter/IPython kernel; it is needed
# because head() is not the final expression of the cell.
display(data1.head())
data1.describe()

Number of duplicate values:  0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           9357 non-null   datetime64[ns]
 1   Time           9357 non-null   object        
 2   CO(GT)         9357 non-null   float64       
 3   PT08.S1(CO)    9357 non-null   float64       
 4   NMHC(GT)       9357 non-null   int64         
 5   C6H6(GT)       9357 non-null   float64       
 6   PT08.S2(NMHC)  9357 non-null   float64       
 7   NOx(GT)        9357 non-null   float64       
 8   PT08.S3(NOx)   9357 non-null   float64       
 9   NO2(GT)        9357 non-null   float64       
 10  PT08.S4(NO2)   9357 non-null   float64       
 11  PT08.S5(O3)    9357 non-null   float64       
 12  T              9357 non-null   float64       
 13  RH             9357 non-null   float64       
 14  AH             9357 non-null   float64 

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,-34.207524,1048.869652,-159.090093,1.865576,894.475963,168.6042,794.872333,58.135898,1391.363266,974.951534,9.7766,39.483611,-6.837604
std,77.65717,329.817015,139.789093,41.380154,342.315902,257.424561,321.977031,126.931428,467.192382,456.922728,43.203438,51.215645,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,0.6,921.0,-200.0,4.004958,711.0,50.0,637.0,53.0,1184.75,699.75,10.95,34.05,0.692275
50%,1.5,1052.5,-200.0,7.886653,894.5,141.0,794.25,96.0,1445.5,942.0,17.2,48.55,0.976823
75%,2.6,1221.25,-200.0,13.636091,1104.75,284.2,960.25,133.0,1662.0,1255.25,24.075,61.875,1.296223
max,11.9,2039.75,1189.0,63.741476,2214.0,1479.0,2682.75,339.7,2775.0,2522.75,44.6,88.725,2.231036


In [3]:
#handling missing values
def handle_missing_value(df, column_name, replace_value):
  """Replace a sentinel value in one column with the mean of the remaining values.

  The frame is modified IN PLACE and also returned, so callers may either keep
  using the object they passed in or rebind the return value.

  Args:
    df: DataFrame holding the column to clean.
    column_name: Label of the column to process.
    replace_value: Sentinel marking a missing reading (e.g. -200).

  Returns:
    The same DataFrame object that was passed in.

  Raises:
    ValueError: If `column_name` is not a column of `df`.
  """
  # Check if the column exists
  if column_name not in df.columns:
    raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

  # Build the sentinel mask once and reuse it for both the mean and the write.
  is_missing = df[column_name] == replace_value
  if not is_missing.any():
    # Nothing to replace: leave values and dtype untouched.
    return df

  # Mean of the valid readings only (NaN if every row carries the sentinel).
  column_mean = df.loc[~is_missing, column_name].mean()

  # The mean is a float. Writing it into an integer column through .loc relies
  # on implicit upcasting, which pandas has deprecated, so cast explicitly first.
  if pd.api.types.is_integer_dtype(df[column_name]):
    df[column_name] = df[column_name].astype('float64')

  df.loc[is_missing, column_name] = column_mean
  return df

In [4]:
# Replace the -200 sentinel with the column mean, one column at a time.
# handle_missing_value edits data1 in place and returns that same frame, so
# after the loop `df` refers to the fully cleaned data1.
for column_name in [
    'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
    'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'T', 'RH', 'AH',
]:
  df = handle_missing_value(data1, column_name, -200)
df.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,2.15275,1099.707856,218.811816,10.082993,939.029205,246.881252,835.370973,113.075515,1456.143486,1022.780725,18.316054,49.23236,1.02553
std,1.316068,212.796116,63.870229,7.302474,261.557856,193.419417,251.741784,43.911095,339.365351,390.609,8.658396,16.974308,0.395836
min,0.1,647.25,7.0,0.149048,383.25,2.0,322.0,2.0,551.0,221.0,-1.9,9.175,0.184679
25%,1.2,941.25,218.811816,4.591495,742.5,112.0,665.5,85.9,1241.5,741.75,12.025,36.55,0.746115
50%,2.15275,1074.5,218.811816,8.593367,923.25,229.0,817.5,113.075515,1456.143486,982.5,18.275,49.23236,1.015441
75%,2.6,1221.25,218.811816,13.636091,1104.75,284.2,960.25,133.0,1662.0,1255.25,24.075,61.875,1.296223
max,11.9,2039.75,1189.0,63.741476,2214.0,1479.0,2682.75,339.7,2775.0,2522.75,44.6,88.725,2.231036


## Scaling

In [10]:
def min_max_scaler(df, cols):
  """Return a copy of `df` with the given columns rescaled to the range [0, 1].

  Each listed column is transformed as (x - min) / (max - min). A column whose
  values are all equal has a zero range; it is mapped to 0.0 instead of being
  divided by zero (which would produce NaN/inf).

  Args:
    df: Source DataFrame; it is not modified.
    cols: Iterable of column labels to scale.

  Returns:
    A new DataFrame with the scaled columns; all other columns are unchanged.
  """
  df_copy = df.copy()
  for col in cols:
    min_val = df_copy[col].min()
    value_range = df_copy[col].max() - min_val
    # Zero range -> divide by 1 so the constant column becomes 0.0.
    # A NaN range (all-NaN column) is left alone and still propagates NaN.
    divisor = value_range if value_range != 0 else 1
    df_copy[col] = (df_copy[col] - min_val) / divisor
  return df_copy

In [12]:
# Scale every column except T, RH and AH, which are left in their original units.
temp_df = df.drop(columns=['T', 'RH', 'AH'])
columns_for_scaling = temp_df.columns.tolist()

# Despite its name, `scaler` is the scaled DataFrame returned by min_max_scaler.
scaler = min_max_scaler(df, columns_for_scaling)

scaler.head()


Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,0.211864,0.511849,0.120981,0.184498,0.361737,0.111036,0.311024,0.328694,0.51304,0.454654,13.6,48.875001,0.757754
1,0.161017,0.463196,0.088832,0.145428,0.312167,0.068382,0.360796,0.266509,0.453125,0.326382,13.3,47.7,0.725487
2,0.177966,0.542011,0.068528,0.139148,0.303701,0.087339,0.3465,0.331655,0.451214,0.370588,11.9,53.975,0.750239
3,0.177966,0.52298,0.06176,0.14278,0.308617,0.115098,0.326168,0.355345,0.464366,0.426741,11.0,60.0,0.786713
4,0.127119,0.448833,0.037225,0.100156,0.24703,0.087339,0.374034,0.337578,0.422212,0.386228,11.15,59.575001,0.788794


In [14]:
#separating independent and dependent variables
X = scaler.drop(['T','RH','AH'], axis=1)
y = scaler["RH"]

# Split by position (no shuffling): the first TRAIN_FRACTION of the rows is used
# for training, the remainder for testing. Deriving the boundary from the frame
# length keeps the split valid if the number of rows changes
# (for 9357 rows this yields 7017 training rows).
TRAIN_FRACTION = 0.75
split_idx = int(len(X) * TRAIN_FRACTION)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]

print(X_train.shape)
print(y_test.shape)

(7017, 10)
(2340,)


## Multivariable Linear Regression with Gradient Descent Algorithm

In [16]:
def gda(X, y, alpha, epoch):
  """Fit a multivariable linear regression with batch gradient descent.

  A column of ones is prepended to `X` for the intercept, all parameters start
  at 1, and every epoch performs one simultaneous update of all parameters
  using the gradient of the cost J = 1/(2m) * sum((X @ Theta - y)^2).

  Args:
    X: 2-D array-like of shape (m, k) with the feature values.
    y: 1-D array-like of length m with the target values.
    alpha: Learning rate.
    epoch: Number of gradient-descent iterations.

  Returns:
    (cost, Theta): `cost` is an array of length `epoch` holding the cost after
    each update; `Theta` is an array of length k + 1 (intercept first).
  """
  # Work on plain float arrays so pandas index labels never influence the maths.
  X = np.asarray(X, dtype=float)
  y = np.asarray(y, dtype=float).ravel()

  m = X.shape[0]
  X = np.concatenate((np.ones((m, 1)), X), axis=1)
  n = X.shape[1]
  Theta = np.ones(n)
  h = X @ Theta

  cost = np.ones(epoch)
  for i in range(epoch):
    # X.T @ (h - y) evaluates all n partial derivatives in a single matrix
    # product, replacing the per-feature Python-level sums.
    Theta -= (alpha / m) * (X.T @ (h - y))
    h = X @ Theta
    cost[i] = np.sum(np.square(h - y)) / (2 * m)
  return cost, Theta

In [18]:
# Gradient-descent hyperparameters shared by both runs.
LEARNING_RATE = 0.0001
N_EPOCHS = 1000

# NOTE(review): the second call fits a separate model on the test split; it does
# not evaluate Theta_train on held-out data. Confirm this is what is intended.
cost_train, Theta_train = gda(X_train, y_train, LEARNING_RATE, N_EPOCHS)
cost_test, Theta_test = gda(X_test, y_test, LEARNING_RATE, N_EPOCHS)

print(cost_train)
print(cost_test)

[1142.63633577 1142.28289651 1141.92958585 1141.57640373 1141.22335011
 1140.87042494 1140.51762817 1140.16495976 1139.81241966 1139.46000782
 1139.1077242  1138.75556875 1138.40354143 1138.05164218 1137.69987096
 1137.34822772 1136.99671242 1136.64532501 1136.29406544 1135.94293367
 1135.59192965 1135.24105334 1134.89030468 1134.53968363 1134.18919015
 1133.83882418 1133.48858569 1133.13847462 1132.78849093 1132.43863458
 1132.08890551 1131.73930367 1131.38982903 1131.04048154 1130.69126115
 1130.34216781 1129.99320149 1129.64436212 1129.29564967 1128.94706408
 1128.59860532 1128.25027334 1127.90206809 1127.55398952 1127.20603759
 1126.85821225 1126.51051346 1126.16294117 1125.81549534 1125.46817591
 1125.12098284 1124.77391609 1124.42697561 1124.08016135 1123.73347327
 1123.38691132 1123.04047546 1122.69416564 1122.34798181 1122.00192393
 1121.65599195 1121.31018583 1120.96450552 1120.61895097 1120.27352215
 1119.928219   1119.58304147 1119.23798953 1118.89306312 1118.5482622
 1118.2

## Mean Squared Error (MSE)

In [19]:
def mse(n, y, y1):
  """Mean squared difference between the first `n` items of `y` and `y1`.

  Both sequences are indexed with the integers 0 .. n-1, and the accumulated
  squared differences are divided by `n`.
  """
  squared_error_total = 0
  for idx in range(n):
    squared_error_total += (y[idx] - y1[idx]) ** 2
  return squared_error_total / n

In [20]:
# NOTE(review): this compares the first N_COMPARED_EPOCHS entries of the two
# cost histories, not model predictions against targets. Confirm that this is
# the metric that is meant to be reported.
N_COMPARED_EPOCHS = 20
mse_value = mse(N_COMPARED_EPOCHS, cost_train, cost_test)

print("The MSE is: ", mse_value)

The MSE is:  27943.204982291732
