# **Air Quality**

In [2]:
#importing libraries
import numpy as np
import pandas as pd

#load data
data = pd.read_excel('AirQualityUCI.xlsx')

data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


## Data Preprocessing

In [3]:
#checking duplicate values
print("Number of duplicate values: ", data.duplicated().sum())
print('\n')

#checking the data types of the columns
data.info()
print('\n')

#checking for null values
data.isnull().sum()
print('\n')

#dropping unnecessary features
data1 = data.drop(['Date','Time'], axis=1)

data1.head()
data1.describe()

Number of duplicate values:  0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           9357 non-null   datetime64[ns]
 1   Time           9357 non-null   object        
 2   CO(GT)         9357 non-null   float64       
 3   PT08.S1(CO)    9357 non-null   float64       
 4   NMHC(GT)       9357 non-null   int64         
 5   C6H6(GT)       9357 non-null   float64       
 6   PT08.S2(NMHC)  9357 non-null   float64       
 7   NOx(GT)        9357 non-null   float64       
 8   PT08.S3(NOx)   9357 non-null   float64       
 9   NO2(GT)        9357 non-null   float64       
 10  PT08.S4(NO2)   9357 non-null   float64       
 11  PT08.S5(O3)    9357 non-null   float64       
 12  T              9357 non-null   float64       
 13  RH             9357 non-null   float64       
 14  AH             9357 non-null   float64 

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,-34.207524,1048.869652,-159.090093,1.865576,894.475963,168.6042,794.872333,58.135898,1391.363266,974.951534,9.7766,39.483611,-6.837604
std,77.65717,329.817015,139.789093,41.380154,342.315902,257.424561,321.977031,126.931428,467.192382,456.922728,43.203438,51.215645,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,0.6,921.0,-200.0,4.004958,711.0,50.0,637.0,53.0,1184.75,699.75,10.95,34.05,0.692275
50%,1.5,1052.5,-200.0,7.886653,894.5,141.0,794.25,96.0,1445.5,942.0,17.2,48.55,0.976823
75%,2.6,1221.25,-200.0,13.636091,1104.75,284.2,960.25,133.0,1662.0,1255.25,24.075,61.875,1.296223
max,11.9,2039.75,1189.0,63.741476,2214.0,1479.0,2682.75,339.7,2775.0,2522.75,44.6,88.725,2.231036


In [10]:
import pandas as pd

def handle_missing_value(df, column_name, replace_value):
  # Check if the column exists
  if column_name not in df.columns:
    raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
  # Calculation for replacing values
  filtered_data = df[df[column_name] != replace_value]
  column_mean = filtered_data[column_name].mean()
  df.loc[df[column_name] == replace_value, column_name] = column_mean
  return df


In [11]:
df = handle_missing_value(data1, 'CO(GT)', -200)

df.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,2.15275,1048.869652,-159.090093,1.865576,894.475963,168.6042,794.872333,58.135898,1391.363266,974.951534,9.7766,39.483611,-6.837604
std,1.316068,329.817015,139.789093,41.380154,342.315902,257.424561,321.977031,126.931428,467.192382,456.922728,43.203438,51.215645,38.97667
min,0.1,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,1.2,921.0,-200.0,4.004958,711.0,50.0,637.0,53.0,1184.75,699.75,10.95,34.05,0.692275
50%,2.15275,1052.5,-200.0,7.886653,894.5,141.0,794.25,96.0,1445.5,942.0,17.2,48.55,0.976823
75%,2.6,1221.25,-200.0,13.636091,1104.75,284.2,960.25,133.0,1662.0,1255.25,24.075,61.875,1.296223
max,11.9,2039.75,1189.0,63.741476,2214.0,1479.0,2682.75,339.7,2775.0,2522.75,44.6,88.725,2.231036


## Train and Test Data

In [5]:
#separate independent and dependent variable
X = data1.drop(['T','RH','AH'], axis=1)
y = data1["T"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7017, 10)
(7017,)
(2340, 10)
(2340,)


## Data Scaling

In [6]:
#MiniMax Scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
scaler = MinMaxScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)
scaler = MinMaxScaler().fit(y_train.values.reshape(-1,1))
y_train_scaled = scaler.transform(y_train.values.reshape(-1,1))
scaler = MinMaxScaler().fit(y_test.values.reshape(-1,1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1,1))

In [7]:
def minmax(n, original, min, max):
  scaled = 0
  for i in range(n):
    scaled[i] = (original[i] - min) / (max - min)

  return scaled

In [8]:
print(X_train.values.reshape(-1,1))
print(X_train.shape[0])
print(X_train_scaled)
#print(X_train.values)

[[-200.  ]
 [1055.  ]
 [-200.  ]
 ...
 [ 180.3 ]
 [1512.75]
 [1651.  ]]
7017
[[0.         0.56033039 0.         ... 0.         0.59401488 0.47221967]
 [0.9589429  0.56457194 0.         ... 0.65962572 0.54108286 0.50996418]
 [0.95799906 0.66469472 0.449964   ... 0.58921623 0.68872167 0.51271926]
 ...
 [0.96083058 0.7323362  0.         ... 0.56327589 0.76664937 0.60795298]
 [0.96271826 0.72217881 0.53419726 ... 0.60959792 0.72305829 0.59436128]
 [0.96177442 0.72842951 0.         ... 0.70465073 0.59254454 0.67995225]]


In [9]:
X_train_minmax = minmax(X_train.shape[0], X_train.values.reshape(-1,1), float(X_train.values.reshape(-1,1).min()), float(X_train.values.reshape(-1,1).max()))

print(X_train_minmax)

TypeError: 'int' object does not support item assignment

## Multivariable Linear Regression with Gradient Descent Algorithm

In [None]:
def gda(X, y, alpha, epoch):
  m = X.shape[0]
  ones = np.ones((m,1))
  X = np.concatenate((ones, X), axis=1)
  n = X.shape[1]
  Theta = np.ones(n)
  h = np.dot(X, Theta)

  cost = np.ones(epoch)
  for i in range (0, epoch):
    Theta[0] = Theta[0] - (alpha/X.shape[0]) * sum(h-y)
    for j in range (1, n):
      Theta[j] = Theta[j] - (alpha/X.shape[0]) * sum((h-y) * X[:,j])
    h = np.dot(X, Theta)
    cost[i] = 1/(2*m) * sum(np.square(h-y))
  return cost, Theta

In [None]:
cost_train, Theta_train = gda(X_train_scaled, y_train_scaled.squeeze(), 0.01, 7017)
cost_test, Theta_test = gda(X_test_scaled, y_test_scaled.squeeze(), 0.01, 2340)
print(cost_train)
print(cost_test)

## Mean Squared Error (MSE)

In [None]:
def mse(n, y, y1):
  se = 0
  for i in range(n):
    se1 = (y[i] - y1[i])**2
    se += se1

  mse = se/n
  return mse

In [None]:
mse_value = mse(2340, cost_train, cost_test)

print("The MSE is: ", mse_value)