In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Read the raw CSV: fields are separated by ';', decimals are written with ',',
# and the value -200 is parsed as missing (NaN).
data_path = "AirQualityUCI.csv"
data = pd.read_csv(
    filepath_or_buffer=data_path,
    sep=";",
    decimal=",",
    na_values=-200,
)

In [12]:
# Clean the raw frame and index it by timestamp.
#
# Columns that may or may not be present; errors="ignore" makes the drop a
# no-op for any that are already gone, so this cell is safe to re-run.
cols_to_drop = ["Unnamed: 15", "Unnamed: 16"]

# Drop the optional columns, then discard rows lacking 'Date' or 'Time'
# (they cannot be turned into a timestamp). Non-inplace calls keep the step
# a single readable chain and avoid mutating a frame an earlier cell showed.
data = (
    data
    .drop(columns=cols_to_drop, errors="ignore")
    .dropna(subset=['Date', 'Time'])
)

# Combine 'Date' and 'Time' into one datetime column and use it as the index.
# An explicit format is given instead of letting pandas guess: a string such
# as "10/03/2004" is ambiguous, and inference may read it month-first.
# NOTE(review): the format assumes the UCI Air Quality layout (Date as
# DD/MM/YYYY, Time as HH.MM.SS) -- confirm against the file. A mismatch raises
# a ValueError instead of silently producing wrong timestamps.
data = data.assign(
    Datetime=pd.to_datetime(
        data['Date'].astype(str) + ' ' + data['Time'].astype(str),
        format='%d/%m/%Y %H.%M.%S',
    )
).set_index('Datetime')


In [13]:
# Imputation of missing values.
#
# Only numeric columns are linearly interpolated. The frame still carries the
# non-numeric 'Date' and 'Time' columns, and calling DataFrame.interpolate on
# object-dtype columns is deprecated in recent pandas releases. Those columns
# are copied through unchanged because a later cell drops them by name.
# NOTE(review): with pandas' default limit_direction, NaNs before the first
# valid value of a column are presumably left unfilled -- verify none remain
# before fitting a model.
numeric_cols = data.select_dtypes(include=[np.number]).columns
data_imputed = data.copy()
data_imputed[numeric_cols] = data[numeric_cols].interpolate(method='linear')


In [15]:
# Exploratory Data Analysis: keep only the numeric columns of the imputed
# frame, then build their pairwise correlation matrix.
numeric_data = data_imputed.select_dtypes(include=[np.number])
corr = numeric_data.corr(method="pearson")


In [16]:
# Target is 'CO(GT)'; the features are every remaining column except the raw
# 'Date' and 'Time' columns.
y = data_imputed['CO(GT)']
X = data_imputed.drop(columns=['Date', 'Time', 'CO(GT)'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
# Create a linear regression estimator and fit it on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [18]:
# Predict with the fitted model on the training split first, then the test split.
train_preds, test_preds = (
    model.predict(features) for features in (X_train, X_test)
)


In [19]:
# Evaluate on both splits. RMSE is the square root of the mean squared error;
# MAE comes directly from mean_absolute_error.
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

train_mae, test_mae = (
    mean_absolute_error(y_train, train_preds),
    mean_absolute_error(y_test, test_preds),
)


In [20]:
# Report each metric on its own line, rounded to four decimal places.
report_lines = [
    f"Training RMSE: {train_rmse:.4f}",
    f"Test RMSE: {test_rmse:.4f}",
    f"Training MAE: {train_mae:.4f}",
    f"Test MAE: {test_mae:.4f}",
]
print("\n".join(report_lines))


Training RMSE: 0.5966
Test RMSE: 0.5932
Training MAE: 0.3992
Test MAE: 0.3975
