In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np

# Location of the input CSV; it is expected to provide the 'DateTime' and
# 'PT08.S3(NOx)' columns used below.
file_path = 'clean.csv'

# Read the file, convert the timestamps to datetime, order the rows
# chronologically and index them by timestamp in a single pipeline.
data = (
    pd.read_csv(file_path)
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .sort_values(by='DateTime')
    .set_index('DateTime')
)

# The previous row's NOx reading becomes the predictor for the current row.
data = data.assign(NOx_lagged=data['PT08.S3(NOx)'].shift(1))

# Remove every row with a missing value in any column; this includes the
# first row, whose lagged value is undefined.
data = data.dropna()

# Model input is the lagged NOx as an (n_samples, 1) column vector;
# the target is the current NOx reading.
X = data['NOx_lagged'].to_numpy().reshape(-1, 1)
y = data['PT08.S3(NOx)'].to_numpy()

# Chronological hold-out split. A float test_size must lie strictly between
# 0 and 1; a value of 1.0 makes train_test_split raise ValueError because it
# would leave no samples to train on. With shuffle=False the time order is
# kept, so the model is fitted on the earliest 80% of the rows and the most
# recent 20% are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Fit a one-feature linear regression: current NOx as a function of the
# previous NOx reading.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict over the entire time span (training rows and held-out rows alike)
# so that the full series can be compared against the actual values.
y_pred = model.predict(X)

# Collect timestamps, observed values and model output side by side.
results = pd.DataFrame(
    {
        'DateTime': data.index,
        'Actual_NOx': y,
        'Predicted_NOx': y_pred,
    }
)

# Draw both series on one set of axes using the object-oriented API.
fig, ax = plt.subplots(figsize=(14, 7))
series_to_draw = (
    ('Actual_NOx', {'label': 'Actual NOx'}),
    ('Predicted_NOx', {'label': 'Predicted NOx', 'linestyle': '--'}),
)
for column, line_kwargs in series_to_draw:
    ax.plot(results['DateTime'], results[column], **line_kwargs)

ax.set_xlabel('DateTime')
ax.set_ylabel('NOx Levels')
ax.set_title('Actual vs Predicted NOx Levels Over Time')
ax.legend()
plt.show()
