In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the exported records and reduce them to the frame used for modelling.
data = pd.read_csv('cosmos_db_export.csv')

# Parse 'timestamp' so the cut-off below is a real datetime comparison,
# then keep only rows dated on or after 2024-12-03.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data = data[data['timestamp'] >= '2024-12-03']

# Keep only the columns the model uses: the air quality index (target)
# and the three predictor columns.
data = data[['traffic_data.duration_in_traffic', 'air_quality_data.air_quality_index', 'air_quality_data.components.co', 'air_quality_data.components.pm2_5']]

# Replace missing values with the mean of their column.
# The result is reassigned instead of using fillna(..., inplace=True): `data`
# was produced by filtering/selecting from another frame, and mutating such a
# frame in place is what raises pandas' SettingWithCopyWarning.
# NOTE(review): the means are computed over every row, before the train/test
# split further down, so test-set statistics leak into the imputation (and the
# target column is imputed too). Consider imputing from the training split only.
data = data.fillna(data.mean())

# Predictors and response for the regression, named once so the roles are explicit.
feature_columns = [
    'traffic_data.duration_in_traffic',
    'air_quality_data.components.co',
    'air_quality_data.components.pm2_5',
]
target_column = 'air_quality_data.air_quality_index'

X = data[feature_columns]  # predictors: traffic duration plus the CO and PM2.5 component columns
y = data[target_column]    # response: the air quality index

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split
# (fit() returns the estimator, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)

# One row per feature, paired with its fitted coefficient.
coefficients = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': model.coef_,
    }
)
print(coefficients)

# Optional: predict the air quality index for one hand-made sample.
# The model above is fitted with 'air_quality_data.air_quality_index' as its
# target, so the prediction is an AQI value, not a traffic duration. The inputs
# are the model's three features, named explicitly so they cannot be mistaken
# for one another (units are whatever the source data uses — TODO confirm).
sample_data = pd.DataFrame([{
    'traffic_data.duration_in_traffic': 50,
    'air_quality_data.components.co': 20,
    'air_quality_data.components.pm2_5': 80,
}])[list(X.columns)]  # enforce the training column order; KeyError if a feature is missing

# A DataFrame (rather than a bare ndarray) carries the feature names the model
# was fitted with, which avoids scikit-learn's
# "X does not have valid feature names" warning.
predicted_aqi = model.predict(sample_data)
predicted_duration = predicted_aqi  # legacy name kept in case later cells read it
print("Predicted Air Quality Index:", predicted_aqi[0])


Mean Squared Error (MSE): 0.27941954211397124
R-squared (R²): 0.3355219888862445
                             Feature  Coefficient
0   traffic_data.duration_in_traffic     0.000115
1     air_quality_data.components.co     0.000712
2  air_quality_data.components.pm2_5     0.036737
Predicted Traffic Duration: 4.291792957619675


