In [1]:
import pandas as pd

# Read the air-quality CSV into `df`.
# The rolling features computed further down depend on row order, so the file
# is assumed to already be sorted by location and then by date_utc.
csv_path = '/Users/magnesium/Documents/Light House Labs Bootcamp/Projects/Final Project/data/air_quality_imputed.csv'
df = pd.read_csv(csv_path)

In [2]:
# --- Calendar features taken from the observation timestamp ---
df["date_utc"] = pd.to_datetime(df["date_utc"])
timestamp_parts = df["date_utc"].dt
df["day_of_week"] = timestamp_parts.dayofweek
df["month"] = timestamp_parts.month
df["hour"] = timestamp_parts.hour

# --- Rolling averages of pollutant concentrations, computed per location ---
pollutants = ["co", "no2", "o3", "pm10", "pm25", "so2"]
window_size = 24  # number of consecutive rows per window (24 hours if the data is hourly — TODO confirm)


def _trailing_mean(series):
    """Mean over the current row and the preceding ``window_size - 1`` rows.

    ``rolling`` is called without ``min_periods``, so the first
    ``window_size - 1`` rows of each series come back as NaN.
    """
    return series.rolling(window=window_size).mean()


# First pass: one "<pollutant>_rolling_mean" column per pollutant. The windows
# never cross locations because the rolling mean is applied group by group.
for pollutant in pollutants:
    per_location = df.groupby("location")[pollutant]
    df[f"{pollutant}_rolling_mean"] = per_location.transform(_trailing_mean)

# Second pass: how far the current reading sits from its rolling average.
# Kept as a separate loop so all *_rolling_mean columns precede all
# *_diff_from_mean columns in the frame.
# NOTE: for any pollutant, <p>_rolling_mean + <p>_diff_from_mean equals <p>
# wherever the rolling mean is defined.
for pollutant in pollutants:
    rolling_col = f"{pollutant}_rolling_mean"
    df[f"{pollutant}_diff_from_mean"] = df[pollutant].sub(df[rolling_col])



In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import KNNImputer

# Build the feature matrix used to predict pm25, then fill missing values.
target_col = 'pm25'

# Identifier / coordinate columns and the target itself are not features.
non_feature_cols = ['date_utc', 'location', 'latitude', 'longitude', target_col]

# Columns computed from the target. pm25_rolling_mean + pm25_diff_from_mean
# equals pm25 on every row where the rolling mean is defined, so keeping them
# in X would let any model reconstruct the target exactly (target leakage)
# and make the evaluation metrics meaningless.
target_derived_cols = [f'{target_col}_rolling_mean', f'{target_col}_diff_from_mean']

X = (
    df.drop(columns=non_feature_cols)
      # errors='ignore' so this still works if the derived columns were never created
      .drop(columns=target_derived_cols, errors='ignore')
)

# Impute missing values using KNN Imputer
# NOTE(review): the imputer is fitted on every row before the train/test split,
# so held-out rows influence the values imputed for training rows. Consider
# fitting it on the training rows only.
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,  # keep row labels aligned with df (and therefore with the target)
)

In [6]:
# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print a short regression report for one model.

    Computes the mean squared error, mean absolute error and R^2 score of
    ``y_pred`` against ``y_true`` and writes them to stdout, followed by a
    separator line.

    Parameters
    ----------
    y_true : ground-truth target values.
    y_pred : predicted values, in the same order as ``y_true``.
    model_name : label printed in the report header.

    Returns
    -------
    None. The report is only printed.
    """
    # All metrics are computed before anything is printed, so a failing metric
    # does not leave a half-written report.
    report_lines = [
        f'{model_name} Evaluation:',
        f'Mean Squared Error: {mean_squared_error(y_true, y_pred)}',
        f'Mean Absolute Error: {mean_absolute_error(y_true, y_pred)}',
        f'R^2 Score: {r2_score(y_true, y_pred)}',
        '-------------------------',
    ]
    print('\n'.join(report_lines))

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): this is a random row-wise split. Because the features include
# rolling-window statistics, neighbouring rows share information, so a
# chronological split may give a more honest estimate — confirm intent.
y = df.loc[:, 'pm25']
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting: fit on the training split, score on the held-out split.
# fit() returns the estimator itself, so construction and fitting are chained.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
evaluate_model(y_true=y_test, y_pred=gb_pred, model_name='Gradient Boosting')

In [None]:
# XGBoost: fit on the training split, score on the held-out split.
import xgboost as xgb
# fit() returns the estimator itself, so construction and fitting are chained.
xgb_model = xgb.XGBRegressor(random_state=42).fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
evaluate_model(y_true=y_test, y_pred=xgb_pred, model_name='XGBoost')

In [None]:
# LightGBM: fit on the training split, score on the held-out split.
import lightgbm as lgb
# fit() returns the estimator itself, so construction and fitting are chained.
lgb_model = lgb.LGBMRegressor(random_state=42).fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
evaluate_model(y_true=y_test, y_pred=lgb_pred, model_name='LightGBM')