In [2]:
import pandas as pd

# Load your dataframe (assuming it's already sorted by location and date_utc)
df = pd.read_csv('/Users/magnesium/Documents/Light House Labs Bootcamp/Projects/Final Project/data/air_quality_imputed.csv')


In [4]:
# Create time based features
df["date_utc"] = pd.to_datetime(df["date_utc"])
df["day_of_week"] = df["date_utc"].dt.dayofweek
df["month"] = df["date_utc"].dt.month
df["hour"] = df["date_utc"].dt.hour

# Create rolling averages of pollution concentrations
pollutants = ["co", "no2", "o3", "pm10", "pm25", "so2"]
window_size = 24

for pollutant in pollutants:
    df[f"{pollutant}_rolling_mean"] = df.groupby("location")[pollutant].transform(lambda x: x.rolling(window=window_size).mean())

# Create difference from rolling average and current value
for pollutant in pollutants:
    df[f"{pollutant}_diff_from_mean"] = df[pollutant] - df[f"{pollutant}_rolling_mean"]


In [12]:
# check for Nan values
df.isna().sum()

date_utc                 0
location                 0
latitude                 0
longitude                0
co                       0
no2                      0
o3                       0
pm10                     0
pm25                     0
so2                      0
day_of_week              0
month                    0
hour                     0
co_rolling_mean        138
no2_rolling_mean       138
o3_rolling_mean        138
pm10_rolling_mean      138
pm25_rolling_mean      138
so2_rolling_mean       138
co_diff_from_mean      138
no2_diff_from_mean     138
o3_diff_from_mean      138
pm10_diff_from_mean    138
pm25_diff_from_mean    138
so2_diff_from_mean     138
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import KNNImputer

# Impute missing values using KNN Imputer
imputer = KNNImputer(n_neighbors=5)
X = df.drop(['date_utc', 'location', 'latitude', 'longitude', 'pm25'], axis=1)
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)


In [7]:
# Split the data into training and testing sets
y = df['pm25']
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Train models
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
lr_pred = lr_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

In [8]:
# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f'{model_name} Evaluation:')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R^2 Score: {r2}')
    print('-------------------------')

evaluate_model(y_test, lr_pred, 'Linear Regression')
evaluate_model(y_test, rf_pred, 'Random Forest')

Linear Regression Evaluation:
Mean Squared Error: 1.4218364126986636
Mean Absolute Error: 0.0337162324398716
R^2 Score: 0.9999595555725649
-------------------------
Random Forest Evaluation:
Mean Squared Error: 247.67519923549386
Mean Absolute Error: 0.8082761317944359
R^2 Score: 0.9929548283237865
-------------------------


In [13]:
# Perform cross validation to better assess model performance
from sklearn.model_selection import cross_val_score

lr_cv_scores = cross_val_score(lr_model, X_imputed, y, cv=5, scoring='neg_mean_squared_error')
rf_cv_scores = cross_val_score(rf_model, X_imputed, y, cv=5, scoring='neg_mean_squared_error')

lr_avg_mse = -lr_cv_scores.mean()
rf_avg_mse = -rf_cv_scores.mean()

lr_avg_r2 = cross_val_score(lr_model, X_imputed, y, cv=5, scoring='r2').mean()
rf_avg_r2 = cross_val_score(rf_model, X_imputed, y, cv=5, scoring='r2').mean()

print(f'Linear Regression (Cross-Validation) - Average MSE: {lr_avg_mse}, Average R^2: {lr_avg_r2}')
print(f'Random Forest (Cross-Validation) - Average MSE: {rf_avg_mse}, Average R^2: {rf_avg_r2}')


Linear Regression (Cross-Validation) - Average MSE: 8.296784272436494, Average R^2: 0.9993931157337567
Random Forest (Cross-Validation) - Average MSE: 3404.4070609488526, Average R^2: 0.9540676122937581


In [14]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Set up Lasso Regression model
lasso_model = Lasso()

# Set up a range of alpha values for hyperparameter tuning
alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]

# Set up a dictionary for the grid search
param_grid = {'alpha': alphas}

# Set up GridSearchCV with Lasso Regression model, using 5-fold cross-validation and neg_mean_squared_error as the scoring metric
grid_search = GridSearchCV(lasso_model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit GridSearchCV to the imputed data
grid_search.fit(X_imputed, y)

# Get the best alpha value and corresponding negative mean squared error
best_alpha = grid_search.best_params_['alpha']
best_neg_mse = grid_search.best_score_

# Print the results
print(f"Best alpha value: {best_alpha}")
print(f"Best negative mean squared error: {best_neg_mse}")

# Train Lasso Regression model with the best alpha value
best_lasso_model = Lasso(alpha=best_alpha)
best_lasso_model.fit(X_train, y_train)

# Make predictions
lasso_pred = best_lasso_model.predict(X_test)

# Evaluate the model
evaluate_model(y_test, lasso_pred, 'Lasso Regression')

Best alpha value: 1
Best negative mean squared error: -1.3418841693832089
Lasso Regression Evaluation:
Mean Squared Error: 1.0099235464645562
Mean Absolute Error: 0.026232031069316944
R^2 Score: 0.9999712725182551
-------------------------
