In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# --- Load the data set and build the binary "AQI > 100" classification task ---
df = pd.read_csv('/content/final_dataset.csv')
df = df.dropna()

# Predictors we would like to use. The CSV does not necessarily contain all of
# them (a previous run of this cell failed with a KeyError for 'O3',
# 'Temperature', 'Humidity' and 'Wind Speed'), so keep only the columns that
# are actually present and report the ones that were skipped.
candidate_features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'Wind Speed']
features = [col for col in candidate_features if col in df.columns]
missing_features = [col for col in candidate_features if col not in df.columns]
if missing_features:
    print(f'Warning: columns not found in the dataset and skipped: {missing_features}')
if not features:
    raise ValueError('None of the expected feature columns are present in the dataset.')

X_raw = df[features].values
# Positive class: AQI strictly above 100.
y = (df['AQI'] > 100).astype(int).values

# Split row indices first so the scaler can be fitted on the training rows only;
# fitting it on every row would let test-set statistics leak into preprocessing.
row_indices = np.arange(len(df))
train_idx, test_idx = train_test_split(row_indices, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(X_raw[train_idx])

# X stays the full, scaled feature matrix because later code reads it directly.
X = scaler.transform(X_raw)
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
# --- Sweep the L2 penalty strength for logistic regression on the scaled features ---
lambdas = [0, 0.1, 1, 10, 100, 1000]
train_losses, test_losses = [], []
l2_norms, weights_list = [], []
train_accuracies, test_accuracies = [], []

for lam in lambdas:
    # The model takes C = 1 / lambda; lambda = 0 is approximated by a very large C.
    C = 1e10 if lam == 0 else 1 / lam
    model = LogisticRegression(penalty='l2', C=C, solver='lbfgs', max_iter=1000)
    model.fit(X_train, y_train)

    # Average cross-entropy on both splits, computed from predicted probabilities.
    train_losses.append(log_loss(y_train, model.predict_proba(X_train)))
    test_losses.append(log_loss(y_test, model.predict_proba(X_test)))

    # Size of the weight vector and the individual weights for the later plots.
    l2_norms.append(np.linalg.norm(model.coef_))
    weights_list.append(model.coef_[0])

    # Accuracy of the hard class predictions on both splits.
    train_accuracies.append(accuracy_score(y_train, model.predict(X_train)))
    test_accuracies.append(accuracy_score(y_test, model.predict(X_test)))

# --- Four-panel summary of the L2 sweep: loss, weight norm, weights, accuracy ---
# lambda = 0 has no position on a pure log axis, so the unregularised point
# would be dropped from every panel. A symlog axis is linear inside
# [-linthresh, linthresh] and logarithmic outside, which keeps lambda = 0 visible.
lambda_linthresh = min((lam for lam in lambdas if lam > 0), default=1.0)

plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(lambdas, train_losses, label='Train')
plt.plot(lambdas, test_losses, label='Test')
plt.xscale('symlog', linthresh=lambda_linthresh)
plt.xlabel('Lambda')
plt.ylabel('Average Cross-Entropy')
plt.legend()
plt.title('Cross-Entropy vs Lambda')

plt.subplot(2, 2, 2)
plt.plot(lambdas, l2_norms)
plt.xscale('symlog', linthresh=lambda_linthresh)
plt.xlabel('Lambda')
plt.ylabel('L2 Norm of Weights')
plt.title('L2 Norm vs Lambda')

plt.subplot(2, 2, 3)
# One curve per feature: each column of the stacked weights is that feature's
# coefficient across all lambdas.
weight_matrix = np.array(weights_list)
for feature_name, feature_weights in zip(features, weight_matrix.T):
    plt.plot(lambdas, feature_weights, label=feature_name)
plt.xscale('symlog', linthresh=lambda_linthresh)
plt.xlabel('Lambda')
plt.ylabel('Weight Values')
plt.legend()
plt.title('Weights vs Lambda')

plt.subplot(2, 2, 4)
plt.plot(lambdas, train_accuracies, label='Train')
plt.plot(lambdas, test_accuracies, label='Test')
plt.xscale('symlog', linthresh=lambda_linthresh)
plt.xlabel('Lambda')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Accuracy vs Lambda')

plt.tight_layout()
plt.show()
# Centres of the Gaussian basis functions: five evenly spaced values from -10
# to 10, shared by every feature.
# NOTE(review): the features are standardised before this point, so most of
# these centres presumably lie far from the bulk of the data - confirm that
# this range is intended.
mus = np.linspace(-10, 10, 5)

def apply_gaussian_basis(X, sigma, mus):
    """Expand each column of X with Gaussian bumps of a single width.

    For feature ``f`` and centre index ``m`` the output column
    ``f * len(mus) + m`` holds ``exp(-(X[:, f] - mus[m])**2 / (2 * sigma**2))``.

    Parameters:
        X: 2-D array of shape (n_samples, n_features).
        sigma: width shared by all basis functions.
        mus: sequence of basis-function centres.

    Returns:
        Array of shape (n_samples, n_features * len(mus)).
    """
    n_rows, n_cols = X.shape
    n_centres = len(mus)
    two_var = 2 * sigma**2
    expanded = np.zeros((n_rows, n_cols * n_centres))
    for k, centre in enumerate(mus):
        # Columns k, k + n_centres, ... are the k-th centre of feature 0, 1, ...
        expanded[:, k::n_centres] = np.exp(-(X - centre)**2 / two_var)
    return expanded

# --- Unregularised logistic regression on Gaussian-basis features, one fit per sigma ---
sigmas = [0.1, 0.5, 1, 5, 10]
train_losses_f = []
test_losses_f = []

for sigma in sigmas:
    X_train_basis = apply_gaussian_basis(X_train, sigma, mus)
    X_test_basis = apply_gaussian_basis(X_test, sigma, mus)

    # penalty=None disables regularisation. The string 'none' used previously
    # was deprecated in scikit-learn 1.2 and removed in 1.4.
    model = LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000)
    model.fit(X_train_basis, y_train)

    y_train_pred_prob = model.predict_proba(X_train_basis)
    y_test_pred_prob = model.predict_proba(X_test_basis)

    train_loss = log_loss(y_train, y_train_pred_prob)
    test_loss = log_loss(y_test, y_test_pred_prob)
    train_losses_f.append(train_loss)
    test_losses_f.append(test_loss)

# Reference lines: the lambda = 0 entry of the earlier sweep on the raw
# (non-expanded) features.
no_reg_train_loss = train_losses[0]
no_reg_test_loss = test_losses[0]

plt.figure()
plt.plot(sigmas, train_losses_f, label='Train')
plt.plot(sigmas, test_losses_f, label='Test')
plt.axhline(no_reg_train_loss, color='blue', linestyle='--', label='No Basis Train')
plt.axhline(no_reg_test_loss, color='orange', linestyle='--', label='No Basis Test')
plt.xscale('log')
plt.xlabel('Sigma')
plt.ylabel('Average Cross-Entropy')
plt.legend()
plt.title('Error vs Sigma (No Regularization)')
plt.show()
# Sizes describing the combined basis expansion: number of input features,
# number of centres and number of widths.
n_features = X.shape[1]
n_mus, n_sigmas = len(mus), len(sigmas)

def apply_all_gaussian_basis(X, sigmas, mus):
    """Expand X with Gaussian basis functions for every (sigma, feature, mu) triple.

    Columns are ordered sigma-major: all columns for ``sigmas[0]`` come first,
    then ``sigmas[1]``, and so on. Within one sigma group the column for
    feature ``f`` and centre index ``m`` sits at offset ``f * len(mus) + m`` and
    holds ``exp(-(X[:, f] - mus[m])**2 / (2 * sigma**2))``.

    All sizes are taken from the arguments themselves, so the function works
    for any input and does not depend on module-level variables.

    Parameters:
        X: 2-D array-like of shape (n_samples, n_features).
        sigmas: sequence of basis-function widths.
        mus: sequence of basis-function centres.

    Returns:
        Array of shape (n_samples, n_features * len(mus) * len(sigmas)).
    """
    X = np.asarray(X, dtype=float)
    centres = np.asarray(mus, dtype=float)
    n_samples, n_feats = X.shape
    block_width = n_feats * centres.size

    # The squared distance to each centre does not depend on sigma, so compute
    # it once. The reshape lays columns out as f * len(mus) + m.
    sq_dist = ((X[:, :, None] - centres[None, None, :]) ** 2).reshape(n_samples, block_width)

    new_X = np.zeros((n_samples, block_width * len(sigmas)))
    for s_idx, sigma in enumerate(sigmas):
        start = s_idx * block_width
        new_X[:, start:start + block_width] = np.exp(-sq_dist / (2 * sigma**2))
    return new_X

# --- L2 sweep on the combined expansion (every sigma, feature and centre) ---
X_train_all_basis = apply_all_gaussian_basis(X_train, sigmas, mus)
X_test_all_basis = apply_all_gaussian_basis(X_test, sigmas, mus)

lambdas_g = [0, 0.1, 1, 10, 100, 1000, 10000]
train_losses_g, test_losses_g, l2_norms_g = [], [], []
sigma_l2_norms = {s: [] for s in sigmas}
# Each sigma owns one contiguous run of n_features * n_mus weights.
group_size = n_features * n_mus

for lam in lambdas_g:
    # The model takes C = 1 / lambda; lambda = 0 is approximated by a very large C.
    C = 1e10 if lam == 0 else 1 / lam
    model = LogisticRegression(penalty='l2', C=C, solver='lbfgs', max_iter=1000)
    model.fit(X_train_all_basis, y_train)

    train_losses_g.append(log_loss(y_train, model.predict_proba(X_train_all_basis)))
    test_losses_g.append(log_loss(y_test, model.predict_proba(X_test_all_basis)))

    w = model.coef_[0]
    l2_norms_g.append(np.linalg.norm(w))

    # Norm of the weight slice belonging to each sigma group.
    for s_idx, sigma in enumerate(sigmas):
        start = s_idx * group_size
        sigma_l2_norms[sigma].append(np.linalg.norm(w[start:start + group_size]))

# --- Three-panel summary of the all-basis L2 sweep ---
# lambda = 0 has no position on a pure log axis, so the unregularised point
# would be dropped from every panel. A symlog axis is linear inside
# [-linthresh, linthresh] and logarithmic outside, which keeps lambda = 0 visible.
lambda_g_linthresh = min((lam for lam in lambdas_g if lam > 0), default=1.0)

plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
plt.plot(lambdas_g, train_losses_g, label='Train')
plt.plot(lambdas_g, test_losses_g, label='Test')
plt.xscale('symlog', linthresh=lambda_g_linthresh)
plt.xlabel('Lambda')
plt.ylabel('Average Cross-Entropy')
plt.legend()
plt.title('Cross-Entropy vs Lambda (All Basis)')

plt.subplot(1, 3, 2)
plt.plot(lambdas_g, l2_norms_g)
plt.xscale('symlog', linthresh=lambda_g_linthresh)
plt.xlabel('Lambda')
plt.ylabel('L2 Norm of Weights')
plt.title('L2 Norm vs Lambda (All Basis)')

plt.subplot(1, 3, 3)
# One curve per sigma: the norm of that sigma's weight group at each lambda.
for sigma in sigmas:
    plt.plot(lambdas_g, sigma_l2_norms[sigma], label=f'Sigma={sigma}')
plt.xscale('symlog', linthresh=lambda_g_linthresh)
plt.xlabel('Lambda')
plt.ylabel('L2 Norm of Sigma Group')
plt.legend()
plt.title('L2 Norm per Sigma vs Lambda')

plt.tight_layout()
plt.show()

KeyError: "['O3', 'Temperature', 'Humidity', 'Wind Speed'] not in index"

In [None]:
# --- Regression baselines: predict the raw AQI value from the scaled features ---
y_reg = df['AQI'].values
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

# Ordinary least squares.
lin_reg = LinearRegression()
lin_reg.fit(X_train_reg, y_train_reg)
y_train_pred_reg, y_test_pred_reg = lin_reg.predict(X_train_reg), lin_reg.predict(X_test_reg)
train_mse = mean_squared_error(y_train_reg, y_train_pred_reg)
test_mse = mean_squared_error(y_test_reg, y_test_pred_reg)
print(f'Linear Regression - Train MSE: {train_mse}, Test MSE: {test_mse}')

# Ridge regression with alpha = 1.0.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_reg, y_train_reg)
y_train_pred_ridge, y_test_pred_ridge = ridge.predict(X_train_reg), ridge.predict(X_test_reg)
train_mse_ridge = mean_squared_error(y_train_reg, y_train_pred_ridge)
test_mse_ridge = mean_squared_error(y_test_reg, y_test_pred_ridge)
print(f'Ridge Regression - Train MSE: {train_mse_ridge}, Test MSE: {test_mse_ridge}')

# --- Unregularised logistic-regression baseline on the classification split ---
# penalty=None disables regularisation. The string 'none' used previously was
# deprecated in scikit-learn 1.2 and removed in 1.4.
log_reg = LogisticRegression(penalty=None, max_iter=1000)
log_reg.fit(X_train, y_train)
train_acc_log = accuracy_score(y_train, log_reg.predict(X_train))
test_acc_log = accuracy_score(y_test, log_reg.predict(X_test))
print(f'Logistic Regression - Train Acc: {train_acc_log}, Test Acc: {test_acc_log}')


NameError: name 'X' is not defined