In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the measurements from the CSV file and preview the first rows.
csv_path = "dataset.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# The two leading columns only hold row indices, so remove them in place.
index_columns = data.columns[[0, 1]]
data.drop(columns=index_columns, inplace=True)
data.head()

In [None]:
# Pairwise scatter-plot matrix: variable names on the diagonal, one scatter per
# off-diagonal panel, tick marks only on the outer edge of the grid.
labels = ["radiation", "ozone", "temperature", "wind"]
n_vars = len(labels)
last = n_vars - 1
fig, axs = plt.subplots(n_vars, n_vars)
fig.set_size_inches(10, 10)
for i, row_name in enumerate(labels):
    for j, col_name in enumerate(labels):
        ax = axs[i, j]
        on_diagonal = i == j

        # x ticks only on the top/bottom rows, y ticks only on the left/right
        # columns, and never on a diagonal (label-only) panel.
        ax.xaxis.set_visible(not on_diagonal and i in (0, last))
        ax.yaxis.set_visible(not on_diagonal and j in (0, last))

        if on_diagonal:
            ax.plot()
            ax.text(.5, .5, row_name.capitalize(), ha='center', va='center', transform=ax.transAxes, fontsize=13)
            continue

        if i == 0:
            ax.xaxis.tick_top()
        if j == last:
            ax.yaxis.tick_right()
        # Column variable on the x axis, row variable on the y axis.
        ax.scatter(data[col_name], data[row_name], s=15, color="black")

In [None]:
def tricubic(x):
    """Evaluate the tricube kernel ``(1 - |x|**3)**3`` on ``[-1, 1]``, zero elsewhere.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the kernel. Python scalars, lists and
        NumPy arrays are all accepted; the input is converted to a float array.

    Returns
    -------
    numpy.ndarray
        Kernel values with the same shape as ``x`` (0-d for scalar input).
        Values outside ``[-1, 1]`` (and NaN inputs) map to 0.
    """
    # Converting up front lets plain floats and lists work; indexing them
    # directly (as a boolean-mask implementation would) raises TypeError.
    x = np.asarray(x, dtype=float)
    magnitude = np.abs(x)
    return np.where(magnitude <= 1.0, (1.0 - magnitude ** 3) ** 3, 0.0)
# Plot the kernel on an evenly spaced grid over [-2, 2].
grid = np.linspace(-2, 2, 100)
plt.plot(grid, tricubic(grid))

In [None]:
def get_weights(X, Y, x, q):
    """Build the diagonal tricube weight matrix for the neighbours of ``x``.

    Row-wise Euclidean distances from ``x`` to every row of ``X`` are sorted,
    the smallest one is skipped (presumably the query point itself when ``x``
    is a row of ``X`` -- confirm against the caller), and the next ``q``
    distances are scaled by the largest of them before being passed through
    ``tricubic``.

    Parameters
    ----------
    X : 2-D array-like
        Points to measure distances to, one per row.
    Y : any
        Accepted for signature compatibility; not used.
    x : 1-D array-like
        Query point.
    q : int
        Number of neighbours to weight. Must select at least one distance,
        otherwise indexing the last neighbour raises ``IndexError``.

    Returns
    -------
    numpy.ndarray
        Square diagonal matrix of weights, ordered from nearest to farthest
        neighbour.
    """
    distances = np.linalg.norm(X - x, axis=1)
    neighbour_distances = np.sort(distances)[1:q + 1]

    # Distance to the farthest selected neighbour sets the kernel bandwidth.
    d = neighbour_distances[-1]
    if d == 0:
        # Every selected neighbour coincides with x. Dividing would give
        # 0/0 = NaN, which the kernel maps to all-zero weights and a singular
        # system; equally distant points get equal weight instead.
        weights = np.ones_like(neighbour_distances, dtype=float)
    else:
        weights = tricubic(neighbour_distances / d)

    return np.diag(weights)

In [None]:
def estimate(X, Y, x, f):
    """Locally weighted linear regression estimate of the response at ``x``.

    Predictors and response are standardized (mean 0, sample std 1), the
    neighbours of ``x`` are selected in standardized space, a weighted linear
    model with an intercept is fitted to them, and the prediction is mapped
    back to the original response scale.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor values, one observation per row.
    Y : pandas.DataFrame or pandas.Series
        Response values aligned with ``X``. For a DataFrame the first column
        is used as the response.
    x : pandas.Series or 1-D array-like
        Point at which to estimate, with entries in the column order of ``X``.
    f : float
        Fraction of the rows of ``X`` used as neighbours.

    Returns
    -------
    float
        Estimated response at ``x`` on the original scale of ``Y``.

    Raises
    ------
    ValueError
        If ``f`` is so small that no neighbour is selected.
    numpy.linalg.LinAlgError
        If the weighted normal equations are singular (too few neighbours
        with non-zero weight for the number of predictors plus intercept).
    """
    # Size the neighbourhood from the data passed in, not from a notebook
    # global, so the function works on any X.
    q = int(f * len(X))
    if q < 1:
        raise ValueError("f selects no neighbours; increase f or provide more rows")

    # Standardize predictors; the statistics are computed once and reused.
    x_mean, x_std = X.mean(), X.std()
    standarized_X = (X - x_mean) / x_std
    standarized_x = (x - x_mean) / x_std

    # Use the first column as the response instead of a hard-coded name.
    response = Y.iloc[:, 0] if getattr(Y, "ndim", 1) == 2 else Y
    y_mean, y_std = response.mean(), response.std()
    standarized_Y = (response - y_mean) / y_std

    # get q nearest neighbors; position 0 of the ordering is skipped, matching
    # the slice used inside get_weights (presumably x itself when x is a row
    # of X -- confirm against the caller).
    distances = np.linalg.norm(standarized_X - standarized_x, axis=1)
    sorted_index = np.argsort(distances)[1:q + 1]
    nearest_X = standarized_X.iloc[sorted_index]
    nearest_Y = standarized_Y.iloc[sorted_index]

    # get weights (diagonal matrix, ordered nearest to farthest)
    W = get_weights(standarized_X, standarized_Y, standarized_x, q)

    # get estimate: weighted least squares with an explicit intercept column,
    # so the local plane is not forced through the standardized origin.
    A = np.column_stack([np.ones(len(nearest_X)), np.asarray(nearest_X, dtype=float)])
    b = np.asarray(nearest_Y, dtype=float)
    beta = np.linalg.solve(A.T @ W @ A, A.T @ W @ b)
    res = beta[0] + beta[1:] @ np.asarray(standarized_x, dtype=float)

    # Undo the response standardization and return a plain scalar so callers
    # can store the results in a numeric column.
    return float(res * y_std + y_mean)


In [None]:
# Predictors and response are kept as separate frames for the estimator.
X = data[["radiation", "temperature", "wind"]]
Y = data[["ozone"]]
# Estimate the response at every observed row using a neighbour fraction of 0.4.
estimations = [estimate(X, Y, X.iloc[row], 0.4) for row in range(len(data))]
data["estimation"] = estimations
data["residual"] = data["ozone"] - data["estimation"]

In [None]:
# Absolute residuals against fitted values, drawn on square axes.
fig, ax = plt.subplots()
ax.set_box_aspect(1)
ax.scatter(data["estimation"], data["residual"].abs(), facecolors='none', edgecolors='black', s=20)
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Absolute Residuals")

In [None]:
# Residuals against solar radiation, drawn on square axes.
fig, ax = plt.subplots()
ax.set_box_aspect(1)
ax.scatter(data["radiation"], data["residual"], facecolors='none', edgecolors='black', s=20)
ax.set_xlabel("Solar Radiation")
ax.set_ylabel("Residuals")

In [None]:
# Residuals against temperature, drawn on square axes.
fig, ax = plt.subplots()
ax.set_box_aspect(1)
ax.scatter(data["temperature"], data["residual"], facecolors='none', edgecolors='black', s=20)
ax.set_xlabel("Temperature")
ax.set_ylabel("Residuals")

In [None]:
# Residuals against wind speed, drawn on square axes.
fig, ax = plt.subplots()
ax.set_box_aspect(1)
ax.scatter(data["wind"], data["residual"], facecolors='none', edgecolors='black', s=20)
ax.set_xlabel("Wind Speed")
ax.set_ylabel("Residuals")