## Title: Leveraging Quadratic Polynomials in Python for Advanced Data Analysis

### Version 1

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd  # Import pandas for handling CSV
from numpy.polynomial.polynomial import Polynomial
from sklearn.preprocessing import StandardScaler # v2

In [None]:
# Ask for the three pieces of text that label the plot. The prompts are issued
# in order: location description, pollutant name, then the y-axis label.
description, pollution_name, y_label = map(
    input,
    (
        "Enter the location description (e.g., Kyiv, Shcherbakovskaya St.): ",
        "Enter the pollution name (e.g., PM2.5): ",
        "Enter the y-axis label (e.g., PM2.5 Index): ",
    ),
)

In [None]:
# Load the measurements into a DataFrame.
# The default source is the raw CSV file hosted in the GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/rsipakov/QuadraticPolynomialsPyDA/main/notebooks/pm_data.csv'
data = pd.read_csv(DATA_URL)
# Alternatively, download the CSV and read the local copy instead:
# data = pd.read_csv('/path/pm_data.csv')  # Update the path to your CSV file

In [None]:
# v2 Handling Missing Values
# Drop, in place, every row (axis=0) that has a missing value in any column,
# so that incomplete records do not affect the analysis that follows.
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# v2 Outlier Detection and Treatment using Z-Score method
# Z-scores are only defined for numeric columns, so any text/date columns are
# left out of the computation (they are still kept in the filtered result).
numeric_data = data.select_dtypes(include=np.number)
z_scores = np.abs((numeric_data - numeric_data.mean()) / numeric_data.std())
# Remove a row only when at least one of its z-scores is 3 or more.
# A column with zero variance produces NaN z-scores (0 / 0); NaN >= 3 is
# False, so such a column no longer causes every row to be discarded.
data = data[~(z_scores >= 3).any(axis=1)]

In [None]:
# Pull the 'Month' and 'Values' columns out of the DataFrame as NumPy arrays.
months, values = (data[column].to_numpy() for column in ('Month', 'Values'))

In [None]:
# v.2 Data Normalization using StandardScaler
# Each series gets its own scaler. Refitting a single scaler on the values
# would overwrite the statistics learned from the months, making it impossible
# to map scaled months back to real months with inverse_transform().
months_scaler = StandardScaler()
values_scaler = StandardScaler()
# StandardScaler expects a 2-D (n_samples, n_features) array, hence the
# reshape to a single column; flatten() restores the 1-D shape afterwards.
months_scaled = months_scaler.fit_transform(months.reshape(-1, 1)).flatten()
values_scaled = values_scaler.fit_transform(values.reshape(-1, 1)).flatten()
# Backward-compatible alias: `scaler` used to end up fitted on the values.
scaler = values_scaler

In [None]:
# v.2 Fit the quadratic polynomial
# Least-squares fit of a degree-2 polynomial to the scaled data.
quadratic_fit = Polynomial.fit(months_scaled, values_scaled, deg=2)
# convert() re-expresses the fit in the unscaled domain so the coefficients
# can be applied directly to months_scaled; they are ordered from the constant
# term (index 0) up to the x**2 term (index 2).
coefs = quadratic_fit.convert().coef

In [None]:
# v.2 Calculate y values from the fitted polynomial coefficients
# coefs holds the constant, linear and quadratic terms in that order.
c0, c1, c2 = coefs[0], coefs[1], coefs[2]
fitted_y_values = c0 + c1 * months_scaled + c2 * months_scaled**2

In [None]:
# v.2 Calculate R-squared value
# R^2 = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals of
# the fit and SS_tot is the sum of squared deviations from the mean.
residuals = values_scaled - fitted_y_values
ss_res = np.square(residuals).sum()
ss_tot = np.square(values_scaled - values_scaled.mean()).sum()
r_squared = 1 - ss_res / ss_tot

In [None]:
# v.2 Generate a smooth curve by evaluating the polynomial at many points
# 200 evenly spaced x positions spanning the range of the scaled months.
x = np.linspace(months_scaled.min(), months_scaled.max(), num=200)
# Evaluate constant + linear * x + quadratic * x**2 on that grid.
intercept, linear, quadratic = coefs[0], coefs[1], coefs[2]
y = intercept + linear * x + quadratic * x**2

In [None]:
# Create the plot
# NOTE: the points and the curve come from months_scaled / values_scaled, so
# both axes are in standardized units rather than raw months and raw readings.
fig, ax = plt.subplots(figsize=(12, 7))

# Legend entry for the curve: the fitted equation followed by its R^2
curve_label = f'Fitted curve: {coefs[2]:.4f}$x^2$ + {coefs[1]:.4f}$x$ + {coefs[0]:.4f}\n$R^2 = {r_squared:.4f}$'

# Plot the data and the fitted curve
ax.scatter(months_scaled, values_scaled, color='black', label='Actual Data')
ax.plot(x, y, color='purple', label=curve_label)

# Title and labels using user inputs
ax.set_title(f'{pollution_name} by Month in \n{description}')
ax.set_xlabel('Month')
ax.set_ylabel(y_label)

# Add legend and grid
ax.legend()
ax.grid(True)

# Show the plot
plt.show()