### Take four models of polynomials of degree (2, 3, 4, 5), and use the normal equation to optimize theta (with small lambda term, as shown in class). Split the weather data using (80:20) split, print train, and test loss for each model. Write which model you consider best based on train and test loss along with proper explanation.


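normal equation: \[
(X^T X)\,\theta = X^T y
\]

with the small regularization term $\lambda$ (not applied to the intercept): \[
\theta = (X^T X + \lambda I)^{-1} X^T y
\]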


In [None]:
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Polynomial degrees to compare: 2, 3, 4 and 5
models = list(range(2, 6))

In [None]:
# Open the precipitation NetCDF file and flatten it into a table.
dataset_path = r"C:\Users\vvagh\OneDrive - Indian Institute of Science Education and Research Bhopal\Documents\IISERB docs\workshoop assignments\PERCDR_0.25deg_2001_2010_precipitation_data.nc"
data = xr.open_dataset(dataset_path)
# to_dataframe() yields an indexed frame; reset_index() moves that index into
# ordinary columns so it can be used like any other column.
df = data.to_dataframe()
df = df.reset_index()
df

In [None]:
# Column labels of the flattened table; the plotting cell iterates over them.
col = df.columns
# Target as a 2-D array: the double brackets keep 'precip' as a column.
y = df[['precip']].to_numpy()
# Scaler instance shared by the exploratory plotting cells.
scale = StandardScaler()

In [None]:
# Draw one scatter plot of precip against each column except the last two,
# with the column standardized first.
for x in col[:-2]:
    # StandardScaler expects a 2-D input, hence the single-column reshape.
    X_reshaped = df[x].values.reshape(-1, 1)
    X_scaled = scale.fit_transform(X_reshaped)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(X_scaled, y, color='blue', label='precipitaion')
    ax.set_xlabel(f'parameter: {x} (standardised)')
    ax.set_ylabel('precip')
    ax.set_title('scatter Plot of precip with different variables')
    ax.legend()
    plt.show()


In [None]:
# Build a single-feature data set: day of the year -> precipitation.
df['DAY_OF_YEAR'] = df['datetime'].dt.dayofyear
X = df[['DAY_OF_YEAR']].values  # Feature: day of the year, one column
y = df['precip'].values  # Target: precipitation, 1-D
# Chronologically sorted copy of the table. It is not used by the cells visible
# here; it is kept in case a later cell relies on it.
df1 = df.sort_values('datetime').reset_index(drop=True)
X_scaled = scale.fit_transform(X)

# Plot the raw data only; no model is fitted in this cell.
plt.figure(figsize=(10, 6))
plt.scatter(X_scaled, y, color='blue', label='Actual precipitation')
plt.xlabel('Day of the Year (standardized)')
plt.ylabel('precipitaion')
plt.title('Precipitation vs. day of the year')
plt.legend()
plt.show()

In [None]:
# Fit ridge-regularized polynomial models of each degree in `models` to the
# current feature matrix X and target y using the normal equation, then plot
# the fit and report MSE / R2 on the same data.

# Small regularization constant added to the diagonal of X^T X
lambda_ = 1e-5

for degree in models:
    print(f"\nPolynomial Degree: {degree}")

    # Create polynomial features. include_bias=False because the intercept
    # column is added explicitly below; the default constant column would be
    # turned into all zeros by StandardScaler and leave X^T X singular except
    # for the tiny lambda term.
    poly = PolynomialFeatures(degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Standardize the features so the powers are on comparable scales
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_poly)

    # Add a column of ones to X to account for the bias term (intercept)
    X_b = np.c_[np.ones((X_scaled.shape[0], 1)), X_scaled]

    # Regularization matrix: identity with a zero for the intercept so the
    # bias term is not penalized
    I = np.eye(X_b.shape[1])
    I[0, 0] = 0

    # Solve (X^T X + lambda I) theta = X^T y directly; solving the linear
    # system is more accurate than forming the explicit inverse.
    theta = np.linalg.solve(X_b.T.dot(X_b) + lambda_ * I, X_b.T.dot(y))

    # Make predictions
    y_pred = X_b.dot(theta)

    # Sort the values by the feature so the fitted curve is drawn left to right
    sorted_indices = np.argsort(X[:, 0])
    X_sorted = X[sorted_indices]
    y_pred_sorted = y_pred[sorted_indices]

    # Plot the data and regression curve
    plt.figure(figsize=(10, 6))
    plt.scatter(X, y, color='blue', label='actual precip')
    plt.plot(X_sorted, y_pred_sorted, color='red', linewidth=2, label='predicted precip')
    plt.xlabel('Day of the Year')
    plt.ylabel('precipitaion')
    plt.title(f'Polynomial Regression (Degree {degree}) precipitaion')
    plt.legend()
    plt.show()

    # Scatter plot of predicted vs actual values; the dashed diagonal marks a
    # perfect prediction
    plt.figure(figsize=(10, 6))
    plt.scatter(y, y_pred, color='blue', alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
    plt.xlabel('Actual precip')
    plt.ylabel('Predicted precip')
    plt.title(f'Predicted vs Actual pecip (Degree {degree})')
    plt.show()

    # Calculate and print performance metrics (computed on the fitting data)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")

Taking a single feature into account won't work: the R2 score is very low and the error is high. We also have to see how precipitation behaves with lat and lon.

In [None]:
# Show the column labels currently present in df
df.columns

In [None]:
# Correlation of the date with precip.
# Grouping by 'datetime' and calling .corr() on the single column 'precip' only
# correlates precip with itself inside each group, so it says nothing about the
# date. Instead, correlate a numeric form of the date (day of the year) with
# precip directly; the off-diagonal entry is the Pearson correlation.
df_corr = pd.DataFrame({
    'day_of_year': df['datetime'].dt.dayofyear,
    'precip': df['precip'],
}).corr()
df_corr

In [None]:
# Correlation of lon with precip.
# A per-'lon' groupby followed by .corr() on 'precip' alone only correlates
# precip with itself, so compute the two-column Pearson correlation instead;
# the off-diagonal entry is the lon/precip correlation.
df_corr2 = df[['lon', 'precip']].corr()
df_corr2

In [None]:
# Correlation of lat with precip.
# A per-'lat' groupby followed by .corr() on 'precip' alone only correlates
# precip with itself, so compute the two-column Pearson correlation instead;
# the off-diagonal entry is the lat/precip correlation.
df_corr3 = df[['lat', 'precip']].corr()
df_corr3

The above three features are to be taken into account, as they are highly correlated with the precipitation.

In [None]:
# Break the datetime column into separate day, month and year columns
for part in ('day', 'month', 'year'):
    # The .dt accessor exposes each calendar component under the same name
    df[part] = getattr(df['datetime'].dt, part)
df

In [None]:
# Column order for the modelling table: time-derived columns first, then the
# coordinates, with the target last.
order = [
    'DAY_OF_YEAR',
    'day',
    'month',
    'year',
    'lon',
    'lat',
    'precip',
]
df_modified = df.loc[:, order]
df_modified

In [None]:
# Random 80:20 train/test partition with scikit-learn; the fixed seed makes the
# shuffle reproducible.
# NOTE(review): X and y are whatever earlier cells left bound (the single
# day-of-year feature at this point); the later cells visible here fit on
# X and y directly rather than on these splits - confirm this is intended.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Polynomial ridge regression on day of year, longitude and latitude.
# For each degree in `models`, theta is fitted on an 80:20 training split with
# the regularized normal equation, and train and test loss are printed.
features = ['DAY_OF_YEAR', 'lon', 'lat']
X = df[features].values

# 80:20 split of the row indices of the *current* feature matrix. Splitting
# indices (rather than X itself) lets the full-data matrices below be reused
# for both the plots and the train/test metrics.
train_idx, test_idx = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=42
)

# Small regularization constant added to the diagonal of X^T X
lambda_ = 1e-5

# Experiment with different polynomial degrees
for degree in models:
    print(f"\nPolynomial Degree: {degree}")

    # Create polynomial features. include_bias=False because the intercept
    # column is added explicitly below; the default constant column would be
    # turned into all zeros by StandardScaler and leave X^T X singular except
    # for the tiny lambda term.
    poly = PolynomialFeatures(degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Standardize using statistics of the training rows only, so no
    # information from the test rows leaks into the fit
    scaler = StandardScaler()
    scaler.fit(X_poly[train_idx])
    X_scaled = scaler.transform(X_poly)

    # Add a column of ones to X to account for the bias term (intercept)
    X_b = np.c_[np.ones((X_scaled.shape[0], 1)), X_scaled]
    X_b_train = X_b[train_idx]

    # Regularization matrix: identity with a zero for the intercept so the
    # bias term is not penalized
    I = np.eye(X_b.shape[1])
    I[0, 0] = 0

    # Solve (X^T X + lambda I) theta = X^T y on the training rows; solving the
    # linear system is more accurate than forming the explicit inverse.
    theta = np.linalg.solve(
        X_b_train.T.dot(X_b_train) + lambda_ * I,
        X_b_train.T.dot(y[train_idx]),
    )

    # Predictions for every row (used for the plots and sliced for the metrics)
    y_pred = X_b.dot(theta)

    # Sort by the first feature (day of the year) for plotting
    sorted_indices = np.argsort(X[:, 0])
    X_sorted = X[sorted_indices]
    y_pred_sorted = y_pred[sorted_indices]

    # Plot the data and the predictions against the first feature.
    # NOTE: predictions also depend on lon and lat, so the red line is not a
    # single smooth curve in this projection.
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], y, color='blue', label='actual precip')
    plt.plot(X_sorted[:, 0], y_pred_sorted, color='red', linewidth=2, label='predicted precip')
    plt.xlabel('Day of the Year')
    plt.ylabel('Precipitation')
    plt.title(f'Polynomial Regression (Degree {degree}) Precipitation')
    plt.legend()
    plt.show()

    # Scatter plot of predicted vs actual values; the dashed diagonal marks a
    # perfect prediction
    plt.figure(figsize=(10, 6))
    plt.scatter(y, y_pred, color='blue', alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
    plt.xlabel('Actual Precipitation')
    plt.ylabel('Predicted Precipitation')
    plt.title(f'Predicted vs Actual Precipitation (Degree {degree})')
    plt.show()

    # Train and test loss, as the exercise asks, plus R2 for both splits
    train_mse = mean_squared_error(y[train_idx], y_pred[train_idx])
    test_mse = mean_squared_error(y[test_idx], y_pred[test_idx])
    train_r2 = r2_score(y[train_idx], y_pred[train_idx])
    test_r2 = r2_score(y[test_idx], y_pred[test_idx])
    print(f"Train Mean Squared Error (MSE): {train_mse}")
    print(f"Test Mean Squared Error (MSE): {test_mse}")
    print(f"Train R-squared (R2): {train_r2}")
    print(f"Test R-squared (R2): {test_r2}")


In [None]:
# Same model with slightly different features: the date is split into day,
# month and year and used together with longitude and latitude.
# For each degree in `models`, theta is fitted on an 80:20 training split with
# the regularized normal equation, and train and test loss are printed.
features = ['day', 'month', 'year', 'lon', 'lat']
X = df[features].values

# 80:20 split of the row indices of the *current* feature matrix. Splitting
# indices (rather than X itself) lets the full-data matrices below be reused
# for both the plots and the train/test metrics.
train_idx, test_idx = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=42
)

# Small regularization constant added to the diagonal of X^T X
lambda_ = 1e-5

# Experiment with different polynomial degrees
for degree in models:
    print(f"\nPolynomial Degree: {degree}")

    # Create polynomial features. include_bias=False because the intercept
    # column is added explicitly below; the default constant column would be
    # turned into all zeros by StandardScaler and leave X^T X singular except
    # for the tiny lambda term.
    poly = PolynomialFeatures(degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Standardize using statistics of the training rows only, so no
    # information from the test rows leaks into the fit
    scaler = StandardScaler()
    scaler.fit(X_poly[train_idx])
    X_scaled = scaler.transform(X_poly)

    # Add a column of ones to X to account for the bias term (intercept)
    X_b = np.c_[np.ones((X_scaled.shape[0], 1)), X_scaled]
    X_b_train = X_b[train_idx]

    # Regularization matrix: identity with a zero for the intercept so the
    # bias term is not penalized
    I = np.eye(X_b.shape[1])
    I[0, 0] = 0

    # Solve (X^T X + lambda I) theta = X^T y on the training rows; solving the
    # linear system is more accurate than forming the explicit inverse.
    theta = np.linalg.solve(
        X_b_train.T.dot(X_b_train) + lambda_ * I,
        X_b_train.T.dot(y[train_idx]),
    )

    # Predictions for every row (used for the plots and sliced for the metrics)
    y_pred = X_b.dot(theta)

    # Sort by the first feature (day of the month) for plotting
    sorted_indices = np.argsort(X[:, 0])
    X_sorted = X[sorted_indices]
    y_pred_sorted = y_pred[sorted_indices]

    # Plot the data and the predictions against the first feature.
    # NOTE: predictions also depend on the other four features, so the red
    # line is not a single smooth curve in this projection.
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], y, color='blue', label='actual precip')
    plt.plot(X_sorted[:, 0], y_pred_sorted, color='red', linewidth=2, label='predicted precip')
    plt.xlabel('Day of the Month')  # X[:, 0] is the 'day' column here
    plt.ylabel('Precipitation')
    plt.title(f'Polynomial Regression (Degree {degree}) Precipitation')
    plt.legend()
    plt.show()

    # Scatter plot of predicted vs actual values; the dashed diagonal marks a
    # perfect prediction
    plt.figure(figsize=(10, 6))
    plt.scatter(y, y_pred, color='blue', alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
    plt.xlabel('Actual Precipitation')
    plt.ylabel('Predicted Precipitation')
    plt.title(f'Predicted vs Actual Precipitation (Degree {degree})')
    plt.show()

    # Train and test loss, as the exercise asks, plus R2 for both splits
    train_mse = mean_squared_error(y[train_idx], y_pred[train_idx])
    test_mse = mean_squared_error(y[test_idx], y_pred[test_idx])
    train_r2 = r2_score(y[train_idx], y_pred[train_idx])
    test_r2 = r2_score(y[test_idx], y_pred[test_idx])
    print(f"Train Mean Squared Error (MSE): {train_mse}")
    print(f"Test Mean Squared Error (MSE): {test_mse}")
    print(f"Train R-squared (R2): {train_r2}")
    print(f"Test R-squared (R2): {test_r2}")
