# Your Name: Stephanie Buchanan

# import all packages 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, euclidean_distances
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.compose import ColumnTransformer

# Data Preprocessing

In [2]:
# Load the solar-zenith-angle table; the first CSV column becomes the row index.
solar_angle = pd.read_csv('sza.csv', index_col=0)
print(solar_angle.dtypes)
# Count the missing values in each column.
solar_angle.isna().sum()

latitude      int64
month        object
tst           int64
sza         float64
dtype: object


latitude      0
month         0
tst           0
sza         215
dtype: int64

In [3]:
# Drop every row that has at least one missing value.
# Reassign the result instead of using inplace=True, so the frame is rebuilt
# from an explicit expression rather than mutated behind the reader's back.
solar_angle = solar_angle.dropna(how='any')
# Confirm that no missing values remain in any column.
solar_angle.isnull().sum()

latitude    0
month       0
tst         0
sza         0
dtype: int64

In [4]:
# Features: every column except the target, with latitude one-hot encoded.
feature_frame = solar_angle.drop(columns='sza')
X_solar_angle = pd.get_dummies(feature_frame, columns=['latitude'])
# Target: the solar zenith angle column.
y_solar_angle = solar_angle['sza']

In [5]:
# Encode the month labels as integers 1..n in order of first appearance.
# pd.factorize derives the mapping from the labels actually present, so this
# does not raise when fewer than 12 distinct months remain after dropping rows,
# and it always yields an integer column (no reliance on replace() downcasting).
# NOTE(review): the numbering matches calendar order only if the CSV lists the
# months chronologically — confirm against the data.
month_codes, _month_labels = pd.factorize(X_solar_angle['month'])
X_solar_angle['month'] = month_codes + 1

# Data Splitting 

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_sol_train, X_sol_test, y_sol_train, y_sol_test = train_test_split(
    X_solar_angle,
    y_solar_angle,
    test_size=0.25,
    random_state=42,
)
# Report the shape of each resulting split.
for split_label, split_data in (
    ('X_train', X_sol_train),
    ('X_test', X_sol_test),
    ('y_train', y_sol_train),
    ('y_test', y_sol_test),
):
    print(f'{split_label} shape:{split_data.shape}')

X_train shape:(450, 6)
X_test shape:(151, 6)
y_train shape:(450,)
y_test shape:(151,)


# Model Building and Evaluation 

In [7]:
# Baseline model: min-max scale the features, then a default-parameter KNN regressor.
chain_sol = [
    ('mm_scale', MinMaxScaler()),
    ('knn_reg', KNeighborsRegressor()),
]

knn_pipe_sza = Pipeline(steps=chain_sol)
knn_pipe_sza.fit(X_sol_train, y_sol_train)  # fit() returns the same pipeline object
y_sol_pred = knn_pipe_sza.predict(X_sol_test)
baseline_test_mse = mean_squared_error(y_sol_test, y_sol_pred)
print(f'Solar Angle Predicction with KNN, Test set MSE: {baseline_test_mse:.3f}')

Solar Angle Predicction with KNN, Test set MSE: 11.877


In [9]:
# Hyperparameter search space for the KNN step of the pipeline.
# 'minkowski' is intentionally not listed: with KNeighborsRegressor's default
# p=2 it is the same distance as 'euclidean', so including both only repeated
# a third of the cross-validation fits without adding any new candidate model.
sol_param_grid = {'knn_reg__n_neighbors': range(1, 51),
                  'knn_reg__weights': ['uniform', 'distance'],
                  'knn_reg__metric': ['euclidean', 'manhattan']}

# Cross-validated search fitted on the training set only. No `scoring` is
# given, so candidates are ranked by the estimator's default score (R^2 for
# regressors) rather than by the MSE reported afterwards.
sol_grid_search = GridSearchCV(knn_pipe_sza, sol_param_grid)
sol_grid_search.fit(X_sol_train, y_sol_train)

# best_estimator_ is the winning pipeline, refit on the whole training set.
best_knn_sza = sol_grid_search.best_estimator_
y_best_test = best_knn_sza.predict(X_sol_test)
y_best_train = best_knn_sza.predict(X_sol_train)

In [10]:
# MSE of the tuned pipeline on the data it was fit on and on the held-out data.
best_train_mse = mean_squared_error(y_sol_train, y_best_train)
best_test_mse = mean_squared_error(y_sol_test, y_best_test)
print(f'Solar Angle Prediction with best KNN, Train set MSE: {best_train_mse:.3f}')
print(f'Solar Angle Prediction with best KNN, Test set MSE: {best_test_mse:.3f}')

Solar Angle Prediction with best KNN, Train set MSE: 0.000
Solar Angle Prediction with best KNN, Test set MSE: 9.485


# Conclusion 

Is KNN a good model for predicting the solar zenith angle from the latitude, month, and true solar time (tst)?  The data was sourced from https://vincentarelbundock.github.io/Rdatasets/articles/data.html and contains 3 input variables and 1 outcome variable.  The outcome variable is 'sza'; all other columns are inputs.

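A KNN model was fit and evaluated after splitting the data into train and test sets, and its hyperparameters were then tuned with a cross-validated grid search. The tuned model's MSE on the train set is 0.000, far lower than the 9.485 on the test set, which suggests overfitting. Note, however, that a train MSE of zero is expected whenever the selected KNN uses a single neighbor or distance weighting, because each training point is then its own zero-distance neighbor; if the grid search chose such a setting, the train/test gap overstates the overfitting, and the cross-validated score is a fairer basis for comparison. Therefore, some more optimizing of the model would have to be done, and there is not strong evidence that KNN is a good model for predicting the solar zenith angle from the latitude, month, and true solar time (tst).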