In [4]:
import pandas as pd
import hvplot.pandas
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the historical weather observations from disk
data = pd.read_csv("C:/Users/nigan/final-project/weather_features.csv")

# Columns used as model inputs
# NOTE(review): 'temp' appears in this list and is also used as the target below —
# confirm that is intended before feeding X/Y to a supervised model.
features = [
    'temp', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
    'wind_deg', 'rain_1h', 'rain_3h', 'snow_3h', 'clouds_all', 'weather_id',
]

# Raw (uncleaned) feature matrix and target
X, Y = data[features], data['temp']

# Keep only the rows where every feature column and the target are present
data_cleaned = data.dropna(subset=[*features, 'temp'])

# Feature matrix and target taken from the cleaned rows
X_cleaned, Y_cleaned = data_cleaned[features], data_cleaned['temp']

# Rescale the cleaned features with MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_cleaned)

# Fraction of the total variance the retained components must explain
variance_threshold = 0.95

# Passing a float as n_components lets PCA pick how many components to keep,
# so the width of X_pca is decided at fit time rather than fixed here.
pca = PCA(n_components=variance_threshold)
X_pca = pca.fit_transform(X_scaled)

# Display the share of variance captured by each retained component
pca.explained_variance_ratio_

array([0.3781155 , 0.30053558, 0.18407787, 0.06709317, 0.0598246 ])

In [6]:
# Create the PCA DataFrame.
# The column names are generated from the actual width of X_pca instead of a
# hard-coded list: PCA is fitted with a variance threshold, so the number of
# components can change with the data, and a fixed list of names would then
# raise a shape-mismatch error.
pca_columns = [f"PCA{n}" for n in range(1, X_pca.shape[1] + 1)]
X_pca_df = pd.DataFrame(X_pca, columns=pca_columns)

# Create a list to store inertia values and the candidate values of k
inertia = []
k = list(range(1, 11))

# For each k, fit a KMeans model and record its `inertia_` attribute
for i in k:
    k_model = KMeans(n_clusters=i, random_state=42)
    k_model.fit(X_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Plot the elbow curve (kept as the last expression so the cell renders it)
df_elbow.hvplot.line(
     x="k", 
     y="inertia", 
     title="Elbow Curve", 
     xticks=k)

In [None]:
# Define the K-Means model with 5 clusters (random_state fixed at 42)
model = KMeans(n_clusters=5, random_state=42)

# Fit the model on the PCA-transformed data
model.fit(X_pca_df)

# Predict a cluster label for every row of the PCA DataFrame
k_2 = model.predict(X_pca_df)

# Create a copy of the PCA DataFrame
pca_predictions = X_pca_df.copy()