In [3]:
# Weather Forecast and Rainfall Prediction
# Jupyter Notebook

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('weatherAUS.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Step 2: Data Cleaning
data.fillna(method='ffill', inplace=True)
data.drop(columns=['Location','RISK_MM'], inplace=True)

# Step 3: Feature Engineering
for i in range(1,4):
    data[f'Temp_lag{i}'] = data['MinTemp'].shift(i)
    data[f'Humidity_lag{i}'] = data['Humidity9am'].shift(i)
    data[f'Rain_lag{i}'] = data['Rainfall'].shift(i)

data.dropna(inplace=True)

# Step 4: Target Variable
data['RainTomorrow'] = data['RainTomorrow'].map({'Yes':1, 'No':0})

# Step 5: Features & Target
features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
            'Temp_lag1','Temp_lag2','Temp_lag3','Humidity_lag1','Humidity_lag2','Humidity_lag3','Rain_lag1','Rain_lag2','Rain_lag3']
X = data[features]
y = data['RainTomorrow']

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Step 7: Model Training
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# KNN Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# K-Means Clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data['Cluster'] = kmeans.fit_predict(X)

# Step 8: Model Evaluation

def evaluate_model(y_true, y_pred, model_name):
    print(f'--- {model_name} ---')
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Classification Report:\n', classification_report(y_true, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_true, y_pred))

# Logistic Regression Evaluation
y_pred_lr = lr_model.predict(X_test)
evaluate_model(y_test, y_pred_lr, 'Logistic Regression')

# KNN Evaluation
y_pred_knn = knn_model.predict(X_test)
evaluate_model(y_test, y_pred_knn, 'KNN')

# Step 9: 7-Day Forecast
last_7_days = X_test.tail(7)
predictions = lr_model.predict(last_7_days)
print('7-Day Rainfall Forecast (1=Rain, 0=No Rain):', predictions)

# Step 10: Visualizations
# Logistic Regression Confusion Matrix
cm_lr = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues')
plt.title('Logistic Regression Confusion Matrix')
plt.show()

# KNN Confusion Matrix
cm_knn = confusion_matrix(y_test, y_pred_knn)
sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Blues')
plt.title('KNN Confusion Matrix')
plt.show()


  data.fillna(method='ffill', inplace=True)


KeyError: "['RISK_MM'] not found in axis"