In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv("weatherAUS.csv")
df = df[['Location','MinTemp','MaxTemp','Humidity3pm','WindDir3pm','RainTomorrow']]
df = df[df['RainTomorrow'].notna()]
df['RainTomorrow'] = df['RainTomorrow'].map({'Yes':1, 'No':0})

X = df.drop("RainTomorrow", axis=1)
y = df["RainTomorrow"]

num_cols = ['MinTemp','MaxTemp','Humidity3pm']
cat_cols = ['Location','WindDir3pm']

num_imputer = SimpleImputer(strategy='mean')
X[num_cols] = num_imputer.fit_transform(X[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

encoder_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoder_dict[col] = le

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

joblib.dump(model, "Rainfall.pkl")
joblib.dump(scaler, "scale.pkl")
joblib.dump(num_imputer, "imputer.pkl")
joblib.dump(encoder_dict, "encoder.pkl")

print(" Model & preprocessing files saved successfully!")


Accuracy: 0.827947536833222
 Model & preprocessing files saved successfully!
