In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

from sklearn.ensemble import RandomForestClassifier
import xgboost
import pickle

In [32]:
data = pd.read_csv("rainfall_prediction.csv")
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Delhi,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Delhi,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Delhi,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Delhi,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Delhi,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [33]:
data.drop(['Evaporation','Sunshine','Cloud9am','Cloud3pm'], axis=1, inplace=True)

In [34]:
num_cols = data.select_dtypes(include=np.number).columns

for col in num_cols:
    data[col].fillna(data[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [35]:
cat_cols = data.select_dtypes(include='object').columns

imp = SimpleImputer(strategy='most_frequent')
data[cat_cols] = imp.fit_transform(data[cat_cols])

In [36]:
data['RainTomorrow'] = data['RainTomorrow'].map({'No':0, 'Yes':1})
data['RainToday'] = data['RainToday'].map({'No':0, 'Yes':1})

In [37]:
le = LabelEncoder()

for col in cat_cols:
    data[col] = le.fit_transform(data[col])

In [38]:
#X = data.drop('RainTomorrow', axis=1)
#y = data['RainTomorrow']
# Select only 5 features for model
selected_features = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'Humidity9am',
    'Humidity3pm'
]

X = data[selected_features]
y = data['RainTomorrow']

In [39]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [41]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [42]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8321187955451671
[[21579  1201]
 [ 3683  2629]]
              precision    recall  f1-score   support

           0       0.85      0.95      0.90     22780
           1       0.69      0.42      0.52      6312

    accuracy                           0.83     29092
   macro avg       0.77      0.68      0.71     29092
weighted avg       0.82      0.83      0.82     29092



In [43]:
pickle.dump(model, open("rainfall.pkl", "wb"))
pickle.dump(scaler, open("scale.pkl", "wb"))
pickle.dump(le, open("encoder.pkl", "wb"))
pickle.dump(imp, open("imputer.pkl", "wb"))

In [44]:
import pickle

pickle.dump(model, open("rainfall.pkl", "wb"))
pickle.dump(scaler, open("scale.pkl", "wb"))

In [45]:
import os
os.getcwd()

'C:\\Users\\salma'

In [46]:
pickle.dump(model, open(r"C:\Users\salma\OneDrive\Desktop\rainfall\rainfall.pkl", "wb"))

pickle.dump(scaler, open(r"C:\Users\salma\OneDrive\Desktop\rainfall\scale.pkl", "wb"))

In [47]:
print("Number of features used for training:", X.shape[1])

Number of features used for training: 5
