In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
from load_and_preprocess import load_model_components, preprocess_data

In [2]:
# Load model components
# `model_components` is indexed by key further down in this notebook; the keys
# read here are 'model', 'imputer', 'scaler', 'encoder', 'input_cols',
# 'target_col', 'numeric_cols', 'categorical_cols' and 'encoded_cols'.
model_components = load_model_components('models/aussie_rain.joblib')
# NOTE(review): leftover commented-out call; presumably it was used once to write
# the artifact with zlib compression -- confirm and delete if no longer needed.
#joblib.dump(model, 'models/aussie_rain.joblib', compress=('zlib', 3))

In [3]:
# Read the raw weather observations and keep only the rows where both rain
# labels are known; 'RainToday' is used as a feature and 'RainTomorrow' rows
# without a value cannot be used for training or evaluation.
raw_df = (
    pd.read_csv('data/weatherAUS.csv')
    .dropna(subset=['RainToday', 'RainTomorrow'])
)

In [4]:
# Split features / target into train and test partitions.
# Column names come from the loaded components so the split uses the same
# inputs and target that the stored components declare.
X, y = raw_df[model_components['input_cols']], raw_df[model_components['target_col']]

# 80/20 split, stratified on the target so both partitions keep the same
# class balance; the fixed seed makes the split repeatable.
train_inputs, test_inputs, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Share of missing values per training feature, rendered as a percentage string
# with two decimals (null count divided by the number of training rows).
train_inputs.isna().sum().div(train_inputs.shape[0]).map('{:.2%}'.format)

Location          0.00%
MinTemp           0.32%
MaxTemp           0.20%
Rainfall          0.00%
Evaporation      42.45%
Sunshine         47.55%
WindGustDir       6.51%
WindGustSpeed     6.47%
WindDir9am        6.85%
WindDir3pm        2.60%
WindSpeed9am      0.74%
WindSpeed3pm      1.79%
Humidity9am       1.07%
Humidity3pm       2.50%
Pressure9am       9.80%
Pressure3pm       9.83%
Cloud9am         37.38%
Cloud3pm         39.84%
Temp9am           0.45%
Temp3pm           1.87%
RainToday         0.00%
dtype: object

In [6]:
# Preprocess raw data with trained imputer, scaler, encoder
# preprocess_data is imported from load_and_preprocess and returns a pair per
# split; X_train / X_test are what the model is later fitted and evaluated on.
# (presumably the second element is the processed copy of the input frame --
# TODO confirm against load_and_preprocess.)
# NOTE: train_inputs / test_inputs are rebound to the returned frames, so
# running this cell a second time feeds the previous results back into
# preprocess_data; re-run from the split cell instead.
X_train, train_inputs = preprocess_data(train_inputs, model_components)
X_test, test_inputs = preprocess_data(test_inputs, model_components)

  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = components['encoder'].transform(data[components['categorical_cols']])
  data[components['encoded_cols']] = 

In [7]:
# Build and fit the random forest in one step (fit returns the estimator itself).
# The fixed seed keeps the fitted forest repeatable; n_jobs=-1 uses all cores.
model = RandomForestClassifier(
    n_estimators=40,
    max_leaf_nodes=30,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score the training partition to compare against the test-set report
pred_train = model.predict(X_train)
print(classification_report(y_true=y_train, y_pred=pred_train, digits=4))

              precision    recall  f1-score   support

          No     0.8333    0.9811    0.9012     87668
         Yes     0.8242    0.3109    0.4515     24961

    accuracy                         0.8326    112629
   macro avg     0.8288    0.6460    0.6763    112629
weighted avg     0.8313    0.8326    0.8015    112629



In [8]:
# Score the held-out partition; compare with the training report to judge
# how well the forest generalises.
pred_test = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred_test, digits=4))

              precision    recall  f1-score   support

          No     0.8344    0.9798    0.9013     21918
         Yes     0.8171    0.3171    0.4569      6240

    accuracy                         0.8329     28158
   macro avg     0.8258    0.6485    0.6791     28158
weighted avg     0.8306    0.8329    0.8028     28158



In [9]:
# Save model with new trained estimator
# The preprocessing objects and column metadata are reused unchanged from the
# loaded components. The estimator must be the random forest fitted in this
# notebook (`model`); storing model_components['model'] would write the
# previously saved estimator back out under the new file name.
rf_model = {
    'model': model,
    'imputer': model_components['imputer'],
    'scaler': model_components['scaler'],
    'encoder': model_components['encoder'],
    'input_cols': model_components['input_cols'],
    'target_col': model_components['target_col'],
    'numeric_cols': model_components['numeric_cols'],
    'categorical_cols': model_components['categorical_cols'],
    'encoded_cols': model_components['encoded_cols']
}
joblib.dump(rf_model, "models/rf_model.joblib")

['models/rf_model.joblib']

In [10]:
# Summary statistics of the numeric columns in raw_df. At this point raw_df
# only holds rows with both 'RainToday' and 'RainTomorrow' present (see the
# dropna in the load cell), so the statistics reflect that filtered set.
raw_df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,140319.0,140480.0,140787.0,81093.0,73982.0,131682.0,139732.0,138256.0,139270.0,137286.0,127044.0,127018.0,88162.0,84693.0,140131.0,138163.0
mean,12.184824,23.23512,2.349974,5.472516,7.63054,39.97052,13.990496,18.631141,68.826833,51.449288,1017.654577,1015.257963,4.431161,4.49925,16.987066,21.693183
std,6.403879,7.1145,8.465173,4.189132,3.781729,13.578201,8.88621,8.798096,19.06365,20.80731,7.104867,7.035411,2.886594,2.719752,6.496012,6.937784
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1013.0,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.5,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.8,28.3,0.8,7.4,10.7,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7
