## LOAD

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline

rain_data = pd.read_csv('../assets/rain_data.csv')

# Handle missing values in 'RainToday' and 'RainTomorrow' before conversion
rain_data.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)

# Convert 'RainToday' and 'RainTomorrow' to binary (0 and 1)
rain_data['RainToday'] = rain_data['RainToday'].map({'No': 0, 'Yes': 1})
rain_data['RainTomorrow'] = rain_data['RainTomorrow'].map({'No': 0, 'Yes': 1})

# Dropping categorical columns to simplify the example
rain_data = rain_data.select_dtypes(exclude=['object'])


## IMPUTE

In [14]:
imputer = IterativeImputer(random_state=42)  # You can adjust parameters as needed
numerical_columns = rain_data.select_dtypes(include=['float64', 'int64']).columns
rain_data[numerical_columns] = imputer.fit_transform(rain_data[numerical_columns])

print(rain_data.isnull().sum())


MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64




## TRAIN/TEST

In [15]:

# Separate features and target
X = rain_data.drop('RainTomorrow', axis=1)
y = rain_data['RainTomorrow'].astype(int)  # Ensure y is integer type

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(50287, 17) (12572, 17) (50287,) (12572,)


## MODEL

In [16]:

# Create a pipeline with RandomForestClassifier
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Generate the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      6290
           1       0.85      0.85      0.85      6282

    accuracy                           0.85     12572
   macro avg       0.85      0.85      0.85     12572
weighted avg       0.85      0.85      0.85     12572

