### Future Rain Prediction

Using the trained and tuned model, we now define a function to predict the probability of rain for the next 21 days. The prediction process simulates day-by-day forecasting while updating rolling and lagged features.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Load and preprocess data
df = pd.read_csv('weather_data.csv')
df['date'] = pd.to_datetime(df['date'])

# Convert rain_or_not to binary
df['rain_or_not'] = df['rain_or_not'].map({'Rain': 1, 'No Rain': 0})

# Create temporal features
df['month'] = df['date'].dt.month
df['day_of_year'] = df['date'].dt.dayofyear
df['day_of_week'] = df['date'].dt.dayofweek

# Lagged variables
lagged_features = ['avg_temperature', 'avg_wind_speed', 'humidity', 'cloud_cover', 'pressure', 'rain_or_not']
for feature in lagged_features:
    df[f'{feature}_lag1'] = df[feature].shift(1).fillna(df[feature].iloc[0])

# Rolling variables
df['avg_temperature_roll3'] = df['avg_temperature'].rolling(window=3).mean().fillna(df['avg_temperature'])
df['avg_humidity_roll7'] = df['humidity'].rolling(window=7).std().fillna(df['humidity'])
df['avg_wind_speed_roll3'] = df['avg_wind_speed'].rolling(window=3).mean().fillna(df['avg_wind_speed'])
df['cloud_cover_roll7'] = df['cloud_cover'].rolling(window=7).mean().fillna(df['cloud_cover'])
df['pressure_roll3'] = df['pressure'].rolling(window=3).mean().fillna(df['pressure'])
df['rain_or_not_roll3'] = df['rain_or_not'].rolling(window=3).mean().fillna(df['rain_or_not'])

# Interaction variables
df['temp_wind_interaction'] = df['avg_temperature'] * df['avg_wind_speed']
df['pressure_humidity_interaction'] = df['pressure'] * df['humidity']

# Clean final dataset
df = df.dropna().reset_index(drop=True)

# Prepare features and target
features = [
    'month', 'day_of_year', 'day_of_week',
    'avg_temperature_lag1', 'avg_wind_speed_lag1', 'humidity_lag1',
    'cloud_cover_lag1', 'pressure_lag1', 'rain_or_not_lag1',
    'avg_temperature_roll3', 'avg_humidity_roll7', 'avg_wind_speed_roll3',
    'cloud_cover_roll7', 'pressure_roll3', 'rain_or_not_roll3',
    'temp_wind_interaction', 'pressure_humidity_interaction'
]

# Load model
model = joblib.load('rain_prediction_model.pkl')

# Prediction function
def predict_future(last_known_data, days=21):
    predictions = []
    current_data = last_known_data.copy()
    current_date = pd.to_datetime(current_data['date'])
    
    # Mapping between rolling features and their base features
    roll_feature_map = {
        'avg_temperature_roll3': 'avg_temperature',
        'avg_humidity_roll7': 'humidity',
        'avg_wind_speed_roll3': 'avg_wind_speed',
        'cloud_cover_roll7': 'cloud_cover',
        'pressure_roll3': 'pressure',
        'rain_or_not_roll3': 'rain_or_not'
    }
    
    # Initialize rolling features with current values
    for feat in roll_feature_map:
        base_feature = roll_feature_map[feat]
        current_data[feat] = current_data[base_feature]
    
    for _ in range(days):
        # Prepare features
        features = {
            'month': current_date.month,
            'day_of_year': current_date.dayofyear,
            'day_of_week': current_date.dayofweek,
            'avg_temperature_lag1': current_data['avg_temperature'],
            'avg_wind_speed_lag1': current_data['avg_wind_speed'],
            'humidity_lag1': current_data['humidity'],
            'cloud_cover_lag1': current_data['cloud_cover'],
            'pressure_lag1': current_data['pressure'],
            'rain_or_not_lag1': current_data['rain_or_not'],
            'avg_temperature_roll3': current_data['avg_temperature_roll3'],
            'avg_humidity_roll7': current_data['avg_humidity_roll7'],
            'avg_wind_speed_roll3': current_data['avg_wind_speed_roll3'],
            'cloud_cover_roll7': current_data['cloud_cover_roll7'],
            'pressure_roll3': current_data['pressure_roll3'],
            'rain_or_not_roll3': current_data['rain_or_not_roll3'],
            'temp_wind_interaction': current_data['avg_temperature'] * current_data['avg_wind_speed'],
            'pressure_humidity_interaction': current_data['pressure'] * current_data['humidity']
        }
        
        # Predict probability
        proba = model.predict_proba(pd.DataFrame([features]))[0][1]
        predictions.append(proba)
        
        # Update state for next day
        current_date += pd.DateOffset(days=1)
        rain_status = 1 if proba > 0.5 else 0
        
        # Update rolling features using FIFO approach
        current_data['avg_temperature_roll3'] = np.mean([
            current_data['avg_temperature_roll3'],
            current_data['avg_temperature']
        ])
        current_data['avg_humidity_roll7'] = np.std([
            current_data['avg_humidity_roll7'],
            current_data['humidity']
        ])
        current_data['avg_wind_speed_roll3'] = np.mean([
            current_data['avg_wind_speed_roll3'],
            current_data['avg_wind_speed']
        ])
        current_data['cloud_cover_roll7'] = np.mean([
            current_data['cloud_cover_roll7'],
            current_data['cloud_cover']
        ])
        current_data['pressure_roll3'] = np.mean([
            current_data['pressure_roll3'],
            current_data['pressure']
        ])
        current_data['rain_or_not_roll3'] = np.mean([
            current_data['rain_or_not_roll3'],
            rain_status
        ])
        
        # Update lagged features with current values
        current_data['avg_temperature'] = current_data['avg_temperature']
        current_data['avg_wind_speed'] = current_data['avg_wind_speed']
        current_data['humidity'] = current_data['humidity']
        current_data['cloud_cover'] = current_data['cloud_cover']
        current_data['pressure'] = current_data['pressure']
        current_data['rain_or_not'] = rain_status
    
    return predictions

# Get last known data
last_known = df.iloc[-1].to_dict()
last_known['date'] = df['date'].iloc[-1]

# Generate predictions
future_probas = predict_future(last_known, days=21)

# Create result dataframe
future_dates = pd.date_range(start=last_known['date'] + pd.DateOffset(days=1), periods=21)
result = pd.DataFrame({
    'Date': future_dates,
    'Rain_Probability': [f"{p:.1%}" for p in future_probas]
})

print(result)

         Date Rain_Probability
0  2023-11-08             2.0%
1  2023-11-09             2.4%
2  2023-11-10             2.0%
3  2023-11-11             2.0%
4  2023-11-12             2.0%
5  2023-11-13             2.0%
6  2023-11-14             1.9%
7  2023-11-15             2.0%
8  2023-11-16             2.0%
9  2023-11-17             2.0%
10 2023-11-18             2.0%
11 2023-11-19             2.0%
12 2023-11-20             2.0%
13 2023-11-21             1.9%
14 2023-11-22             2.0%
15 2023-11-23             2.0%
16 2023-11-24             2.0%
17 2023-11-25             2.0%
18 2023-11-26             2.0%
19 2023-11-27             2.0%
20 2023-11-28             1.9%
