# Model Training Using XGBoost

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import pickle

In [2]:
df = pd.read_csv("cleaned_data.csv")  

# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Extract date features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.weekday
df['DayOfYear'] = df['Date'].dt.dayofyear

## Dynamic Seasons
### As cities have different durations and starts for seasons

In [3]:
# Define seasons for each city
city_season_mapping = {
    "Mumbai": {
        "Summer": [3, 4, 5],       # March to May
        "Monsoon": [6, 7, 8],      # June to August
        "Post-Monsoon": [9, 10, 11], # September to November
        "Winter": [12, 1, 2]       # December to February
    },
    "Delhi": {
        "Summer": [5, 6],          # May to June
        "Monsoon": [7, 8],         # July to August
        "Post-Monsoon": [9, 10, 11], # September to November
        "Winter": [12, 1, 2, 3]    # December to March
    },
    "Chennai": {
        "Summer": [3, 4, 5],       # March to May
        "Monsoon": [6, 7],         # June to July
        "Post-Monsoon": [8, 9],    # August to September
        "Winter": [10, 11, 12]     # October to December
    },
    "Kolkata": {
        "Summer": [3, 4, 5],       # March to May
        "Monsoon": [6, 7, 8],      # June to August
        "Post-Monsoon": [9, 10],   # September to October
        "Winter": [11, 12, 1, 2]   # November to February
    },
    "Bengaluru": {
        "Summer": [3, 4],          # March to April
        "Monsoon": [5, 6, 7],      # May to July
        "Post-Monsoon": [8, 9],    # August to September
        "Winter": [10, 11, 12]     # October to December
    },
    "Hyderabad": {
        "Summer": [3, 4, 5],       # March to May
        "Monsoon": [6, 7],         # June to July
        "Post-Monsoon": [8],       # August 
        "Winter": [9, 10, 11, 12]   # September to December
    },
    "Ahmedabad": {
        "Summer": [4, 5, 6],       # April to June
        "Monsoon": [7, 8],         # July to August
        "Post-Monsoon": [9],       # September 
        "Winter": [10, 11, 12]     # October to December
    },
    "Pune": {
        "Summer": [3, 4],          # March to April
        "Monsoon": [5, 6, 7],      # May to July
        "Post-Monsoon": [8],       # August 
        "Winter": [9, 10]          # September and October 
    }
}

# Reverse mapping for season names to months
season_name_mapping = {
    "Mumbai": {
        "Summer": 3,
        "Monsoon": 6,
        "Post-Monsoon": 9,
        "Winter": 12
    },
    "Delhi": {
        "Summer": 5,
        "Monsoon": 7,
        "Post-Monsoon": 9,
        "Winter": 12
    },
     "Chennai": {
        "Summer": 3,
        "Monsoon": 6,
        "Post-Monsoon": 8,
        "Winter": 10
    },
    "Kolkata": {
        "Summer": 3,
        "Monsoon": 6,
        "Post-Monsoon": 9,
        "Winter": 11
    },
    "Bengaluru": {
        "Summer": 3,
        "Monsoon": 5,
        "Post-Monsoon": 8,
        "Winter": 10
    },
    "Hyderabad": {
        "Summer": 3,
        "Monsoon": 6,
        "Post-Monsoon": 8,
        "Winter": 9
    },
    "Ahmedabad": {
        "Summer": 4,
        "Monsoon": 7,
        "Post-Monsoon": 9,
        "Winter": 10
    },
    "Pune": {
        "Summer": 3,
        "Monsoon": 5,
        "Post-Monsoon": 8,
        "Winter": 9
    }
}

In [4]:
def get_city_season(month, city):
    """Get season based on city-specific mapping."""
    if city not in city_season_mapping:
        raise ValueError(f"City '{city}' not found in season mapping.")
    
    for season, months in city_season_mapping[city].items():
        if month in months:
            return season
    return None

# Function to get the month from season name
def get_month_from_season(season, city):
    """Get starting month based on city and season name."""
    if city not in season_name_mapping:
        raise ValueError(f"City '{city}' not found in season name mapping.")
    if season not in season_name_mapping[city]:
        raise ValueError(f"Season '{season}' not found for city '{city}'.")
    return season_name_mapping[city][season]

## Introduction of Cyclic Features for better predictions

In [5]:
# Apply dynamic season assignment based on city and month in your DataFrame.
df['Season'] = df.apply(lambda row: get_city_season(row['Month'], row['city']), axis=1)

# Encode City names using Label Encoding.
le = LabelEncoder()
df['City_Encoded'] = le.fit_transform(df['city'])

# Encode Season
season_encoder = LabelEncoder()
df['Season_Encoded'] = season_encoder.fit_transform(df['Season'])

# Cyclical features for Month and DayOfYear.
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
df['DayOfYear_sin'] = np.sin(2 * np.pi * df['DayOfYear'] / 365)
df['DayOfYear_cos'] = np.cos(2 * np.pi * df['DayOfYear'] / 365)
df['Year_sin'] = np.sin(2 * np.pi * df['Year'] / 365)
df['Year_cos'] = np.cos(2 * np.pi * df['Year'] / 365)

## A Simple XGBoost Model

In [6]:
# Select features and target variables.
features = ['Year', 'Month', 'Day', 'Weekday', 'Season_Encoded', 'City_Encoded',
            'Month_sin', 'Month_cos', 'DayOfYear_sin', 'DayOfYear_cos','Year_sin', 'Year_cos']
targets = ['Temp Min', 'Temp Max', 'Rain']

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(df[features], df[targets], test_size=0.2,
                                                    random_state=42)

# Train a MultiOutputRegressor with XGBoost for multivariate regression.
model = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=2000,
                                               learning_rate=0.1,
                                               random_state=42))
model.fit(X_train, y_train)

## Saving model to Pickle format

In [8]:
# Save the model as a pickle file after training.
with open("multivariate_model_seasons_final.pkl", "wb") as f:
    pickle.dump(model, f)

## Model Performance

In [9]:
# Evaluate models on test data.
y_pred = model.predict(X_test)
for i, target in enumerate(targets):
    mae = mean_absolute_error(y_test[target], y_pred[:, i])
    print(f"{target} MAE: {mae:.2f}")

Temp Min MAE: 0.88
Temp Max MAE: 1.04
Rain MAE: 3.23


## We want the users to only input a few things and for that:

### Function to take only date and city as input

In [10]:
def predict_weather_by_date(date_str: str, city: str):
    """Predict weather based on a specific date and city."""
    
    dt = pd.to_datetime(date_str)
    
    year = dt.year
    month = dt.month
    day = dt.day
    weekday = dt.weekday()
    dayofyear = dt.dayofyear

    # Get season and encode it
    season = get_city_season(month, city)
    season_encoded = season_encoder.transform([season])[0]

    # Encode city
    city_encoded = le.transform([city])[0]

    # Create input data
    input_data = pd.DataFrame({
        'Year': [year],
        'Month': [month],
        'Day': [day],
        'Weekday': [weekday],
        'Season_Encoded': [season_encoded],
        'City_Encoded': [city_encoded],
        'Month_sin': [np.sin(2 * np.pi * month / 12)],
        'Month_cos': [np.cos(2 * np.pi * month / 12)],
        'DayOfYear_sin': [np.sin(2 * np.pi * dayofyear / 365)],
        'DayOfYear_cos': [np.cos(2 * np.pi * dayofyear / 365)],
        'Year_sin': [np.sin(2 * np.pi * year / 365)],
        'Year_cos': [np.cos(2 * np.pi * year / 365)]
    })

    # Ensure feature order matches training data
    input_data = input_data[features]

    # Predict weather using the trained model
    predictions = model.predict(input_data)[0]

    return {
        "Temp Min": predictions[0],
        "Temp Max": predictions[1],
        "Rain": predictions[2]
    }

### Function to take only season, year, and city as input

In [11]:
def predict_weather_by_season(year: int, season: str, city: str):
    """Predict weather based on season name and city."""

    # Get the starting month for the specified season and city.
    try:
        month = get_month_from_season(season, city)
    except ValueError as e:
        return {"Error": str(e)}

    # Encode the season
    season_encoded = season_encoder.transform([season])[0]

    input_data = pd.DataFrame({
        'Year': [year],
        'Month': [month],
        'Day': [1],          # Dummy value; not used in prediction.
        'Weekday': [0],      # Dummy value; not used in prediction.
        'Season_Encoded': [season_encoded],
        'City_Encoded': [le.transform([city])[0]],
        'Month_sin': [0],
        'Month_cos': [0],
        'DayOfYear_sin': [0],
        'DayOfYear_cos': [0],
        'Year_sin': [0],
        'Year_cos': [0]
    })

    input_data = input_data[features]

    predictions = model.predict(input_data)[0]

    return {
        "Temp Min": predictions[0],
        "Temp Max": predictions[1],
        "Rain": predictions[2]
    }

## Example Usage

In [12]:
# Example usage for date-based prediction:
date_input_mumbai = "2025-01-27"
city_input_mumbai = "Mumbai"
prediction_date_based_mumbai = predict_weather_by_date(date_input_mumbai, city_input_mumbai)
print("Predicted Weather (Mumbai by date):", prediction_date_based_mumbai)

date_input_delhi = "2025-07-15"
city_input_delhi = "Delhi"
prediction_date_based_delhi = predict_weather_by_date(date_input_delhi, city_input_delhi)
print("Predicted Weather (Delhi by date):", prediction_date_based_delhi)

Predicted Weather (Mumbai by date): {'Temp Min': 16.41373, 'Temp Max': 29.53263, 'Rain': 17.047567}
Predicted Weather (Delhi by date): {'Temp Min': 27.880053, 'Temp Max': 36.79865, 'Rain': 1.6061037}


In [13]:
# Example usage for seasonal prediction:
year_input_mumbai = 2025
season_input_mumbai = "Summer"
city_input_mumbai = "Mumbai"
prediction_season_based_mumbai = predict_weather_by_season(year_input_mumbai, season_input_mumbai, city_input_mumbai)
print("Predicted Weather (Mumbai by dynamic season):", prediction_season_based_mumbai)

year_input_delhi = 2026
season_input_delhi = "Winter"
city_input_delhi = "Delhi"
prediction_season_based_delhi = predict_weather_by_season(year_input_delhi, season_input_delhi, city_input_delhi)
print("Predicted Weather (Delhi by dynamic season):", prediction_season_based_delhi)

Predicted Weather (Mumbai by dynamic season): {'Temp Min': 22.908241, 'Temp Max': 32.76773, 'Rain': 1.5773422}
Predicted Weather (Delhi by dynamic season): {'Temp Min': 21.099031, 'Temp Max': 36.10203, 'Rain': -4.287574}
