In [119]:
import numpy as np
import pandas as pd
# Load the dataset
df = pd.read_csv('mumbai_weather_2024_2025.csv')  # Replace with your file path

# Basic inspection
print("Dataset Info:")
print(df.info())
print("\nFirst 5 Rows:")
print(df.head())
print("\nMissing Values:")
print(df.isnull().sum())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11184 entries, 0 to 11183
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              11184 non-null  object 
 1   temperature       11142 non-null  float64
 2   humidity          11142 non-null  float64
 3   precipitation     11142 non-null  float64
 4   rain              11142 non-null  float64
 5   snowfall          11142 non-null  float64
 6   cloud_coverage    11142 non-null  float64
 7   wind_speed        11142 non-null  float64
 8   wind_direction    11142 non-null  float64
 9   pressure          11142 non-null  float64
 10  surface_pressure  11142 non-null  float64
 11  is_day            11184 non-null  int64  
 12  weather_code      11142 non-null  float64
dtypes: float64(11), int64(1), object(1)
memory usage: 1.1+ MB
None

First 5 Rows:
               time  temperature  humidity  precipitation  rain  snowfall  \
0  2024-01-0

In [123]:
# Drop rows where critical columns are missing
df = df.dropna(subset=['temperature', 'humidity', 'weather_code'])

# Verify
print("Missing Values After Drop:")
print(df.isnull().sum())
df.info()

Missing Values After Drop:
time                0
temperature         0
humidity            0
precipitation       0
rain                0
snowfall            0
cloud_coverage      0
wind_speed          0
wind_direction      0
pressure            0
surface_pressure    0
is_day              0
weather_code        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 11142 entries, 0 to 11141
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              11142 non-null  object 
 1   temperature       11142 non-null  float64
 2   humidity          11142 non-null  float64
 3   precipitation     11142 non-null  float64
 4   rain              11142 non-null  float64
 5   snowfall          11142 non-null  float64
 6   cloud_coverage    11142 non-null  float64
 7   wind_speed        11142 non-null  float64
 8   wind_direction    11142 non-null  float64
 9   pressure          11142 non-null  float64
 10  s

In [131]:
# Convert time to datetime
df['time'] = pd.to_datetime(df['time'])

# Ensure numeric columns are float
numeric_cols = ['temperature', 'humidity', 'precipitation', 'rain', 'snowfall',
                'cloud_coverage', 'wind_speed', 'wind_direction', 'pressure',
                'surface_pressure']
df[numeric_cols] = df[numeric_cols].astype(float)

# Ensure weather_code is int (for classification)
df['weather_code'] = df['weather_code'].astype(int)

# Verify
print(df.dtypes)

time                      datetime64[ns]
temperature                      float64
humidity                         float64
precipitation                    float64
rain                             float64
snowfall                         float64
cloud_coverage                   float64
wind_speed                       float64
wind_direction                   float64
pressure                         float64
surface_pressure                 float64
is_day                             int64
weather_code                       int32
forecasted_temperature           float64
dtype: object


In [133]:
# Definition: forecasted_temperature = temperature at time + 1 hour.
# Shift the temperature column by -1 to align each row’s features with the next hour’s temperature.
df['forecasted_temperature'] = df['temperature'].shift(-1)

In [135]:
# Drop rows where forecasted_temperature or key features are NaN
df = df.dropna(subset=['forecasted_temperature', 'temperature', 'humidity', 'wind_speed', 'pressure'])

# Verify
print("Missing Values After Drop:")
print(df.isnull().sum())
print(df.info())

Missing Values After Drop:
time                      0
temperature               0
humidity                  0
precipitation             0
rain                      0
snowfall                  0
cloud_coverage            0
wind_speed                0
wind_direction            0
pressure                  0
surface_pressure          0
is_day                    0
weather_code              0
forecasted_temperature    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 11141 entries, 0 to 11140
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   time                    11141 non-null  datetime64[ns]
 1   temperature             11141 non-null  float64       
 2   humidity                11141 non-null  float64       
 3   precipitation           11141 non-null  float64       
 4   rain                    11141 non-null  float64       
 5   snowfall                11141 non-null 

In [137]:
# Feature Engineering
# Create features to help predict forecasted_temperature:
# Time-based: Hour, day of week, month (captures diurnal/seasonal patterns).
# Lagged variables: Past temperatures and humidity (weather is autocorrelated).
# Wind components: Split wind_speed and wind_direction for continuity.
# Precipitation flag: Simplifies sparse precipitation data.
# Time-based features
df['hour'] = df['time'].dt.hour
df['day_of_week'] = df['time'].dt.dayofweek
df['month'] = df['time'].dt.month

# Lagged variables
df['temp_lag1'] = df['temperature'].shift(1)
df['temp_lag2'] = df['temperature'].shift(2)
df['temp_lag3'] = df['temperature'].shift(3)
df['humidity_lag1'] = df['humidity'].shift(1)

# Wind components
df['wind_x'] = df['wind_speed'] * np.cos(np.radians(df['wind_direction']))
df['wind_y'] = df['wind_speed'] * np.sin(np.radians(df['wind_direction']))

# Precipitation flag
df['has_precipitation'] = (df['precipitation'] > 0).astype(int)

# Drop rows with NaN from lagging
df = df.dropna()
print(df.isnull().sum())
print(df.info())

#Time: Temperature varies by hour (hotter midday) and season (cooler in January).
#Lags: Past temperatures strongly predict future ones (e.g., warm now → warm soon).
#Wind: Degrees (0°=360°) are circular; x/y components make it linear.
#Precipitation: Sparse (mostly 0), so a binary flag is robust.

time                      0
temperature               0
humidity                  0
precipitation             0
rain                      0
snowfall                  0
cloud_coverage            0
wind_speed                0
wind_direction            0
pressure                  0
surface_pressure          0
is_day                    0
weather_code              0
forecasted_temperature    0
hour                      0
day_of_week               0
month                     0
temp_lag1                 0
temp_lag2                 0
temp_lag3                 0
humidity_lag1             0
wind_x                    0
wind_y                    0
has_precipitation         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 11138 entries, 3 to 11140
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   time                    11138 non-null  datetime64[ns]
 1   temperature             11

In [139]:
# Basic stats
print("Temperature Stats:")
print(df[['temperature', 'forecasted_temperature', 'humidity']].describe())

# Cap outliers
df['temperature'] = df['temperature'].clip(lower=-50, upper=50)
df['forecasted_temperature'] = df['forecasted_temperature'].clip(lower=-50, upper=50)
df['humidity'] = df['humidity'].clip(lower=0, upper=100)
# Capping prevents extreme values (e.g., 100°C) from skewing the model.

Temperature Stats:
        temperature  forecasted_temperature      humidity
count  11138.000000            11138.000000  11138.000000
mean      27.352245               27.352676     66.558538
std        3.679264                3.678848     21.157461
min       16.700000               16.700000      9.000000
25%       25.200000               25.200000     50.000000
50%       27.000000               27.000000     70.000000
75%       29.500000               29.500000     86.000000
max       41.400000               41.400000     99.000000


In [183]:
#Exclude: time (non-numeric), precipitation/rain/snowfall (redundant with has_precipitation), surface_pressure (correlated with pressure), weather_code
features = ['temperature', 'humidity', 'wind_x', 'wind_y', 'pressure',
            'cloud_coverage', 'has_precipitation', 'hour', 'day_of_week', 'month',
            'temp_lag1', 'temp_lag2', 'temp_lag3', 'humidity_lag1']
target = 'forecasted_temperature'

X = df[features]
y = df[target]


In [185]:
# Split Data
# Use a time-based split to mimic forecasting (train on past, test on future)
# Split: ~80% train, 20% test
train_size = int(0.8 * len(df))
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)
# Time-based split prevents data leakage (future info in training).
# 80/20 is standard;

Train Shape: (8910, 14)
Test Shape: (2228, 14)


In [187]:
# Scale Features:Normalize features for better regression performance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for clarity
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features)

In [189]:
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Train and evaluate XGBoost
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

In [191]:
# # For Saving model 
import joblib
joblib.dump(xgb_model, 'temp_model_xgb.pkl')
joblib.dump(scaler, 'scaler.pkl')  # Save scaler for live predictions

['scaler.pkl']

In [157]:
# Lets Start Training our Classifier for precipitation
# Define target: 1 if next hour has precipitation, 0 otherwise
df['has_precipitation_future'] = (df['precipitation'].shift(-1) > 0).astype(int)

# Drop rows where target is NaN (last row after shift)
df = df.dropna(subset=['has_precipitation_future'])

In [159]:
print("Rows After Drop:", len(df))
print("Missing Values:")
print(df[['precipitation', 'has_precipitation_future']].isnull().sum())

Rows After Drop: 11138
Missing Values:
precipitation               0
has_precipitation_future    0
dtype: int64


In [161]:
# Features (same as regression)
features = ['temperature', 'humidity', 'wind_x', 'wind_y', 'pressure',
            'cloud_coverage', 'has_precipitation', 'hour', 'day_of_week', 'month',
            'temp_lag1', 'temp_lag2', 'temp_lag3', 'humidity_lag1']

# Update X and y for classification (after new NaN drop)
X = df[features]
y_precip = df['has_precipitation_future']

In [165]:
from sklearn.model_selection import train_test_split
# Here we used Random split because via time-split we are getting biased split which is not good for training model
#Biased split means getting dry precipitation on one side
# Random split
X_train_precip, X_test_precip, y_train_precip, y_test_precip = train_test_split(
    X, y_precip, test_size=0.2, random_state=42, stratify=y_precip
)

print("New Train Shape:", X_train_precip.shape)  
print("New Test Shape:", X_test_precip.shape)

New Train Shape: (8910, 14)
New Test Shape: (2228, 14)


In [169]:
# Precipitation training (stratified with SMOTE)
print("Precipitation Train Distribution (Before SMOTE):\n", y_train_precip.value_counts(normalize=True))
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_precip_balanced, y_train_precip_balanced = smote.fit_resample(X_train_precip, y_train_precip)
print("Balanced Precipitation Train Shape:", X_train_precip_balanced.shape)
print("Balanced Precipitation Distribution (After SMOTE):\n", pd.Series(y_train_precip_balanced).value_counts(normalize=True))

Precipitation Train Distribution (Before SMOTE):
 has_precipitation_future
0    0.781481
1    0.218519
Name: proportion, dtype: float64
Balanced Precipitation Train Shape: (13926, 14)
Balanced Precipitation Distribution (After SMOTE):
 has_precipitation_future
0    0.5
1    0.5
Name: proportion, dtype: float64


In [193]:
# Why rescaling because previous X_train_scaled and X_test_scaled were on time based split
from sklearn.preprocessing import StandardScaler

scaler_precip = StandardScaler()
X_train_scaled_precip = scaler_precip.fit_transform(X_train_precip_balanced)
X_test_scaled_precip = scaler_precip.transform(X_test_precip)

In [195]:
from xgboost import XGBClassifier

precip_model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
precip_model.fit(X_train_scaled_precip, y_train_precip_balanced)

In [197]:
from sklearn.metrics import accuracy_score, f1_score

y_pred_precip = precip_model.predict(X_test_scaled_precip)
accuracy = accuracy_score(y_test_precip, y_pred_precip)
f1 = f1_score(y_test_precip, y_pred_precip)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")

Accuracy: 0.9439
F1-Score: 0.8773


In [199]:
joblib.dump(precip_model, 'precip_model_xgb.pkl')
joblib.dump(scaler_precip, 'precip_scaler.pkl')  # Save scaler for live predictions

['precip_scaler.pkl']