In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle
import json
import warnings
warnings.filterwarnings('ignore')

class WeatherPredictor:
    """Train and apply a regressor that predicts a next-day temperature value.

    The predictor owns a ``StandardScaler`` and one scikit-learn regressor
    chosen by ``model_name`` ('gradient_boost', 'random_forest' or
    'neural_net'). Features are always consumed in ``feature_names`` order.
    """

    def __init__(self, model_name='gradient_boost'):
        """Create an untrained predictor.

        Args:
            model_name: Key of the regressor that ``train`` will build. It is
                validated when ``train`` is called, not here.
        """
        self.model = None
        self.scaler = StandardScaler()
        self.model_name = model_name
        self.performance_metrics = {}
        self.feature_names = ['humidity', 'pressure', 'wind_speed', 'temperature', 'cloud_cover', 'dew_point', 'visibility']

    def generate_realistic_data(self, n_samples=2000, seed=42):
        """Generate a synthetic weather data set with seasonal patterns.

        The ``tomorrow_temp`` column is a noisy linear blend of the same
        row's features plus a seasonal term; it is not the temperature of
        the following row.

        Args:
            n_samples: Number of rows (consecutive days) to generate.
            seed: Seed for the private random generator. The default of 42
                reproduces the data the previous global-seed version produced.

        Returns:
            DataFrame with the seven feature columns and ``tomorrow_temp``.
        """
        # A private legacy RandomState yields the same stream that
        # np.random.seed(seed) would, without reseeding the process-wide
        # generator that other code may rely on.
        rng = np.random.RandomState(seed)

        # Create time-based patterns
        days = np.arange(n_samples)
        seasonal_pattern = 15 * np.sin(2 * np.pi * days / 365)

        # NOTE: the order of the rng.normal calls below determines the
        # generated values; keep it stable to preserve reproducibility.

        # Base weather components
        base_temp = 15 + seasonal_pattern
        temperature = base_temp + rng.normal(0, 2, n_samples)

        # Humidity inversely correlated with the temperature anomaly
        humidity = 70 - 0.5 * (temperature - base_temp) + rng.normal(0, 5, n_samples)
        humidity = np.clip(humidity, 20, 100)

        # Pressure with a 30-day cycle
        pressure = 1015 + 5 * np.sin(2 * np.pi * days / 30) + rng.normal(0, 2, n_samples)

        # Wind speed grows with the pressure deviation from 1015
        wind_speed = 8 + 2 * np.abs(pressure - 1015) / 5 + rng.normal(0, 1.5, n_samples)
        wind_speed = np.clip(wind_speed, 0, 40)

        # Cloud cover correlated with humidity
        cloud_cover = 40 + 0.4 * (humidity - 70) + rng.normal(0, 8, n_samples)
        cloud_cover = np.clip(cloud_cover, 0, 100)

        # Dew point (related to humidity and temperature)
        dew_point = temperature - (100 - humidity) / 5 + rng.normal(0, 1, n_samples)

        # Visibility (inversely related to humidity and cloud cover)
        visibility = 10 - (humidity - 50) / 20 - cloud_cover / 50 + rng.normal(0, 0.5, n_samples)
        visibility = np.clip(visibility, 0.1, 10)

        # Synthetic prediction target
        tomorrow_temp = (
            0.6 * temperature +
            0.15 * (humidity - 70) / 30 +
            0.1 * (pressure - 1015) / 10 +
            0.08 * wind_speed / 10 +
            0.05 * cloud_cover / 100 +
            0.02 * dew_point +
            seasonal_pattern * 0.1 +
            rng.normal(0, 0.8, n_samples)
        )

        return pd.DataFrame({
            'humidity': humidity,
            'pressure': pressure,
            'wind_speed': wind_speed,
            'temperature': temperature,
            'cloud_cover': cloud_cover,
            'dew_point': dew_point,
            'visibility': visibility,
            'tomorrow_temp': tomorrow_temp
        })

    def prepare_data(self, df, test_size=0.2):
        """Split ``df`` into train/test sets and standardize the features.

        The scaler is fitted on the training split only and then applied to
        the test split.

        Args:
            df: DataFrame holding every column in ``feature_names`` plus
                ``tomorrow_temp``.
            test_size: Fraction of rows held out for testing.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test,
            X_train, X_test)``; the last two are the unscaled splits.
        """
        X = df[self.feature_names]
        y = df['tomorrow_temp']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test

    def train(self, X_train, y_train, X_test, y_test):
        """Fit the regressor selected by ``model_name`` and score it.

        Only the single configured algorithm is trained. Test-set metrics
        are stored in ``performance_metrics`` under the keys 'mse', 'rmse',
        'mae' and 'r2'.

        Args:
            X_train: Scaled training features.
            y_train: Training targets.
            X_test: Scaled test features.
            y_test: Test targets.

        Returns:
            Predictions for ``X_test``.

        Raises:
            ValueError: If ``model_name`` is not a supported model key.
        """
        if self.model_name == 'gradient_boost':
            self.model = GradientBoostingRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=7,
                min_samples_split=5,
                min_samples_leaf=2,
                subsample=0.8,
                random_state=42
            )
        elif self.model_name == 'random_forest':
            self.model = RandomForestRegressor(
                n_estimators=200,
                max_depth=20,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1
            )
        elif self.model_name == 'neural_net':
            self.model = MLPRegressor(
                hidden_layer_sizes=(100, 50, 25),
                activation='relu',
                solver='adam',
                learning_rate_init=0.001,
                max_iter=500,
                random_state=42
            )
        else:
            # Previously an unknown name left self.model as None and failed
            # later with an opaque AttributeError on .fit().
            raise ValueError(
                f"Unknown model_name {self.model_name!r}; expected one of "
                "'gradient_boost', 'random_forest', 'neural_net'"
            )

        self.model.fit(X_train, y_train)

        # Evaluate on the held-out split
        y_pred = self.model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        self.performance_metrics = {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2': r2_score(y_test, y_pred)
        }

        return y_pred

    def get_feature_importance(self, X_train):
        """Return feature importances sorted from most to least important.

        Args:
            X_train: Unused; kept so existing callers keep working.

        Returns:
            DataFrame with 'feature' and 'importance' columns, or ``None``
            when the current model exposes no ``feature_importances_``
            attribute (including when no model has been trained yet).
        """
        if hasattr(self.model, 'feature_importances_'):
            importances = self.model.feature_importances_
            return pd.DataFrame({
                'feature': self.feature_names,
                'importance': importances
            }).sort_values('importance', ascending=False)
        return None

    def predict(self, conditions_dict):
        """Predict the target for a single set of weather conditions.

        Args:
            conditions_dict: Mapping that contains every name listed in
                ``feature_names``; extra keys are ignored.

        Returns:
            The predicted value as a scalar.

        Raises:
            RuntimeError: If no model has been trained or loaded.
            KeyError: If any required feature is missing.
        """
        if self.model is None:
            raise RuntimeError(
                "Model is not trained; call train() or load_model() first."
            )

        missing = [name for name in self.feature_names if name not in conditions_dict]
        if missing:
            raise KeyError(f"Missing weather conditions: {missing}")

        # Build the row from feature_names so the column order can never
        # drift from the order used at training time.
        row = [[conditions_dict[name] for name in self.feature_names]]
        if hasattr(self.scaler, 'feature_names_in_'):
            # The scaler was fitted on a DataFrame; passing named columns
            # avoids scikit-learn's feature-name mismatch warning.
            conditions = pd.DataFrame(row, columns=self.feature_names)
        else:
            conditions = np.array(row)

        conditions_scaled = self.scaler.transform(conditions)
        prediction = self.model.predict(conditions_scaled)

        return prediction[0]

    def save_model(self, filepath='weather_model.pkl'):
        """Pickle the model, scaler and feature metadata to ``filepath``.

        Args:
            filepath: Destination path of the pickle file.
        """
        payload = {
            'model': self.model,
            'scaler': self.scaler,
            # Stored so a loaded predictor uses the training-time order.
            'model_name': self.model_name,
            'feature_names': self.feature_names,
        }
        with open(filepath, 'wb') as f:
            pickle.dump(payload, f)
        print(f"Model saved to {filepath}")

    def load_model(self, filepath='weather_model.pkl'):
        """Restore a predictor state written by ``save_model``.

        Files that contain only the 'model' and 'scaler' keys are still
        accepted; the current ``model_name`` and ``feature_names`` are kept
        in that case.

        Args:
            filepath: Path of the pickle file to read.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only load files
        # that come from a trusted source.
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        self.model = data['model']
        self.scaler = data['scaler']
        self.model_name = data.get('model_name', self.model_name)
        self.feature_names = data.get('feature_names', self.feature_names)
        print(f"Model loaded from {filepath}")

# Main execution
if __name__ == "__main__":
    # Horizontal rule shared by every section header in the report.
    banner = "=" * 70

    print(banner, "ADVANCED WEATHER PREDICTION MODEL", banner, sep="\n")

    # Build the predictor that the remaining steps drive.
    predictor = WeatherPredictor(model_name='gradient_boost')

    # Step 1: synthesize the data set.
    print("\n[1] Generating realistic weather data...")
    df = predictor.generate_realistic_data(n_samples=2000)
    n_rows = len(df)
    n_features = len(predictor.feature_names)
    print(f"    Generated {n_rows} samples with {n_features} features")

    # Step 2: split into train/test and standardize the features.
    print("[2] Preparing and scaling data...")
    (X_train, X_test,
     y_train, y_test,
     X_train_orig, X_test_orig) = predictor.prepare_data(df)
    print(f"    Training samples: {len(X_train)}, Test samples: {len(X_test)}")

    # Step 3: fit the regressor and collect test-set predictions.
    print("[3] Training Gradient Boosting model...")
    y_pred = predictor.train(X_train, y_train, X_test, y_test)
    print("    Training complete!")

    # Report the test-set metrics gathered during training.
    print("\n" + banner, "MODEL PERFORMANCE METRICS", banner, sep="\n")
    for metric_name in predictor.performance_metrics:
        metric_value = predictor.performance_metrics[metric_name]
        print(f"{metric_name.upper():10s}: {metric_value:.4f}")

    # Report feature importances when the model provides them.
    print("\n" + banner, "FEATURE IMPORTANCE", banner, sep="\n")
    importance_df = predictor.get_feature_importance(X_train_orig)
    if importance_df is not None:
        print(importance_df.to_string(index=False))

    # Compare the first five test targets with their predictions.
    print("\n" + banner, "SAMPLE PREDICTIONS (Actual vs Predicted)", banner, sep="\n")
    for row_idx, predicted in enumerate(y_pred[:5]):
        actual = y_test.iloc[row_idx]
        error = abs(actual - predicted)
        print(f"Actual: {actual:7.2f}°C | Predicted: {predicted:7.2f}°C | Error: {error:6.2f}°C")

    # Run one prediction for a hand-written set of conditions.
    print("\n" + banner, "PREDICTION FOR NEW CONDITIONS", banner, sep="\n")
    new_conditions = dict(
        humidity=65,
        pressure=1013,
        wind_speed=10,
        temperature=15,
        cloud_cover=45,
        dew_point=8,
        visibility=8.5,
    )

    print("Input Conditions:")
    for name in new_conditions:
        label = name.replace('_', ' ').title()
        print(f"  {label:20s}: {new_conditions[name]}")

    prediction = predictor.predict(new_conditions)
    print(f"\nPredicted Tomorrow's Temperature: {prediction:.2f}°C")

    # Step 4: persist the fitted model and scaler.
    print("\n[4] Saving model...")
    predictor.save_model('weather_model.pkl')
    print(banner)

ADVANCED WEATHER PREDICTION MODEL

[1] Generating realistic weather data...
    Generated 2000 samples with 7 features
[2] Preparing and scaling data...
    Training samples: 1600, Test samples: 400
[3] Training Gradient Boosting model...
    Training complete!

MODEL PERFORMANCE METRICS
MSE       : 0.8123
RMSE      : 0.9013
MAE       : 0.7180
R2        : 0.9862

FEATURE IMPORTANCE
    feature  importance
temperature    0.985903
  dew_point    0.004080
   pressure    0.002345
 wind_speed    0.002117
 visibility    0.001896
   humidity    0.001890
cloud_cover    0.001768

SAMPLE PREDICTIONS (Actual vs Predicted)
Actual:   16.83°C | Predicted:   15.33°C | Error:   1.50°C
Actual:    7.22°C | Predicted:    6.87°C | Error:   0.35°C
Actual:    1.27°C | Predicted:    2.86°C | Error:   1.60°C
Actual:   11.70°C | Predicted:   10.70°C | Error:   1.00°C
Actual:    4.98°C | Predicted:    3.95°C | Error:   1.03°C

PREDICTION FOR NEW CONDITIONS
Input Conditions:
  Humidity            : 65
  Pressure