## This Script Contains the Following Points:
#### 1. Importing Libraries & Data
#### 2. Data Wrangling
#### 3. Reshaping
#### 4. Data Split
#### 5. Bayesian Hyperparameter Optimization

# 1. Importing Libraries & Data

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.multiclass import type_of_target
import tensorflow as tf
from numpy import unique
from numpy import reshape
from tensorflow.keras.models import Sequential
from sklearn.model_selection import cross_val_score
from tensorflow.keras.layers import Input, Conv1D, Dense, Dropout, BatchNormalization, Flatten, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from scikeras.wrappers import KerasClassifier  # Use scikeras for scikit-learn compatibility
from math import floor
from bayes_opt import BayesianOptimization
from tensorflow.keras.layers import LeakyReLU  # Use tensorflow.keras instead of keras
LeakyReLU = LeakyReLU(negative_slope=0.1)
import warnings

In [6]:
# Setting option to ensure charts are displayed inline in the notebook

%matplotlib inline

In [8]:
# Creating folder path to project folder

path = r'/Users/C SaiVishwanath/Desktop/ClimateWins'

In [12]:
# importing unscaled dataset

unscaled = pd.read_csv(os.path.join(path, '01 Data/Original Data/weather_prediction.csv'))

In [14]:
# importing pleasant weather dataset

pleasant = pd.read_csv(os.path.join(path, '01 Data/Original Data/Pleasant_Weather.csv'))

In [16]:
unscaled.head(1)

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9


In [18]:
unscaled.shape

(22950, 170)

In [20]:
pleasant.head(1)

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
pleasant.shape

(22950, 16)

# 2. Data Wrangling

In [26]:
# To ensure data is structured with correct shape to feed the deep learning model completing the following:

# Dropping 3 weather stations not included in 'pleasant'.
# Removing 2 types of observations (columns) missing multiple entries for most stations.
# Filling in 3 individual observations assuming nearby stations have similar weather.
# Dropping DATE and MONTH from observations and DATE from unscaled df. 
# Checking X shape should be (22950, 135) and y shape should be (22950, 15).
# Exporting dataset as "Cleaned" version.

# Dropping the columns related to Tours, Gdansk and Rome from the unscaled dataset

unscaled = unscaled.drop(['GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation', 'GDANSK_snow_depth', 'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max',
                        'ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity', 'ROMA_pressure', 'ROMA_sunshine', 'ROMA_temp_mean',
                        'TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure', 'TOURS_global_radiation', 'TOURS_precipitation', 'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max'], axis=1)

In [28]:
unscaled.shape

(22950, 149)

In [30]:
# Extracting the different observation types

observation_types = ['cloud_cover', 'wind_speed', 'humidity', 'pressure',
                     'global_radiation', 'precipitation', 'snow_depth', 
                     'sunshine', 'temp_mean', 'temp_min', 'temp_max']

In [32]:
# Creating a dictionary to store the count of stations for each observation type
station_counts = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in unscaled.columns if col.endswith(obs)]
    
    # Count the number of stations (i.e., the number of columns) for the current observation type
    station_counts[obs] = len(columns)

# Print the count of stations for each observation type
print("Number of stations covered by each observation type:")
for obs, count in station_counts.items():
    print(f"{obs}: {count} stations")

Number of stations covered by each observation type:
cloud_cover: 14 stations
wind_speed: 9 stations
humidity: 14 stations
pressure: 14 stations
global_radiation: 15 stations
precipitation: 15 stations
snow_depth: 6 stations
sunshine: 15 stations
temp_mean: 15 stations
temp_min: 15 stations
temp_max: 15 stations


In [34]:
# The two columns missing multiple entries for most stations are: wind_speed (only 9 stations) and snow_depth (only 6 stations).
# Dropping columns that end with wind_speed and snow_depth from the dataset

columns_to_drop = unscaled.filter(regex='(_wind_speed|_snow_depth)$').columns
columns_to_drop

Index(['BASEL_wind_speed', 'BASEL_snow_depth', 'DEBILT_wind_speed',
       'DUSSELDORF_wind_speed', 'DUSSELDORF_snow_depth', 'HEATHROW_snow_depth',
       'KASSEL_wind_speed', 'LJUBLJANA_wind_speed', 'MAASTRICHT_wind_speed',
       'MADRID_wind_speed', 'MUNCHENB_snow_depth', 'OSLO_wind_speed',
       'OSLO_snow_depth', 'SONNBLICK_wind_speed', 'VALENTIA_snow_depth'],
      dtype='object')

In [36]:
unscaled = unscaled.drop(columns=columns_to_drop)

In [38]:
unscaled.shape

(22950, 134)

In [40]:
# Looking for remaining missing entry
# Creating a list of all unique station names in the dataset

all_stations = set([col.split('_')[0] for col in unscaled.columns if '_' in col])
all_stations

{'BASEL',
 'BELGRADE',
 'BUDAPEST',
 'DEBILT',
 'DUSSELDORF',
 'HEATHROW',
 'KASSEL',
 'LJUBLJANA',
 'MAASTRICHT',
 'MADRID',
 'MUNCHENB',
 'OSLO',
 'SONNBLICK',
 'STOCKHOLM',
 'VALENTIA'}

In [42]:
observation_types = ['cloud_cover', 'humidity', 'pressure']

missing_stations_by_observation = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in unscaled.columns if col.endswith(obs)]
    
    # Extract station names by removing the observation type from the column names
    station_names = set([col.replace(f'_{obs}', '') for col in columns])
    
    # Identify stations that are in all_stations but missing from the current observation type
    missing_stations = all_stations - station_names
    
    # Store the missing station names in the dictionary
    missing_stations_by_observation[obs] = missing_stations

# Print the missing station names for each observation type
for obs, missing_stations in missing_stations_by_observation.items():
    print(f"\nStations missing from {obs}:")
    if missing_stations:
        for station in missing_stations:
            print(station)
    else:
        print("None")


Stations missing from cloud_cover:
KASSEL

Stations missing from humidity:
STOCKHOLM

Stations missing from pressure:
MUNCHENB


In [44]:
# Finding the position of HEATHROW_temp_max to see where to position the new KASSEL_cloud_cover  (+1 next to it)

unscaled.columns.get_loc('HEATHROW_temp_max')


55

In [46]:
unscaled.columns.get_loc('STOCKHOLM_cloud_cover')

117

In [48]:
unscaled.columns.get_loc('MUNCHENB_humidity')

92

In [50]:
# Inserting new columns into "unscaled" at specific positions.
# Copying data from other existing columns:
# Kassel_cloud_cover with Dusseldorf_cloud_cover
# Stockholm_humidity with Oslo_humidity
# Munchenb_pressure with Basel_pressure

unscaled.insert(56,'KASSEL_cloud_cover', unscaled['DUSSELDORF_cloud_cover'])
unscaled.insert(119, 'STOCKHOLM_humidity', unscaled['OSLO_humidity'])
unscaled.insert(94,'MUNCHENB_pressure',unscaled['BASEL_pressure'])

In [52]:
unscaled.columns.tolist()

['DATE',
 'MONTH',
 'BASEL_cloud_cover',
 'BASEL_humidity',
 'BASEL_pressure',
 'BASEL_global_radiation',
 'BASEL_precipitation',
 'BASEL_sunshine',
 'BASEL_temp_mean',
 'BASEL_temp_min',
 'BASEL_temp_max',
 'BELGRADE_cloud_cover',
 'BELGRADE_humidity',
 'BELGRADE_pressure',
 'BELGRADE_global_radiation',
 'BELGRADE_precipitation',
 'BELGRADE_sunshine',
 'BELGRADE_temp_mean',
 'BELGRADE_temp_min',
 'BELGRADE_temp_max',
 'BUDAPEST_cloud_cover',
 'BUDAPEST_humidity',
 'BUDAPEST_pressure',
 'BUDAPEST_global_radiation',
 'BUDAPEST_precipitation',
 'BUDAPEST_sunshine',
 'BUDAPEST_temp_mean',
 'BUDAPEST_temp_min',
 'BUDAPEST_temp_max',
 'DEBILT_cloud_cover',
 'DEBILT_humidity',
 'DEBILT_pressure',
 'DEBILT_global_radiation',
 'DEBILT_precipitation',
 'DEBILT_sunshine',
 'DEBILT_temp_mean',
 'DEBILT_temp_min',
 'DEBILT_temp_max',
 'DUSSELDORF_cloud_cover',
 'DUSSELDORF_humidity',
 'DUSSELDORF_pressure',
 'DUSSELDORF_global_radiation',
 'DUSSELDORF_precipitation',
 'DUSSELDORF_sunshine',
 'DUSS

In [54]:
# Dropping unnecessary columns

unscaled.drop(['DATE', 'MONTH'], axis=1, inplace=True)

In [56]:
unscaled.shape

(22950, 135)

In [58]:
pleasant.drop(columns = 'DATE', inplace = True)
pleasant.shape

(22950, 15)

In [60]:
X = unscaled

In [62]:
X.head(1)

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,7,0.85,1.018,0.32,0.09,0.7,6.5,0.8,10.9,1,...,4.9,5,0.88,1.0003,0.45,0.34,4.7,8.5,6.0,10.9


In [75]:
y = pleasant
y.head(1)

Unnamed: 0,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# 4. Reshaping

In [77]:
# Using argmax for y

X = X.reshape(-1, 15, 9)
y = np.argmax(y, axis=1)  

In [79]:
X.shape

(22950, 15, 9)

In [81]:
y.shape

(22950,)

In [83]:
# Correct shapes achieved

# 5. Data Split

In [86]:
# Splitting the data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [88]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(17212, 15, 9) (17212,)
(5738, 15, 9) (5738,)


# 6. Bayesian Hyperparameter Optimization

In [91]:
# Creating a Keras model with given hyperparameters

def create_model(n_hidden, kernel_size, activation, optimizer):
    model = Sequential()
    model.add(Conv1D(filters=n_hidden, kernel_size=int(kernel_size), activation=activation, input_shape=(15, 9)))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(15, activation='softmax'))
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [93]:
# Defining the function to optimize

def model_evaluation(n_hidden, kernel_size, activation, optimizer):
    activation = ['relu', 'sigmoid', 'tanh'][int(activation)]
    optimizer = ['adam', 'sgd', 'rmsprop'][int(optimizer)]
    
    model = create_model(n_hidden=int(n_hidden), kernel_size=int(kernel_size), activation=activation, optimizer=optimizer)
    model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)
    
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    return accuracy

In [97]:
# Defining the hyperparameter bounds

param_bounds = {
    'n_hidden': (32, 128),
    'kernel_size': (2, 5),
    'activation': (0, 2),  # 0: relu, 1: sigmoid, 2: tanh
    'optimizer': (0, 2)    # 0: adam, 1: sgd, 2: rmsprop
}

In [99]:
# Running Bayesian Optimization

optimizer = BayesianOptimization(f=model_evaluation, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | activa... | kernel... | n_hidden  | optimizer |
-------------------------------------------------------------------------


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m1        [39m | [39m0.7773   [39m | [39m0.7491   [39m | [39m4.852    [39m | [39m102.3    [39m | [39m1.197    [39m |
| [39m2        [39m | [39m0.7468   [39m | [39m0.312    [39m | [39m2.468    [39m | [39m37.58    [39m | [39m1.732    [39m |
| [39m3        [39m | [39m0.7612   [39m | [39m1.202    [39m | [39m4.124    [39m | [39m33.98    [39m | [39m1.94     [39m |
| [35m4        [39m | [35m0.8161   [39m | [35m1.665    [39m | [35m2.637    [39m | [35m49.46    [39m | [35m0.3668   [39m |
| [35m5        [39m | [35m0.825    [39m | [35m0.6085   [39m | [35m3.574    [39m | [35m73.47    [39m | [35m0.5825   [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m6        [39m | [39m0.8113   [39m | [39m1.101    [39m | [39m3.678    [39m | [39m67.29    [39m | [39m0.2336   [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m7        [39m | [39m0.7726   [39m | [39m1.73     [39m | [39m2.008    [39m | [39m81.39    [39m | [39m1.565    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m8        [39m | [39m0.8109   [39m | [39m1.716    [39m | [39m2.807    [39m | [39m56.13    [39m | [39m0.652    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m9        [39m | [39m0.7649   [39m | [39m0.5542   [39m | [39m2.345    [39m | [39m128.0    [39m | [39m1.06     [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m10       [39m | [39m0.7712   [39m | [39m0.03959  [39m | [39m4.774    [39m | [39m51.68    [39m | [39m1.994    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m11       [39m | [39m0.8207   [39m | [39m0.1567   [39m | [39m2.304    [39m | [39m70.86    [39m | [39m0.2327   [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m12       [39m | [39m0.76     [39m | [39m0.8597   [39m | [39m4.93     [39m | [39m71.31    [39m | [39m1.932    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m13       [39m | [39m0.7497   [39m | [39m0.3871   [39m | [39m3.864    [39m | [39m55.83    [39m | [39m1.536    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m14       [39m | [39m0.7813   [39m | [39m0.01261  [39m | [39m3.378    [39m | [39m125.8    [39m | [39m1.053    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m15       [39m | [39m0.8172   [39m | [39m1.485    [39m | [39m3.575    [39m | [39m60.83    [39m | [39m0.03935  [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m16       [39m | [39m0.77     [39m | [39m1.774    [39m | [39m3.593    [39m | [39m56.66    [39m | [39m1.062    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m17       [39m | [39m0.8106   [39m | [39m0.4124   [39m | [39m4.91     [39m | [39m33.38    [39m | [39m0.1389   [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m18       [39m | [39m0.7623   [39m | [39m0.8968   [39m | [39m2.264    [39m | [39m36.96    [39m | [39m1.674    [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m19       [39m | [39m0.7998   [39m | [39m0.1923   [39m | [39m3.344    [39m | [39m72.92    [39m | [39m0.4384   [39m |


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


| [39m20       [39m | [39m0.8194   [39m | [39m0.7815   [39m | [39m3.144    [39m | [39m74.12    [39m | [39m0.5712   [39m |


In [101]:
# Printing best parameters

best_params = optimizer.max['params']
print("Best Parameters:", best_params)

Best Parameters: {'activation': 0.6084844859190754, 'kernel_size': 3.5742692948967134, 'n_hidden': 73.46672178964312, 'optimizer': 0.5824582803960838}


In [103]:
# Rebuilding the model with the best parameters

best_activation = ['relu', 'sigmoid', 'tanh'][int(best_params['activation'])]
best_optimizer = ['adam', 'sgd', 'rmsprop'][int(best_params['optimizer'])]

In [105]:
final_model = create_model(
    n_hidden=int(best_params['n_hidden']),
    kernel_size=int(best_params['kernel_size']),
    activation=best_activation,
    optimizer=best_optimizer
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [107]:
# Training the final model

final_model.fit(X_train, y_train, epochs=30, batch_size=16, verbose=2)

Epoch 1/30
1076/1076 - 1s - 786us/step - accuracy: 0.7303 - loss: 0.8445
Epoch 2/30
1076/1076 - 1s - 655us/step - accuracy: 0.7749 - loss: 0.6540
Epoch 3/30
1076/1076 - 1s - 562us/step - accuracy: 0.7869 - loss: 0.6046
Epoch 4/30
1076/1076 - 1s - 556us/step - accuracy: 0.7942 - loss: 0.5736
Epoch 5/30
1076/1076 - 1s - 555us/step - accuracy: 0.8051 - loss: 0.5470
Epoch 6/30
1076/1076 - 1s - 552us/step - accuracy: 0.8114 - loss: 0.5245
Epoch 7/30
1076/1076 - 1s - 552us/step - accuracy: 0.8151 - loss: 0.5095
Epoch 8/30
1076/1076 - 1s - 553us/step - accuracy: 0.8224 - loss: 0.4916
Epoch 9/30
1076/1076 - 1s - 555us/step - accuracy: 0.8271 - loss: 0.4732
Epoch 10/30
1076/1076 - 1s - 550us/step - accuracy: 0.8290 - loss: 0.4643
Epoch 11/30
1076/1076 - 1s - 555us/step - accuracy: 0.8367 - loss: 0.4431
Epoch 12/30
1076/1076 - 1s - 553us/step - accuracy: 0.8394 - loss: 0.4317
Epoch 13/30
1076/1076 - 1s - 585us/step - accuracy: 0.8437 - loss: 0.4197
Epoch 14/30
1076/1076 - 1s - 553us/step - accur

<keras.src.callbacks.history.History at 0x3101b2f90>

In [115]:
from sklearn.metrics import confusion_matrix, classification_report

In [117]:
# Evaluating the model

y_pred = np.argmax(final_model.predict(X_test), axis=1)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 369us/step
Confusion Matrix:
 [[3481  109   13   13   10   14    4    3    1   33    1    0    0    0]
 [ 143  914    6    3    2    6    2    3    0   11    0    2    0    0]
 [  33   47   90    8    5   15    2    3    0   11    0    0    0    0]
 [  17    6    3   40    5    8    3    0    0    0    0    0    0    0]
 [   5    2    2    3    7    7    0    0    0    3    0    0    0    0]
 [  11    3    3    1    3   47    0    0    0   13    0    1    0    0]
 [   3    1    1    1    2    0    2    0    0    1    0    0    0    0]
 [  17    3    3    0    2    2    1   23    0    9    0    1    0    0]
 [   6    0    0    1    0    0    0    0    1    1    0    0    0    0]
 [  52   17   10    0    4   21    0    7    0  347    0    0    0    0]
 [   7    1    0    0    0    0    0    0    0    0    0    0    0    0]
 [   1    0    0    0    1    0    0    0    0    0    0    2    1    0]
 [   1    0    1    1    0  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
