In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [3]:
# Load the data
data=pd.read_csv('weather_classification_data.csv')

In [4]:
# View Dimensions of Dataset
data.shape

(13200, 11)

In [5]:
# Preview the dataset
data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [6]:
# Count of labels for targeted column
data['Weather Type'].value_counts()

Weather Type
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64

In [9]:
# Assuming 'Weather Type' and 'Location' are your target columns
X = data.drop(columns=['Weather Type', 'Location','Season'])
y = data[['Weather Type', 'Location','Season']]

# Encode categorical features
categorical_columns = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_columns)

# Encode each target column separately if categorical
label_encoders = {}
for col in y.columns:
    if y[col].dtype == 'object':
        le = LabelEncoder()
        y[col] = le.fit_transform(y[col])
        label_encoders[col] = le

# Convert y to dummy variables (one-hot encoding) for multi-label classification
y = np.array(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])


In [10]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# View dimensions of train and test set
X_train.shape, X_test.shape

((9240, 11), (3960, 11))

In [12]:
# Check data types in X_train
X_train.dtypes

Temperature                  float64
Humidity                       int64
Wind Speed                   float64
Precipitation (%)            float64
Atmospheric Pressure         float64
UV Index                       int64
Visibility (km)              float64
Cloud Cover_clear               bool
Cloud Cover_cloudy              bool
Cloud Cover_overcast            bool
Cloud Cover_partly cloudy       bool
dtype: object

In [13]:
# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Reshape data to 3D for LSTM [samples, timesteps, features]
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [14]:
X_train[:5]

array([[[ 0.26483744,  0.31895701,  1.11711666,  0.95642653,
         -0.16389665, -0.26298232, -0.14256814, -0.44110105,
         -0.17775095,  1.08948715, -0.73257801]],

       [[ 1.53366033,  0.02114849, -0.91351934, -0.32651198,
         -4.44857653,  2.06679395,  0.00604757, -0.44110105,
          5.62584897, -0.91786305, -0.73257801]],

       [[ 0.26483744,  0.91457405,  1.18963937,  0.36189405,
          0.15485699, -0.78071038, -1.18287805, -0.44110105,
         -0.17775095, -0.91786305,  1.36504233]],

       [[-1.06165921,  1.65909536,  3.65541165,  1.14417363,
         -0.47340317,  0.25474574, -0.43979954, -0.44110105,
         -0.17775095,  1.08948715, -0.73257801]],

       [[ 1.30296526,  0.91457405,  1.11711666,  1.45708547,
          0.28241284,  1.03133783,  0.74912608, -0.44110105,
         -0.17775095, -0.91786305,  1.36504233]]])

In [15]:
# Build LSTM model
model = Sequential()
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(y.shape[1], activation='sigmoid'))  # Adjust for multi-label classification
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(**kwargs)


In [16]:
# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.3)

Epoch 1/200
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.6265 - loss: -0.0169 - val_accuracy: 0.6674 - val_loss: -4.7496
Epoch 2/200
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6690 - loss: -6.8757 - val_accuracy: 0.6699 - val_loss: -12.8234
Epoch 3/200
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6607 - loss: -14.5319 - val_accuracy: 0.6764 - val_loss: -19.8578
Epoch 4/200
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6555 - loss: -21.1017 - val_accuracy: 0.6880 - val_loss: -26.4250
Epoch 5/200
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6726 - loss: -27.4500 - val_accuracy: 0.6966 - val_loss: -32.7716
Epoch 6/200
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6780 - loss: -34.1262 - val_accuracy: 0.6995 - val_loss: -39.1652
E

<keras.src.callbacks.history.History at 0x1802c0e59d0>

In [17]:
# Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6886 - loss: -1737.9794
Test Accuracy: 0.6823232173919678


In [22]:
# Predict classes
pred = model.predict(X_test)
threshold = 0.5  # Adjust as needed
predict_classes = (pred > threshold).astype(int)
print("Predicted classes: \n", predict_classes)
# Predict classes
pred = model.predict(X_test)
predict_classes = np.argmax(pred, axis=1)
print("Predicted classes: ", predict_classes)

[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Predicted classes: 
 [[1 1 1]
 [1 1 1]
 [1 1 1]
 ...
 [1 1 1]
 [1 1 1]
 [1 1 1]]
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Predicted classes:  [0 0 0 ... 0 0 0]
