# Import Libraries

In [42]:
# from google.colab import drive
# drive.mount('/content/drive')

In [44]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

filepath = r"D:\_SHL\Folder-D\PRSNL\study\Rainfall_Prediction\Source Code\SYLHET_data.xlsx"
sheet_name1='SYLHET_data'
df = pd.read_excel(filepath, sheet_name1)
df.head()

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,Rain_today,Rain_tomorrow
0,41866,30.9,25.7,27.4,40.9,25.7,30.6,26.1,93.0,24.8,...,24.1,175.6,1003.9,96.0,3.7,184.7,15.8,6,rain,rain
1,41867,30.9,25.4,27.5,39.2,25.4,30.8,25.8,90.5,30.5,...,25.9,176.5,1004.4,92.9,3.8,208.1,17.8,8,rain,rain
2,41868,32.5,25.2,28.0,41.9,25.2,32.1,25.9,89.0,9.5,...,15.1,199.4,1004.1,92.9,3.3,255.9,22.0,9,rain,rain
3,41869,32.2,25.1,28.0,42.1,25.1,32.5,25.9,89.1,23.4,...,16.9,195.7,1007.0,86.0,3.5,280.0,24.2,9,rain,rain
4,41870,31.6,25.1,27.4,40.7,25.1,30.8,25.9,92.1,27.9,...,18.7,201.7,1007.9,94.9,2.5,259.3,22.3,8,rain,rain


In [46]:
# Extract features and target variable
x = df.iloc[:, 1:19].values  # Exclude the last feature
categorical_feature = df.iloc[:, -1].values  # The last feature

# Use LabelEncoder to convert string labels to numerical labels for the categorical feature
le = LabelEncoder()
categorical_feature_encoded = le.fit_transform(categorical_feature)

# Concatenate the encoded categorical feature to the rest of the features
x_encoded = np.column_stack((x, categorical_feature_encoded))

# Feature scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_encoded)

# Extract the target variable
y_labels = df['Rain_tomorrow'].values

# Use LabelEncoder to convert string labels to numerical labels for the target variable
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y_labels)

# Use OneHotEncoder to convert string labels to one-hot encoded labels for the target variable
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y_labels.reshape(-1, 1))



In [47]:
y_onehot

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [40]:
y_encoded

array([[2],
       [2],
       [2],
       ...,
       [2],
       [0],
       [1]])

In [None]:
# Define the sequence length (number of time steps to consider as input)
sequence_length = 10  # You can adjust this based on your preference

# Create sequences for LSTM
sequences = []
labels = []

for i in range(len(x_scaled) - sequence_length):
    sequence = x_scaled[i : i + sequence_length]
    target_label = y_onehot[i + sequence_length]
    
    # Check for NaN values in the sequence
    if not np.isnan(sequence).any():
        sequences.append(sequence)
        labels.append(target_label)

sequences = np.array(sequences)
labels = np.array(labels)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=0)

# Number of classes (3 for rain, cloudy, clear-day)
num_classes = y_onehot.shape[1]

# Build the LSTM model
model = Sequential()
model.add(LSTM(units=50, activation='relu', input_shape=(sequence_length, x_scaled.shape[1])))
model.add(Dense(units=num_classes, activation='softmax'))  # Multiclass, so softmax activation

# Compile the model with categorical crossentropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

# # Evaluate the model
# loss, accuracy = model.evaluate(x_test, y_test)
# print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

In [60]:
# Now, let's make predictions on the test set
predictions = model.predict(x_test)
predictions



array([[5.4882485e-06, 3.4492742e-10, 9.9999452e-01],
       [3.3899675e-17, 1.8245714e-11, 1.0000000e+00],
       [1.8468320e-04, 2.3061691e-03, 9.9750912e-01],
       ...,
       [1.2819092e-03, 4.2535530e-07, 9.9871767e-01],
       [1.3836989e-01, 8.6057562e-01, 1.0544672e-03],
       [9.9758649e-01, 9.0444878e-09, 2.4135411e-03]], dtype=float32)

In [80]:
from tensorflow.keras.utils import to_categorical

# If you want to get the class labels (indices of the class with the highest probability)
predicted_labels = np.argmax(predictions, axis=1)

# Convert the predicted labels to one-hot encoded format
predicted_labels_onehot = to_categorical(predicted_labels, num_classes=num_classes)

# Optionally, you can convert the predicted labels back to the original label encoding
predicted_labels_original = le_y.inverse_transform(predicted_labels)

# Convert y_test back to original labels
y_test_original = le_y.inverse_transform(np.argmax(y_test, axis=1))

# Print the first few predictions
print("Sample Predictions:")
print(predictions[:5])
print("Corresponding Predicted Labels (One-Hot Encoded):")
print(predicted_labels_onehot[:5])
print("Corresponding Predicted Labels (Original Encoding):")
print(predicted_labels_original[:5])

# Print the corresponding original labels for y_test
print("Corresponding Original Labels for y_test:")
print(y_test_original[:5])

Sample Predictions:
[[5.4882485e-06 3.4492742e-10 9.9999452e-01]
 [3.3899675e-17 1.8245714e-11 1.0000000e+00]
 [1.8468320e-04 2.3061691e-03 9.9750912e-01]
 [9.9198467e-01 3.9852523e-05 7.9755122e-03]
 [2.7313044e-06 6.3434607e-05 9.9993384e-01]]
Corresponding Predicted Labels (One-Hot Encoded):
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]
Corresponding Predicted Labels (Original Encoding):
['rain' 'rain' 'rain' 'clear-day' 'rain']
Corresponding Original Labels for y_test:
['rain' 'rain' 'rain' 'clear-day' 'rain']


In [104]:
predicted_labels_original.shape

(522,)

In [103]:
y_test_original.shape

(522,)

In [111]:
import pandas as pd

# Assuming predicted_labels_original and y_test_original are 1D arrays
data = {'Observed Labels': y_test_original,'Predicted Labels': predicted_labels_original, }
df = pd.DataFrame(data)

# Export the DataFrame to an Excel file
df.to_excel('D:\_SHL\Folder-D\PRSNL\study\Rainfall_Prediction\Source Code\observed_vs_predicted_labels.xlsx', index=False)

In [112]:
df

Unnamed: 0,Observed Labels,Predicted Labels
0,rain,rain
1,rain,rain
2,rain,rain
3,clear-day,clear-day
4,rain,rain
...,...,...
517,rain,rain
518,cloudy,cloudy
519,rain,rain
520,clear-day,cloudy


In [113]:
x_test.shape

(522, 10, 19)

In [114]:
y_test.shape

(522, 3)

In [119]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_original, predicted_labels_original)
print(f'Accuracy: {accuracy:.7f}')
df_pred.to_excel(r'D:\_SHL\Folder-D\PRSNL\study\Rainfall_Prediction\Source Code\df_pred.xlsx')

Accuracy: 0.8237548


In [120]:
df_pred.shape

(522, 1)

In [121]:
y_onehot.shape[1]
x_train
# y_train
labels.shape
# sequences.shape

(2608, 3)

In [122]:
x.shape

(2681, 18)

# Importing Dataset

In [None]:
filepath = r"D:\_SHL\Folder-D\PRSNL\study\Rainfall_Prediction\Source Code\SYLHET_data.xlsx"
sheet_name1='SYLHET_data'
dataset = pd.read_excel(filepath, sheet_name1)
x = dataset.iloc[ : , 1:20].values
y = dataset.iloc[:,-1].values
dataset.head(3)

In [None]:
x

In [None]:
y

In [None]:
dataset.head()

In [None]:
dataset.corr()

In [None]:
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# import pandas as pd

# # Assuming x is a DataFrame with a column 'Rain_today' containing mixed types
# x['Rain_today'] = pd.to_numeric(x['Rain_today'], errors='coerce')

# # Extract numeric columns for correlation matrix
# numeric_columns = x.select_dtypes(include=[np.number])

# # Convert the DataFrame to a NumPy array
# numeric_array = numeric_columns.values

# # Calculate correlation matrix
# corr_matrix = np.corrcoef(numeric_array, rowvar=False)

# # Create heatmap
# plt.figure(figsize=(6, 8))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, xticklabels=numeric_columns.columns, yticklabels=numeric_columns.columns)
# plt.title('Correlation Heatmap')
# plt.show()

In [None]:
y = y.reshape(-1,1) # convert 1D arry to 2D array

In [None]:
print(y)

# Taking Care of Missing Values

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imputer.fit_transform(x)
y = imputer.fit_transform(y)

In [None]:
print(x)

In [None]:
print(y)

# Encoding the Dataset

In [None]:
from sklearn.preprocessing import LabelEncoder
# le1 = LabelEncoder()
# x[:,10] = le1.fit_transform(x[:,10])
le2 = LabelEncoder()
x[:,-1] = le2.fit_transform(x[:,-1])
le3 = LabelEncoder()
y = le3.fit_transform(y)

In [None]:
print(x)

In [None]:
print(y)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [None]:
print(x)

# Splitting Dataset into Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
print(x_train)

In [None]:
print(y_train)

# Training Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=150, random_state=0)
classifier.fit(x_train, y_train)

In [None]:
classifier.score(x_train,y_train)

In [None]:
y_pred = le3.inverse_transform(np.array(classifier.predict(x_test),dtype=int))
y_test = le3.inverse_transform(np.array(y_test,dtype=int))

In [None]:
# print(y_pred)

In [None]:
# print(y_test)

In [None]:
y_pred = y_pred.reshape(-1,1)
y_test = y_test.reshape(-1,1)
# print(y_pred)

In [None]:
# print(x_test)

In [None]:
df = np.concatenate((y_test,y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=['Rain on Tommorrow','Predition of Rain'])
print(dataframe)

In [None]:
# dataframe.to_excel('pandas_to_excel_no_index_header.xlsx', index=False)

# Visualizing the Results

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

# Out of Sample Validation

In [None]:
filepath = r"D:\_SHL\Folder-D\PRSNL\study\Rainfall_Prediction\Source Code\SYLHET_data.xlsx"
sheet_name ='validation'
dataset2 = pd.read_excel(filepath, sheet_name)
x1 = dataset2.iloc[ : , 1:20].values
x1

In [None]:
from sklearn.preprocessing import LabelEncoder
# le4 = LabelEncoder()
# x1[:,10] = le4.fit_transform(x1[:,10])
le5 = LabelEncoder()
x1[:,-1] = le5.fit_transform(x1[:,-1])
le6 = LabelEncoder()
# y = le3.fit_transform(y)
x1 = sc.transform (x1)
y1 = dataset2.iloc[:,-1].values
dataset2.head(3)

# le5 = LabelEncoder()
# x[:,-1] = le5.fit_transform(x[:,-1])
# le6 = LabelEncoder()
# y = le6.fit_transform(y)

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x1 = imputer.fit_transform(x1)

In [None]:
x1.shape

In [None]:
y1.shape
y1

In [None]:
# Make predictions on the out-of-sample data
y1_pred = classifier.predict(x1)

# Inverse transform the encoded predictions to original labels
y1_pred_labels = le3.inverse_transform(y1_pred)

# Print the predicted labels for the out-of-sample data
print("Predicted Labels for Out-of-Sample Data:")
print(y1_pred_labels)

In [None]:
y1.shape

In [None]:
y1_pred.shape

In [None]:
# You can also create a DataFrame to display the predictions alongside the original data
df_predictions = pd.DataFrame({'Original Rainfall': y1, 'Predicted Rainfall': y1_pred_labels})
print(df_predictions)

# Results

In [None]:
print(y1.dtype, y1_pred_labels.dtype)

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y1, y1_pred_labels)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y1, y1_pred_labels)
print(conf_matrix)

In [None]:
from sklearn.metrics import classification_report
report = classification_report(y1, y1_pred_labels)
print(report)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Example confusion matrix
conf_matrix = confusion_matrix(y1, y1_pred_labels)

class_labels = ['Rain', 'Cloudy', 'Clear-Day']

sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load data
filepath = r'D:\_SHL\Folder-D\PRSNL\study\Rainfall_Prediction\Source Code\SYLHET_data.csv'
df = pd.read_csv(filepath)

# Extract features and target variable
x = df.iloc[:, 1:20].values
y_labels = df['Rain_tomorrow'].values

# Use LabelEncoder to convert string labels to numerical labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_labels)

# Feature scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Assuming you have a 'timestep' variable in your data
timesteps = df['timestep'].max()

# Reshape the input data for LSTM
x_lstm = np.reshape(x_scaled, (x_scaled.shape[0], timesteps, -1))

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_lstm, y_encoded, test_size=0.2, random_state=0)

# Number of classes (3 for rain, cloudy, clear-day)
num_classes = len(np.unique(y_encoded))

# Build the LSTM model
model = Sequential()
model.add(LSTM(units=50, activation='relu', input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Dense(units=num_classes, activation='softmax'))  # Multiclass, so softmax activation

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

# Make predictions on the out-of-sample data
# Assuming x1 is your new data
x1_scaled = scaler.transform(x1)
x1_lstm = np.reshape(x1_scaled, (x1_scaled.shape[0], timesteps, -1))

y1_pred = model.predict_classes(x1_lstm)

# Inverse transform the encoded predictions to original labels
y1_pred_labels = le.inverse_transform(y1_pred)

# Print the predicted labels for the out-of-sample data
print("Predicted Labels for Out-of-Sample Data:")
print(y1_pred_labels)

In [None]:
# plt.figure(figsize=(12, 6))
# plt.plot(data.index[-len(y1_observe):], y_test.values, label='Actual')
# plt.plot(data.index[-len(y1_pred):], y_pred, label='Predicted')
# plt.xlabel('Date')
# plt.ylabel('Price')
# plt.legend()
# plt.title('Forex Price Prediction')
# plt.show()