Bert_with_Lstm


In [None]:
# Importing essential libraries for data manipulation, machine learning, and NLP
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical computing
from tqdm import tqdm  # Progress bars for loops
from transformers import BertTokenizer, BertModel  # BERT tokenizer and model for NLP
import torch  # PyTorch for tensor computation and model handling


In [None]:
# Importing the pandas library
import pandas as pd

# Location of the training CSV inside the Colab runtime
file_path = "/content/ghc_train.csv"

# Load the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Sanity check: print the leading rows so a bad load is obvious immediately
print(df.head(n=5))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing values in the 'text' column with an empty string
df['text'] = df['text'].fillna(value='')

# Features are the raw texts; the target is the 'label' column
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing. Passing stratify=y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report how many samples landed in each partition
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


In [None]:
import numpy as np  # For numerical operations and array manipulations
import tensorflow as tf  # TensorFlow library for building and training neural networks
from tensorflow.keras.models import Sequential  # Sequential model type for Keras
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout  # Layers used in the model
from tensorflow.keras.callbacks import EarlyStopping  # Callback for early stopping during training
from sklearn.utils.class_weight import compute_class_weight  # Function to compute class weights for imbalanced data
from sklearn.metrics import confusion_matrix, classification_report  # Metrics for evaluating model performance
import seaborn as sns  # For creating statistical graphics
import matplotlib.pyplot as plt  # For plotting data and evaluation results
from transformers import BertTokenizer, TFBertModel  # BERT tokenizer and model from Hugging Face

# The tokenizer and the encoder are both loaded from the same pre-trained checkpoint
checkpoint_name = 'bert-base-uncased'

# Tokenizer that turns raw text into BERT input ids / attention masks
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# TensorFlow BERT encoder used to produce the embeddings
bert_model = TFBertModel.from_pretrained(checkpoint_name)


/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]
vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]
tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.

In [None]:
# Function to encode text data using a BERT tokenizer
def encode_texts(texts, tokenizer, max_length=100):
    """
    Tokenize a collection of texts into fixed-length sequences.

    Parameters:
    texts (pd.Series | list | tuple | str): Texts to encode. Anything exposing
        ``.tolist()`` (e.g. a pandas Series) is converted with it; any other
        iterable is materialized with ``list``; a single string is treated as
        a one-element batch rather than being split into characters.
    tokenizer (BertTokenizer): Callable tokenizer; it is invoked with the list
        of texts plus truncation/padding keyword arguments.
    max_length (int): Length every sequence is truncated or padded to.

    Returns:
    Whatever the tokenizer returns. For a Hugging Face tokenizer this is a
    dict-like object holding 'input_ids' and 'attention_mask' TensorFlow tensors.
    """
    if isinstance(texts, str):
        # A bare string is iterable; wrap it so it is encoded as one text
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        truncation=True,         # Cut sequences longer than max_length
        padding='max_length',    # Pad shorter sequences up to max_length
        max_length=max_length,
        return_tensors='tf',     # Ask for TensorFlow tensors
    )

# Tokenize both splits with the same 50-token cap (instead of the function's default of 100)
X_train_encodings, X_test_encodings = (
    encode_texts(split, tokenizer, max_length=50) for split in (X_train, X_test)
)

# Function to get BERT embeddings from encoded text data
def get_bert_embeddings(encodings, bert_model):
    """
    Run tokenized inputs through BERT and return its final hidden states.

    Parameters:
    encodings (dict): Mapping with 'input_ids' and 'attention_mask' entries.
    bert_model (TFBertModel): Model called with the ids positionally and the
        mask as the ``attention_mask`` keyword.

    Returns:
    tf.Tensor: The ``last_hidden_state`` attribute of the model output.
    """
    token_ids = encodings['input_ids']
    mask = encodings['attention_mask']
    return bert_model(token_ids, attention_mask=mask).last_hidden_state

# Number of examples per batch; used both when extracting BERT embeddings
# and as the mini-batch size when training the LSTM classifier
batch_size = 16


In [None]:
def _embed_in_batches(encodings, bert_model, batch_size):
    """
    Compute BERT embeddings for tokenized inputs, batch_size rows at a time.

    Parameters:
    encodings (dict): Tokenizer output; every value is sliced along its first axis.
    bert_model (TFBertModel): Model handed to get_bert_embeddings.
    batch_size (int): Number of rows sent through the model per call.

    Returns:
    tf.Tensor: All batch embeddings concatenated along axis 0.
    """
    n_rows = len(encodings['input_ids'])
    chunks = []
    for start in range(0, n_rows, batch_size):
        # Slice every entry (ids, masks, ...) to the same window of rows
        batch = {key: val[start:start + batch_size] for key, val in encodings.items()}
        chunks.append(get_bert_embeddings(batch, bert_model))
    return tf.concat(chunks, axis=0)


# Embed both splits with the same routine so they are processed identically
X_train_embeddings = _embed_in_batches(X_train_encodings, bert_model, batch_size)
X_test_embeddings = _embed_in_batches(X_test_encodings, bert_model, batch_size)

# Compute 'balanced' class weights to counter class imbalance
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# compute_class_weight returns the weights in the order of `classes`, so key
# the dictionary by the actual class label rather than by position.
class_weights_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

# Print class weights for verification
print("Class weights:", class_weights_dict)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Define the model: three stacked bidirectional LSTMs over the per-token BERT
# embeddings, followed by a small dense head with a sigmoid output.
model = Sequential()

# First bidirectional LSTM (256 units); returns the full sequence for the next LSTM
model.add(Bidirectional(LSTM(units=256, return_sequences=True, input_shape=(X_train_embeddings.shape[1], X_train_embeddings.shape[2]))))
model.add(Dropout(0.5))

# Second bidirectional LSTM (64 units), still returning sequences
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(Dropout(0.3))

# Third bidirectional LSTM (64 units); returns only its final output
model.add(Bidirectional(LSTM(units=64)))
model.add(Dropout(0.2))

# Dense head
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

# Single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop when val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# Validation uses a slice of the TRAINING data (validation_split) instead of
# the test set. Early stopping with restore_best_weights picks the checkpoint
# by validation loss; validating on the test set would select the model on
# the very data used for the final report and inflate the reported score.
# Keras takes the last 10% of the samples before shuffling; the rows were
# already shuffled by train_test_split.
history = model.fit(
    X_train_embeddings, y_train,
    epochs=5,
    batch_size=batch_size,
    validation_split=0.1,
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
)

# Evaluate once on the untouched test set
test_loss, test_acc = model.evaluate(X_test_embeddings, y_test, verbose=2)
print(f'\nTest accuracy: {test_acc:.4f}')

# Predict probabilities for the test set and threshold them at 0.5.
# ravel() flattens the (n, 1) model output into the 1-D label vector that
# the scikit-learn metrics expect.
y_pred = model.predict(X_test_embeddings)
y_pred_classes = (y_pred > 0.5).astype("int32").ravel()

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred_classes)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Print classification report
print(classification_report(y_test, y_pred_classes, target_names=['Class 0', 'Class 1']))


WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.

Epoch 1/5
1102/1102 [==============================] - 443s 390ms/step - loss: 0.5381 - accuracy: 0.7320 - val_loss: 0.4959 - val_accuracy: 0.7947
Epoch 2/5
1102/1102 [==============================] - 424s 385ms/step - loss: 0.4686 - accuracy: 0.7822 - val_loss: 0.3700 - val_accuracy: 0.8142
Epoch 3/5
1102/1102 [==============================] - 426s 386ms/step - loss: 0.4434 - accuracy: 0.7869 - val_loss: 0.4812 - val_accuracy: 0.7661
Epoch 4/5
1102/1102 [==============================] - 426s 386ms/step - loss: 0.4171 - accuracy: 0.8071 - val_loss: 0.4239 - val_accuracy: 0.8169
Epoch 5/5
1102/1102 [==============================] - 427s 388ms/step - loss: 0.3905 - accuracy: 0.8200 - val_loss: 0.3814 - val_accuracy: 0.8373
138/138 - 7s - loss: 0.3700 - accuracy: 0.8142 - 7s/epoch - 48ms/step

Test accuracy: 0.8142014741897583
138/138 [==============================] - 8s 51ms/step

 precision    recall  f1-score   support

     Class 0       0.96      0.82      0.89      3874
     Class 1       0.37      0.78      0.51       534

    accuracy                           0.81      4408
   macro avg       0.67      0.80      0.70      4408
weighted avg       0.89      0.81      0.84      4408


In [None]:
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score,
                             recall_score, precision_score, f1_score, roc_auc_score,
                             roc_curve, auc)
import matplotlib.pyplot as plt

# Metrics computed from the thresholded class predictions
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
f1 = f1_score(y_test, y_pred_classes)

# Metrics computed from the raw prediction scores
roc_auc = roc_auc_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc_value = auc(fpr, tpr)

# Report the scalar metrics
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

# Draw the ROC curve together with the diagonal chance line
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc_value:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()


Accuracy: 0.8142014519056261
Recall: 0.7846441947565543
Precision: 0.37310774710596617
F1 Score: 0.5057332528666264
ROC AUC Score: 0.8777224616622099
Among all the deep learning models tried, the BERT with LSTM model gives the best result, with good accuracy and a notable recall of 78%. The metric I am prioritizing is recall, i.e. the proportion of actual positive instances that the model correctly identifies. Recall improved from 56% to 78%, and the ROC AUC score is also better than that of the finalized benchmark machine learning model. Based on these results, I finalized the BERT with LSTM model.