In [1]:
# Install required libraries
!pip install gensim transformers torch scikit-learn huggingface_hub

import pandas as pd
from google.colab import files
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from huggingface_hub import hf_hub_download, HfApi, notebook_login

# Step 1: Upload the CSV file containing your dataset
uploaded = files.upload()  # Colab upload widget; the result is keyed by uploaded file name
if not uploaded:
    # With no uploaded file, next(iter(uploaded)) would raise a bare StopIteration;
    # fail with an actionable message instead.
    raise ValueError("No file was uploaded; please upload the dataset CSV and re-run this cell.")
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset





Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv


In [5]:
# Step 1: Install required libraries
!pip install tensorflow nltk tqdm

# Step 2: Import necessary libraries
import numpy as np
from transformers import  AdamW
from tensorflow.keras.models import load_model
import nltk
from tqdm import tqdm  # For progress bar in the training loop

# Step 3: Download necessary NLTK data
# 'punkt' serves older NLTK releases; newer releases (3.8.2+) make word_tokenize
# load the 'punkt_tab' resource instead, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')


# Step 5: Preprocess the data (tokenize the text)
def preprocess(text):
    """Lower-case ``text``, tokenize it with NLTK, and return the tokens joined by single spaces."""
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return ' '.join(tokens)

# Normalise every review text (lower-case + NLTK tokenization) and write it back to the same column
df['content'] = df['content'].apply(preprocess)

# Step 6: Fetch the fine-tuned GloVe Keras model file from the Hugging Face Hub, then load it
glove_model_path = hf_hub_download(
    repo_id="RafidMehda/glove_fine-tuned_model",
    filename="glove_fine_tuned_model.h5",
)
glove_model = load_model(glove_model_path)

# The first weight array of the layer named 'embedding' is used as the embedding matrix
embedding_layer = glove_model.get_layer('embedding')
embedding_matrix = embedding_layer.get_weights()[0]

# Step 7: Define a function to get GloVe embeddings from the model
def get_glove_embeddings(text, word_index=None):
    """Return the average embedding vector of the tokens in ``text``.

    The previous implementation assigned the embedding layer's constant
    ``input_dim`` to every token instead of a per-token index, so the result
    never depended on the text. Tokens are now resolved through ``word_index``.

    Args:
        text: Input string; it is lower-cased and split on whitespace.
        word_index: Optional mapping from token to its row in the module-level
            ``embedding_matrix`` (NOTE(review): this should be the vocabulary
            the GloVe model was trained with - confirm where it is stored).
            When omitted, no token can be resolved and a zero vector is
            returned with a warning.

    Returns:
        A 1-D ``torch.Tensor`` of length ``embedding_matrix.shape[1]``: the mean
        of the rows for all tokens found in ``word_index`` (with an in-range
        index), or zeros when none are found.
    """
    vocab_size, embedding_dim = embedding_matrix.shape[0], embedding_matrix.shape[1]

    if word_index is None:
        import warnings
        warnings.warn(
            "get_glove_embeddings was called without a word_index; "
            "returning a zero vector because tokens cannot be mapped to rows.",
            stacklevel=2,
        )
        return torch.zeros(embedding_dim)

    valid_embeddings = []
    for token in text.lower().split():
        token_index = word_index.get(token)
        # Skip out-of-vocabulary tokens and indices outside the matrix.
        if token_index is not None and 0 <= token_index < vocab_size:
            valid_embeddings.append(embedding_matrix[token_index])

    if valid_embeddings:
        return torch.tensor(np.mean(valid_embeddings, axis=0))  # Average pooling
    return torch.zeros(embedding_dim)  # Zero vector if no valid tokens

# Step 8: Map labels and prepare data
LABEL_MAP = {'F': 1, 'NF': 0}  # 1 for Functional, 0 for Non-Functional
mapped_labels = df['RequirementType'].map(LABEL_MAP)
if mapped_labels.isna().any():
    # .map() turns any value missing from LABEL_MAP into NaN; stop here with the
    # offending values rather than passing NaN/float labels on to training.
    unexpected_values = df.loc[mapped_labels.isna(), 'RequirementType'].unique().tolist()
    raise ValueError(
        f"Unexpected RequirementType values (expected 'F' or 'NF'): {unexpected_values}"
    )
df['label'] = mapped_labels.astype(int)

# Step 9: Split the dataset into training, validation, and testing sets
# 20% is held out for test; 25% of the remaining 80% becomes validation (60/20/20 overall).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, valid_df = train_test_split(train_df, test_size=0.25, random_state=42)

# Step 10: Tokenize the text data for DistilBERT with reduced max length
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
MAX_LENGTH = 128  # Longer sequences are truncated to this many tokens


def encode_texts(frame):
    """Tokenize the 'content' column of ``frame`` with truncation and padding."""
    return tokenizer(list(frame['content']), truncation=True, padding=True, max_length=MAX_LENGTH)


train_encodings = encode_texts(train_df)
valid_encodings = encode_texts(valid_df)
test_encodings = encode_texts(test_df)

# Step 11: Prepare PyTorch datasets
class GloVeBERTDataset(torch.utils.data.Dataset):
    """Dataset that serves one example at a time from tokenizer encodings and a parallel label list."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, measured by the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: every encoding field plus 'labels'."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = GloVeBERTDataset(train_encodings, train_df['label'].tolist())
valid_dataset = GloVeBERTDataset(valid_encodings, valid_df['label'].tolist())
test_dataset = GloVeBERTDataset(test_encodings, test_df['label'].tolist())

# Step 12: Load the pre-trained DistilBERT model for fine-tuning
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Step 13: Build the data loaders for fine-tuning
# NOTE: the datasets contain only the DistilBERT tokenizer encodings and labels;
# the GloVe embeddings loaded earlier are not part of the model input.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)  # Reduced batch size
valid_loader = DataLoader(valid_dataset, batch_size=4)

# Optimizer
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers version
# so the optimisation hyper-parameters stay what they were.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Set the device to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Training on: {device}")
model.to(device)

# Mixed precision setup (optional)
from torch.cuda.amp import autocast, GradScaler
# Gradient scaling only applies to CUDA mixed precision; disable it explicitly on CPU.
scaler = GradScaler(enabled=device.type == 'cuda')

# Step 14: Training loop with progress bar and mixed precision, adding metrics evaluation
for epoch in range(3):  # 3 epochs for demonstration
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        # Mixed precision is only enabled on CUDA; on CPU the forward pass runs in full precision.
        with torch.autocast(device_type=device.type, enabled=device.type == 'cuda'):
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

        total_loss += loss.item()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Store predictions and labels
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # Show the mean loss over the batches processed so far. Dividing by
        # len(train_loader) here would under-report the loss until the last step.
        progress_bar.set_postfix(loss=total_loss / step)

    # Calculate F1, Precision, Recall, and Accuracy for training
    print(f"\nEpoch {epoch+1} - Training Results:")
    print(classification_report(all_labels, all_preds, digits=4))

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

    # Validation loop (no gradient tracking, dropout disabled via eval mode)
    model.eval()
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in valid_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)

            preds = outputs.logits.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Calculate F1, Precision, Recall, and Accuracy for validation
    print(f"\nEpoch {epoch+1} - Validation Results:")
    print(classification_report(val_labels, val_preds, digits=4))

# Step 15: Evaluate the model on the test set
test_loader = DataLoader(test_dataset, batch_size=4)
test_preds = []
test_labels = []
# Switch to eval mode explicitly instead of relying on the state the last
# validation pass happened to leave the model in.
model.eval()
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)

        preds = outputs.logits.argmax(dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Calculate F1, Precision, Recall, and Accuracy for the test set
print("\nTest Results:")
print(classification_report(test_labels, test_preds, digits=4))





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Training on: cuda


  with autocast():  # Enable mixed precision
Epoch 1: 100%|██████████| 1875/1875 [01:31<00:00, 20.39it/s, loss=0.181]



Epoch 1 - Training Results:
              precision    recall  f1-score   support

           0     0.9358    0.9414    0.9386      4181
           1     0.9256    0.9186    0.9221      3316

    accuracy                         0.9313      7497
   macro avg     0.9307    0.9300    0.9303      7497
weighted avg     0.9313    0.9313    0.9313      7497

Epoch 1, Loss: 0.18052895596822102

Epoch 1 - Validation Results:
              precision    recall  f1-score   support

           0     0.9631    0.9794    0.9712      1360
           1     0.9749    0.9552    0.9650      1139

    accuracy                         0.9684      2499
   macro avg     0.9690    0.9673    0.9681      2499
weighted avg     0.9685    0.9684    0.9684      2499



  with autocast():  # Enable mixed precision
Epoch 2: 100%|██████████| 1875/1875 [01:32<00:00, 20.18it/s, loss=0.0777]



Epoch 2 - Training Results:
              precision    recall  f1-score   support

           0     0.9784    0.9861    0.9823      4181
           1     0.9823    0.9726    0.9774      3316

    accuracy                         0.9801      7497
   macro avg     0.9804    0.9793    0.9798      7497
weighted avg     0.9801    0.9801    0.9801      7497

Epoch 2, Loss: 0.07769022617340088

Epoch 2 - Validation Results:
              precision    recall  f1-score   support

           0     0.9890    0.9875    0.9882      1360
           1     0.9851    0.9868    0.9860      1139

    accuracy                         0.9872      2499
   macro avg     0.9870    0.9872    0.9871      2499
weighted avg     0.9872    0.9872    0.9872      2499



  with autocast():  # Enable mixed precision
Epoch 3: 100%|██████████| 1875/1875 [01:33<00:00, 20.11it/s, loss=0.0331]



Epoch 3 - Training Results:
              precision    recall  f1-score   support

           0     0.9914    0.9940    0.9927      4181
           1     0.9924    0.9891    0.9908      3316

    accuracy                         0.9919      7497
   macro avg     0.9919    0.9916    0.9918      7497
weighted avg     0.9919    0.9919    0.9919      7497

Epoch 3, Loss: 0.033144462092717486

Epoch 3 - Validation Results:
              precision    recall  f1-score   support

           0     0.9898    0.9941    0.9919      1360
           1     0.9929    0.9877    0.9903      1139

    accuracy                         0.9912      2499
   macro avg     0.9913    0.9909    0.9911      2499
weighted avg     0.9912    0.9912    0.9912      2499


Test Results:
              precision    recall  f1-score   support

           0     0.9922    0.9964    0.9943      1402
           1     0.9954    0.9900    0.9927      1097

    accuracy                         0.9936      2499
   macro avg     

In [7]:
# Step 16: Save and upload the merged model to Hugging Face
model.save_pretrained('./merge_glove_distilbert_model')
tokenizer.save_pretrained('./merge_glove_distilbert_model')

!pip install huggingface_hub

from huggingface_hub import notebook_login
notebook_login()

from huggingface_hub import HfApi
api = HfApi()

# Upload the model to your Hugging Face repository
api.upload_folder(
    folder_path='./merge_glove_distilbert_model',  # Folder containing the fine-tuned model
    repo_id='RafidMehda/merge_glove_distilbert_model',  # Hugging Face repository name
    repo_type='model'  # Specify that it's a model repository
)



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RafidMehda/merge_glove_distilbert_model/commit/0291ca19115e119bbb502cc48cdaeda6776f255b', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0291ca19115e119bbb502cc48cdaeda6776f255b', pr_url=None, pr_revision=None, pr_num=None)