In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel  # 1. Import BERT-related modules
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available before it is read below
nltk.download('stopwords')

# English stop words, kept in a set so the per-token membership test is cheap
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one sentence before it is handed to the tokenizer.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted (not replaced by a space), and tokens present in
    the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    kept_tokens = [token for token in letters_only.split() if token not in stop_words]
    return ' '.join(kept_tokens)

# Read each split, discard rows with a missing value in any column, and
# replace the 'Sentence' column with its cleaned form
train_data = (
    pd.read_csv('/content/restrain.csv')
    .dropna()
    .assign(Sentence=lambda frame: frame['Sentence'].apply(preprocess_text))
)
test_data = (
    pd.read_csv('/content/restest.csv')
    .dropna()
    .assign(Sentence=lambda frame: frame['Sentence'].apply(preprocess_text))
)

# Pretrained WordPiece tokenizer matching the encoder used by the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every sentence of a split in one call; with padding=True each split
# is padded to the length of its own longest sequence
train_encodings = tokenizer(train_data['Sentence'].tolist(), padding=True, truncation=True)
test_encodings = tokenizer(test_data['Sentence'].tolist(), padding=True, truncation=True)

# Map 'Polarity' values to integer class ids; the mapping is learned from the
# training split only and then reused for the test split
encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_data['Polarity'])
test_labels = encoder.transform(test_data['Polarity'])

# Define custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer output with integer labels.

    Args:
        encodings: Mapping providing ``'input_ids'`` and ``'attention_mask'``,
            each indexable by sample position.
        labels: Sequence of class indices, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of ``torch.long`` tensors."""
        sample = {}
        for key in ('input_ids', 'attention_mask'):
            sample[key] = torch.tensor(self.encodings[key][idx], dtype=torch.long)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap both tokenized splits in Dataset objects
train_dataset, test_dataset = (
    CustomDataset(train_encodings, train_labels),
    CustomDataset(test_encodings, test_labels),
)

# Batches of 32; only the training split is reshuffled, evaluation order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Define GCN model with BERT embeddings
class GCN(nn.Module):
    """Pretrained BERT encoder followed by a two-layer classification head.

    Despite the class name, no graph-convolution layer is involved: the pooled
    BERT output goes through ``fc1`` -> ReLU -> ``fc2`` -> log-softmax.

    Args:
        input_dim: Width of the pooled BERT output fed to ``fc1``.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Pretrained encoder; its weights are trained together with the head
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return log-probabilities normalised over dim 1.

        The notebook pairs this model with ``nn.CrossEntropyLoss``, which
        applies log-softmax again; that is redundant but harmless because
        log-softmax leaves already-normalised log-probabilities unchanged.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = self.fc1(encoded.pooler_output).relu()
        logits = self.fc2(hidden)
        return logits.log_softmax(dim=1)

# Define training function
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch (the default reduction).
        device: Device the batch tensors are moved to.

    Returns:
        The mean training loss per sample over the whole pass.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a short final batch does not
        # count as much as a full one in the epoch average
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError('train_loader yielded no samples')
    return loss_sum / n_samples

# Define evaluation function
def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is arg-maxed over dim 1 to obtain class predictions.
        test_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch (the default reduction).
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, predictions, ground_truths)``: the mean
        loss per sample, the fraction of correct predictions, and flat lists
        of predicted and true class indices in loader order.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    n_samples = 0
    predictions = []
    ground_truths = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            predicted = outputs.argmax(dim=1)

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one in the reported average
            batch_size = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * batch_size
            n_samples += batch_size

            correct += (predicted == labels).sum().item()
            predictions.extend(predicted.cpu().tolist())
            ground_truths.extend(labels.cpu().tolist())

    # Samples are counted from the batches themselves, so any iterable of
    # batches works, not only a DataLoader exposing ``.dataset``
    if n_samples == 0:
        raise ValueError('test_loader yielded no samples')
    accuracy = correct / n_samples
    avg_loss = loss_sum / n_samples
    return avg_loss, accuracy, predictions, ground_truths

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's global RNG before the model is built so the randomly
# initialised classifier head and the shuffled batch order repeat from run to
# run (GPU kernels may still introduce small nondeterminism)
SEED = 42
torch.manual_seed(SEED)

# Initialize model, optimizer, and loss function
input_dim = 768  # BERT hidden size
hidden_dim = 128
output_dim = len(encoder.classes_)
model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)  # BERT recommended learning rate
criterion = nn.CrossEntropyLoss()

# Train and evaluate the model
# NOTE(review): the "best" checkpoint is chosen on the test split, so the
# best accuracy reported here is optimistically biased; an unbiased estimate
# would need a separate validation split.
num_epochs = 10
best_accuracy = 0.0
for epoch in range(1, num_epochs + 1):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_accuracy, predictions, ground_truths = evaluate(model, test_loader, criterion, device)
    print(f'Epoch {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

    # Support-weighted precision, recall and F1. zero_division=0 scores a
    # class that is never predicted as 0 without emitting a warning every epoch.
    precision = precision_score(ground_truths, predictions, average='weighted', zero_division=0)
    recall = recall_score(ground_truths, predictions, average='weighted', zero_division=0)
    f1 = f1_score(ground_truths, predictions, average='weighted', zero_division=0)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

    # Save the model with the best accuracy
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'best_gcn_model.pth')

# Save the weights from the final epoch (not necessarily the best ones)
torch.save(model.state_dict(), 'gcn_model.pth')


**Code 2**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available before it is read below
nltk.download('stopwords')

# English stop words, kept in a set so the per-token membership test is cheap
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Clean a single sentence ahead of tokenization.

    Steps, in order: lower-case the text, delete every character that is
    neither an ASCII letter nor whitespace, then discard tokens found in the
    module-level ``stop_words`` set.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    survivors = []
    for token in cleaned.split():
        if token not in stop_words:
            survivors.append(token)
    return ' '.join(survivors)

# Read each split, discard rows with a missing value in any column, and
# replace the 'Sentence' column with its cleaned form
train_data = (
    pd.read_csv('/content/tweet train (1).csv')
    .dropna()
    .assign(Sentence=lambda frame: frame['Sentence'].apply(preprocess_text))
)
test_data = (
    pd.read_csv('/content/tweet test (1).csv')
    .dropna()
    .assign(Sentence=lambda frame: frame['Sentence'].apply(preprocess_text))
)

# Pretrained WordPiece tokenizer matching the encoder used by the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every sentence of a split in one call; with padding=True each split
# is padded to the length of its own longest sequence
train_encodings = tokenizer(train_data['Sentence'].tolist(), padding=True, truncation=True)
test_encodings = tokenizer(test_data['Sentence'].tolist(), padding=True, truncation=True)

# Map 'Polarity' values to integer class ids; the mapping is learned from the
# training split only and then reused for the test split
encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_data['Polarity'])
test_labels = encoder.transform(test_data['Polarity'])

# Define custom dataset
class CustomDataset(Dataset):
    """Map-style dataset exposing tokenized sentences with their labels.

    Args:
        encodings: Mapping providing ``'input_ids'`` and ``'attention_mask'``,
            each indexable by sample position.
        labels: Sequence of class indices, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of ``torch.long`` tensors."""
        token_ids = self.encodings['input_ids'][idx]
        mask = self.encodings['attention_mask'][idx]
        label = self.labels[idx]
        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Wrap both tokenized splits in Dataset objects
train_dataset, test_dataset = (
    CustomDataset(train_encodings, train_labels),
    CustomDataset(test_encodings, test_labels),
)

# Batches of 32; only the training split is reshuffled, evaluation order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Define GCN model with BERT embeddings
class GCN(nn.Module):
    """Pretrained BERT encoder with dropout and a single linear classifier.

    Despite the class name, no graph-convolution layer is involved: the pooled
    BERT output goes through dropout -> ``fc`` -> log-softmax.

    Args:
        output_dim: Number of output classes.
    """

    def __init__(self, output_dim):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Dropout probability of 0.5 on the pooled representation (regularization)
        self.dropout = nn.Dropout(0.5)
        # The input width is read from the encoder config rather than hard-coded
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return log-probabilities normalised over dim 1.

        The notebook pairs this model with ``nn.CrossEntropyLoss``, which
        applies log-softmax again; that is redundant but harmless because
        log-softmax leaves already-normalised log-probabilities unchanged.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.fc(self.dropout(encoded.pooler_output))
        return logits.log_softmax(dim=1)

# Define training function with gradient clipping
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader`` with gradient clipping.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch (the default reduction).
        device: Device the batch tensors are moved to.

    Returns:
        The mean training loss per sample over the whole pass.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        # Rescale gradients so their global norm is at most 1.0 before the update
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight each batch mean by its size so a short final batch does not
        # count as much as a full one in the epoch average
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError('train_loader yielded no samples')
    return loss_sum / n_samples

# Define evaluation function
def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is arg-maxed over dim 1 to obtain class predictions.
        test_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch (the default reduction).
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, predictions, ground_truths)``: the mean
        loss per sample, the fraction of correct predictions, and flat lists
        of predicted and true class indices in loader order.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    n_samples = 0
    predictions = []
    ground_truths = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            predicted = outputs.argmax(dim=1)

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one in the reported average
            batch_size = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * batch_size
            n_samples += batch_size

            correct += (predicted == labels).sum().item()
            predictions.extend(predicted.cpu().tolist())
            ground_truths.extend(labels.cpu().tolist())

    # Samples are counted from the batches themselves, so any iterable of
    # batches works, not only a DataLoader exposing ``.dataset``
    if n_samples == 0:
        raise ValueError('test_loader yielded no samples')
    accuracy = correct / n_samples
    avg_loss = loss_sum / n_samples
    return avg_loss, accuracy, predictions, ground_truths

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's global RNG before the model is built so the randomly
# initialised classifier layer, dropout masks and shuffled batch order repeat
# from run to run (GPU kernels may still introduce small nondeterminism)
SEED = 42
torch.manual_seed(SEED)

# Initialize model, optimizer, and loss function
output_dim = len(encoder.classes_)
model = GCN(output_dim).to(device)

# AdamW with one parameter group per sub-module; weight_decay applies to both groups
optimizer = optim.AdamW([
    {'params': model.bert.parameters(), 'lr': 1e-5},  # Fine-tune BERT layers with a lower learning rate
    {'params': model.fc.parameters(), 'lr': 5e-5}     # Higher learning rate for the classification layer
], weight_decay=0.01)

criterion = nn.CrossEntropyLoss()

# Step-decay schedule (no warm-up phase): every 5 epochs the learning rate of
# each parameter group is multiplied by 0.1
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Train and evaluate the model
# NOTE(review): the "best" checkpoint is chosen on the test split, so the
# best accuracy reported here is optimistically biased; an unbiased estimate
# would need a separate validation split.
num_epochs = 10
best_accuracy = 0.0
for epoch in range(1, num_epochs + 1):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_accuracy, predictions, ground_truths = evaluate(model, test_loader, criterion, device)
    print(f'Epoch {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

    # Support-weighted precision, recall and F1. zero_division=0 scores a
    # class that is never predicted as 0 without emitting a warning every epoch.
    precision = precision_score(ground_truths, predictions, average='weighted', zero_division=0)
    recall = recall_score(ground_truths, predictions, average='weighted', zero_division=0)
    f1 = f1_score(ground_truths, predictions, average='weighted', zero_division=0)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

    # Advance the schedule once per epoch, after that epoch's optimizer updates
    scheduler.step()

    # Save the model with the best accuracy
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'best_gcn_model.pth')

# Save the weights from the final epoch (not necessarily the best ones)
torch.save(model.state_dict(), 'gcn_model.pth')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10, Train Loss: 0.9735, Test Loss: 0.8130, Test Accuracy: 0.6322
Precision: 0.6735, Recall: 0.6322, F1 Score: 0.6040
Epoch 2/10, Train Loss: 0.7969, Test Loss: 0.7782, Test Accuracy: 0.6588
Precision: 0.6574, Recall: 0.6588, F1 Score: 0.6574
Epoch 3/10, Train Loss: 0.6814, Test Loss: 0.7669, Test Accuracy: 0.6883
Precision: 0.6904, Recall: 0.6883, F1 Score: 0.6863
Epoch 4/10, Train Loss: 0.5615, Test Loss: 0.8448, Test Accuracy: 0.6588
Precision: 0.6766, Recall: 0.6588, F1 Score: 0.6589
Epoch 5/10, Train Loss: 0.4563, Test Loss: 0.8340, Test Accuracy: 0.6883
Precision: 0.6916, Recall: 0.6883, F1 Score: 0.6891
Epoch 6/10, Train Loss: 0.3337, Test Loss: 0.8403, Test Accuracy: 0.6972
Precision: 0.6973, Recall: 0.6972, F1 Score: 0.6973
Epoch 7/10, Train Loss: 0.3164, Test Loss: 0.8682, Test Accuracy: 0.6869
Precision: 0.6866, Recall: 0.6869, F1 Score: 0.6866
Epoch 8/10, Train Loss: 0.3001, Test Loss: 0.8996, Test Accuracy: 0.6869
Precision: 0.6873, Recall: 0.6869, F1 Score: 0.6870


# **DistilBERT**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available before it is read below
nltk.download('stopwords')

# English stop words, kept in a set so the per-token membership test is cheap
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Prepare a raw sentence for the tokenizer.

    The sentence is lower-cased, stripped of every character that is not an
    ASCII letter or whitespace, and filtered against the module-level
    ``stop_words`` set.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The tokens that remain, joined by single spaces.
    """
    normalized = text.lower()
    alphabetic = re.sub(r'[^a-zA-Z\s]', '', normalized)
    content_words = filter(lambda word: word not in stop_words, alphabetic.split())
    return ' '.join(content_words)

# Read each split, discard rows with a missing value in any column, and
# replace the 'Sentence' column with its cleaned form
train_data = (
    pd.read_csv('/content/restrain.csv')  # Replace with path to your training data
    .dropna()
    .assign(Sentence=lambda frame: frame['Sentence'].apply(preprocess_text))
)
test_data = (
    pd.read_csv('/content/restest.csv')  # Replace with path to your testing data
    .dropna()
    .assign(Sentence=lambda frame: frame['Sentence'].apply(preprocess_text))
)

# Pretrained tokenizer matching the DistilBERT encoder used by the model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode every sentence of a split in one call; with padding=True each split
# is padded to the length of its own longest sequence
train_encodings = tokenizer(train_data['Sentence'].tolist(), padding=True, truncation=True)
test_encodings = tokenizer(test_data['Sentence'].tolist(), padding=True, truncation=True)

# Map 'Polarity' values to integer class ids; the mapping is learned from the
# training split only and then reused for the test split
encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_data['Polarity'])
test_labels = encoder.transform(test_data['Polarity'])

# Custom Dataset
class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer output with integer labels.

    Args:
        encodings: Mapping from field name to a per-sample sequence; every
            field it contains is returned for each sample.
        labels: Sequence of class indices, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx``: one tensor per encoding field plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are stored as torch.long
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap both tokenized splits in Dataset objects
train_dataset, test_dataset = (
    CustomDataset(train_encodings, train_labels),
    CustomDataset(test_encodings, test_labels),
)

# Batches of 32; only the training split is reshuffled, evaluation order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Define GCN model with DistilBERT embeddings
class GCN(nn.Module):
    """Pretrained DistilBERT encoder followed by a two-layer classification head.

    Despite the class name, no graph-convolution layer is involved: the
    first-position hidden state goes through ``fc1`` -> ReLU -> ``fc2`` ->
    log-softmax.

    Args:
        input_dim: Width of the encoder hidden state fed to ``fc1``.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return log-probabilities normalised over dim 1.

        The notebook pairs this model with ``nn.CrossEntropyLoss``, which
        applies log-softmax again; that is redundant but harmless because
        log-softmax leaves already-normalised log-probabilities unchanged.
        """
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0
        # (the [CLS] token embedding)
        sequence_states = encoded[0]
        cls_embedding = sequence_states[:, 0, :]
        hidden = self.fc1(cls_embedding).relu()
        return self.fc2(hidden).log_softmax(dim=1)

# Define training function
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch (the default reduction).
        device: Device the batch tensors are moved to.

    Returns:
        The mean training loss per sample over the whole pass.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a short final batch does not
        # count as much as a full one in the epoch average
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError('train_loader yielded no samples')
    return loss_sum / n_samples

# Define evaluation function
def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is arg-maxed over dim 1 to obtain class predictions.
        test_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch (the default reduction).
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, predictions, ground_truths)``: the mean
        loss per sample, the fraction of correct predictions, and flat lists
        of predicted and true class indices in loader order.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    n_samples = 0
    predictions = []
    ground_truths = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            predicted = outputs.argmax(dim=1)

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one in the reported average
            batch_size = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * batch_size
            n_samples += batch_size

            # Accuracy is tallied in the same pass as the loss
            correct += (predicted == labels).sum().item()
            predictions.extend(predicted.cpu().tolist())
            ground_truths.extend(labels.cpu().tolist())

    if n_samples == 0:
        raise ValueError('test_loader yielded no samples')
    accuracy = correct / n_samples
    avg_loss = loss_sum / n_samples
    return avg_loss, accuracy, predictions, ground_truths

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's global RNG before the model is built so the randomly
# initialised classifier head and the shuffled batch order repeat from run to
# run (GPU kernels may still introduce small nondeterminism)
SEED = 42
torch.manual_seed(SEED)

# Initialize model, optimizer, and loss function
input_dim = 768  # DistilBERT hidden size
hidden_dim = 128
output_dim = len(np.unique(train_labels))  # number of distinct classes in the training labels
model = GCN(input_dim, hidden_dim, output_dim).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)  # Recommended learning rate for DistilBERT
criterion = nn.CrossEntropyLoss()

# Train and evaluate the model
# NOTE(review): the "best" checkpoint is chosen on the test split, so the
# best accuracy reported here is optimistically biased; an unbiased estimate
# would need a separate validation split.
num_epochs = 10
best_accuracy = 0.0
for epoch in range(1, num_epochs + 1):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    # Per-sample predictions and labels are returned as well but not used in this cell
    test_loss, test_accuracy, _, _ = evaluate(model, test_loader, criterion, device)
    print(f'Epoch {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

    # Save the model with the best accuracy
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'best_gcn_model.pth')

# Save the weights from the final epoch (not necessarily the best ones)
torch.save(model.state_dict(), 'gcn_model.pth')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10, Train Loss: 0.7667, Test Loss: 0.5481, Test Accuracy: 0.7909
Epoch 2/10, Train Loss: 0.5480, Test Loss: 0.6345, Test Accuracy: 0.7685
Epoch 3/10, Train Loss: 0.4457, Test Loss: 0.6465, Test Accuracy: 0.7802
Epoch 4/10, Train Loss: 0.3796, Test Loss: 0.6271, Test Accuracy: 0.7659
Epoch 5/10, Train Loss: 0.3210, Test Loss: 0.6151, Test Accuracy: 0.7668
Epoch 6/10, Train Loss: 0.2676, Test Loss: 0.6443, Test Accuracy: 0.7784
Epoch 7/10, Train Loss: 0.2457, Test Loss: 0.6717, Test Accuracy: 0.7721
Epoch 8/10, Train Loss: 0.2347, Test Loss: 0.6966, Test Accuracy: 0.7784
Epoch 9/10, Train Loss: 0.2259, Test Loss: 0.7703, Test Accuracy: 0.7623
Epoch 10/10, Train Loss: 0.2098, Test Loss: 0.8504, Test Accuracy: 0.7507
