In [None]:
!pip install transformers



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the annotated dataset from 'demo.csv' (read with ISO-8859-1 encoding)
df = pd.read_csv('demo.csv', encoding='ISO-8859-1')

# Report the number of missing values per column before filling them.
# (A bare expression in the middle of a cell is not displayed, so print it.)
print(df.isna().sum())

# Fill gaps with a value that matches each column's type: 0 for numeric
# columns (e.g. the 0/1 label flags) and '' for every other column, so that
# text columns never end up holding the integer 0.
numeric_cols = df.select_dtypes(include='number').columns
df = df.fillna({col: 0 for col in numeric_cols}).fillna('')
df.head()


Unnamed: 0,Post,Perception,Post/Comment/Reply,Positive,Neutral,Negative,Different Topic
0,"Many companies, including government, are talk...",Concerns About Fairness: diversity hiring may ...,post,0.0,1.0,0.0,0.0
1,People who do not benefit from DEI hiring have...,Concerns About Fairness: diversity hiring may ...,comment,0.0,0.0,1.0,0.0
2,Remember when judging people based on skin col...,Concerns About Fairness: diversity hiring may ...,comment,1.0,0.0,0.0,0.0
3,What's not ethical is having teams of all one ...,Concerns About Fairness: diversity hiring may ...,comment,1.0,0.0,0.0,0.0
4,"Yes, it is ethical to put programs in place to...",1. Provides equitable opportunities: Diversity...,comment,0.0,0.0,1.0,0.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR
import torch
from tqdm import tqdm


# Columns holding the per-class 0/1 annotation of each row
label_columns = ['Positive', 'Neutral', 'Negative', 'Different Topic']

# Combine 'Post' and 'Perception' into a single text input. Both columns are
# cast to str first so a non-string cell (for example a filled-in missing
# value) cannot break the concatenation with a TypeError.
combined_data = df['Post'].astype(str) + " " + df['Perception'].astype(str)

# Split the dataset into training and testing sets (80/20, fixed seed)
train_data, test_data, train_labels, test_labels = train_test_split(
    combined_data, df[label_columns],
    test_size=0.2, random_state=42
)

# Collapse the four flag columns into one 'combined_labels' column that names
# the column holding the row maximum. idxmax(axis=1) is the vectorized form of
# the row-wise apply, and assign() builds new frames instead of writing into
# the frames handed back by train_test_split (avoids SettingWithCopyWarning).
# NOTE(review): a row whose flags are all 0 is labelled 'Positive' (the first
# column wins ties) - confirm that no such rows exist in the data.
train_labels = train_labels.assign(combined_labels=train_labels.idxmax(axis=1))
test_labels = test_labels.assign(combined_labels=test_labels.idxmax(axis=1))

# Pre-trained checkpoint shared by the tokenizer and the 4-class classifier
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Tokenize both splits with the same settings: truncate, pad, and return
# PyTorch tensors.
tokenizer_options = dict(truncation=True, padding=True, return_tensors="pt")
train_encodings = tokenizer(train_data.astype(str).tolist(), **tokenizer_options)
test_encodings = tokenizer(test_data.astype(str).tolist(), **tokenizer_options)

# Map the class names to integer ids. The encoder is fitted on the training
# labels only and then applied to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels['combined_labels'].astype(str))
train_labels_numeric = label_encoder.transform(train_labels['combined_labels'].astype(str))
test_labels_numeric = label_encoder.transform(test_labels['combined_labels'].astype(str))

train_labels_tensor = torch.tensor(train_labels_numeric, dtype=torch.long)
test_labels_tensor = torch.tensor(test_labels_numeric, dtype=torch.long)


def _as_tensor_dataset(encodings, label_tensor):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)


# Datasets and loaders; only the training loader shuffles its samples
train_dataset = _as_tensor_dataset(train_encodings, train_labels_tensor)
test_dataset = _as_tensor_dataset(test_encodings, test_labels_tensor)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Optimizer (AdamW, weight decay 0.01) and a step schedule that multiplies the
# learning rate by 0.1 every 5 epochs.
# NOTE(review): `AdamW` here is the class imported from transformers, which is
# deprecated upstream in favour of torch.optim.AdamW - confirm the installed
# transformers version still provides it.
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

# The loss is computed explicitly from the model's logits
loss_fn = torch.nn.CrossEntropyLoss()

# Number of full passes over the training set
num_fine_tune_epochs = 10

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_fine_tune_epochs):
    model.train()
    progress = tqdm(train_loader, desc=f"Fine-Tuning Epoch {epoch + 1}/{num_fine_tune_epochs}")
    for batch in progress:
        # Move the three tensors of the batch onto the training device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

    # The schedule advances once per epoch, after all batches were processed
    scheduler.step()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_bert_model_optimized")

# Load the fine-tuned model back from disk and move it to the active device
model = BertForSequenceClassification.from_pretrained("fine_tuned_bert_model_optimized")
model.to(device)  # Make sure the model is on the correct device

# Evaluation: collect the predicted and the true class ids of the test split
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Print classification report.
# The integer ids were produced by `label_encoder`, which numbers the classes
# in sorted order, so the display names must come from `label_encoder.classes_`
# rather than from a hand-written list in a different order. Passing `labels`
# explicitly keeps the report aligned even if a class is absent from the
# test split.
class_names = [str(name) for name in label_encoder.classes_]
classification_rep = classification_report(
    all_labels, all_preds,
    labels=list(range(len(class_names))),
    target_names=class_names,
)
print('Classification Report:\n', classification_rep)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fine-Tuning Epoch 1/10: 100%|██████████| 84/84 [01:47<00:00,  1.28s/it]
Fine-Tuning Epoch 2/10: 100%|██████████| 84/84 [01:51<00:00,  1.33s/it]
Fine-Tuning Epoch 3/10: 100%|██████████| 84/84 [01:52<00:00,  1.34s/it]
Fine-Tuning Epoch 4/10: 100%|██████████| 84/84 [01:53<00:00,  1.35s/it]
Fine-Tuning Epoch 5/10: 100%|██████████| 84/84 [01:53<00:00,  1.35s/it]
Fine-Tuning Epoch 6/10: 100%|██████████| 84/84 [01:53<00:00,  1.35s/it]
Fine-Tuning Epoch 7/10: 100%|██████████| 84/84 [01:53<00:00,  1.35s/it]
Fine-Tuning Epoch 8/10: 100%|██████████| 84/84 [01:53<00:00,  1.35s/it]
Fine-Tuning Epoch 9/10: 100%|██████████| 84/84 [01:53<00:00,  1.36s/it]
Fine-Tuning Epoch 10/10: 100%|██████████| 84/84 [01:53<00:0

Classification Report:
                  precision    recall  f1-score   support

       Positive       0.78      0.84      0.81       184
        Neutral       0.22      0.27      0.24        41
       Negative       0.34      0.29      0.31        56
Different Topic       0.31      0.24      0.27        55

       accuracy                           0.58       336
      macro avg       0.41      0.41      0.41       336
   weighted avg       0.56      0.58      0.57       336






In [None]:
new_post = "it is reverse discrimination to prefer minorities over skilled people"
new_perception = "1.\tThe Value of Diversity: diversity hiring helps bringing diverse perspective to enhance business performance."

# Combine 'Post' and 'Perception' for inference, joined by a single space in
# the same way as the training text.
new_post_perception_pair = f"{new_post} {new_perception}"

# Tokenize the new input and move the resulting tensors to the model's device
new_input_encoding = tokenizer(new_post_perception_pair, truncation=True, padding=True, return_tensors="pt")
new_input_encoding = {key: val.to(device) for key, val in new_input_encoding.items()}

# Switch to inference mode explicitly so the prediction does not depend on
# which cell last touched the model (dropout must be disabled here).
model.eval()

# Forward pass on the new input (no gradients needed)
with torch.no_grad():
    new_outputs = model(**new_input_encoding)

# Obtain the predicted class id
predicted_labels = torch.argmax(new_outputs.logits, dim=1).item()

# Convert the predicted id back to the original label name
predicted_label_str = label_encoder.classes_[predicted_labels]

print(f"The predicted label for the new post-perception pair is: {predicted_label_str}")


The predicted label for the new post-perception pair is: Negative
