In [1]:
!pip install datasets
!pip install transformers
!pip install transformers datasets torch scikit-learn



In [3]:
from datasets import load_dataset
my_cache_dir = "/home/nise/Jinhui/NLP"

# Load the Web of Science dataset
dataset = load_dataset("river-martin/web-of-science-with-label-texts", cache_dir=my_cache_dir)

# Explore the dataset
print("Dataset structure:")
print(dataset)

# Access the training, validation, and test sets
train_data = dataset['train']
test_data = dataset['test']
validate_data = dataset['validate']

# Inspect a sample
sample = train_data[0]
print("\nSample entry:")
print(sample)

# Example of preprocessing the text for classification
def preprocess_data(example):
    # Use 'abstract' as input text and 'area' as a primary label
    return {"input_text": example["abstract"], "area": example["area"], "domain": example["domain"]}

# Apply preprocessing
train_data = train_data.map(preprocess_data)

# Check processed data
print("\nProcessed data:")
print(train_data[0])


Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['abstract', 'domain', 'area'],
        num_rows: 32890
    })
    validate: Dataset({
        features: ['abstract', 'domain', 'area'],
        num_rows: 7048
    })
    test: Dataset({
        features: ['abstract', 'domain', 'area'],
        num_rows: 7047
    })
})

Sample entry:
{'abstract': "Along with the thorough development of underground engineering, people pay more attention on the long-term deformation stability of the tunnels. However, creep characteristic is an important factor to arouse surrounding rock deformation failures and control engineering designs. Phyllite is a common soft rock in engineering practice, when sample is saturated, water will not only impose the softening effect on phyllite, but also it will influence their creep mechanical properties under the long-term loads. In this paper, selecting samples from headrace tunnel GaoPingpu hydropower station in China, we carried out the shear c

In [4]:
# Access the training, validation, and test sets
train_data = dataset['train']
test_data = dataset['test']


all_domains = set()
all_areas = set()

for example in train_data:
    all_domains.add(example['domain'])
    all_areas.add(example['area'])

for example in test_data:
    all_domains.add(example['domain'])
    all_areas.add(example['area'])

print("All Domains:")
for domain in sorted(list(all_domains)):
    print(domain)

print("\nAll Areas:")
for area in sorted(list(all_areas)):
    print(area)

All Domains:
CS
Civil
ECE
MAE
Medical
Psychology
biochemistry

All Areas:
Addiction
Algorithm design
Allergies
Alzheimer's Disease
Ambient Intelligence
Analog signal processing
Ankylosing Spondylitis
Antisocial personality disorder
Anxiety
Asthma
Atopic Dermatitis
Atrial Fibrillation
Attention
Autism
Bamboo as a Building Material
Bioinformatics
Bipolar Disorder
Birth Control
Borderline personality disorder
Cancer
Cell biology
Child abuse
Children's Health
Computer graphics
Computer programming
Computer vision
Construction Management
Control engineering
Crohn's Disease
Cryptography
DNA/RNA sequencing
Data structures
Dementia
Depression
Diabetes
Digestive Health
Digital control
Distributed computing
Eating disorders
Electric motor
Electrical circuits
Electrical generator
Electrical network
Electricity
Emergency Contraception
Enzymology
False memories
Fluid mechanics
Fungal Infection
Gender roles
Genetics
Geotextile
Green Building
HIV/AIDS
Headache
Healthy Sleep
Heart Disease
Hepatitis C


In [5]:
unique_labels = set(train_data['domain'] + validate_data['domain'] + test_data['domain'])
label_to_id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("\nLabel mapping:")
print(label_to_id)

# Example of preprocessing the text for classification
def preprocess_data(example):
    # Use 'abstract' as input text and convert 'domain' to integer ID
    return {
        "text": example["abstract"],
        "label": label_to_id[example["domain"]],
    }

# Apply preprocessing
train_data = train_data.map(preprocess_data)
validate_data = validate_data.map(preprocess_data)
test_data = test_data.map(preprocess_data)

# Check processed data
print("\nProcessed training data:")
print(train_data[0])

print("\nProcessed validation data:")
print(validate_data[0])

print("\nProcessed test data:")
print(test_data[0])


Label mapping:
{'CS': 0, 'Civil': 1, 'ECE': 2, 'MAE': 3, 'Medical': 4, 'Psychology': 5, 'biochemistry': 6}


Map:   0%|          | 0/32890 [00:00<?, ? examples/s]

Map:   0%|          | 0/7048 [00:00<?, ? examples/s]

Map:   0%|          | 0/7047 [00:00<?, ? examples/s]


Processed training data:
{'abstract': "Along with the thorough development of underground engineering, people pay more attention on the long-term deformation stability of the tunnels. However, creep characteristic is an important factor to arouse surrounding rock deformation failures and control engineering designs. Phyllite is a common soft rock in engineering practice, when sample is saturated, water will not only impose the softening effect on phyllite, but also it will influence their creep mechanical properties under the long-term loads. In this paper, selecting samples from headrace tunnel GaoPingpu hydropower station in China, we carried out the shear creep test of saturated phyllite along the schistosity surface under different stress levels. Further, we systematically analyzed and compared the test results with the samples in air-dried conditions, and found that water has great effects on the strength of phyllite: the long-term strength and yield stress of saturated phyllite 

In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from datasets import Dataset

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir=my_cache_dir)

# Example training data (replace this with your data)
# texts = ["Example text 1", "Another example text"]
# labels = [0, 1]  # Binary classification: 0 or 1
texts = train_data['abstract']
labels = []
for example in train_data['domain']:
  labels.append(label_to_id[example])
num_labels = 7

# Tokenize and prepare the dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

data = {"text": texts, "label": labels}
dataset = Dataset.from_dict(data)
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset = encoded_dataset.with_format("torch")

train_loader = DataLoader(encoded_dataset, batch_size=8, shuffle=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/32890 [00:00<?, ? examples/s]

In [7]:
import torch
import torch.nn as nn
from transformers import BertPreTrainedModel, BertModel

class CustomBERTClassifier(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)

        # Freeze all parameters of the BERT model
        for param in self.bert.parameters():
            param.requires_grad = False

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 512),
            nn.LeakyReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.1),
            nn.Linear(256, config.num_labels)
        )
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        # Pass inputs through the BERT encoder
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {
            "loss": loss,
            "logits": logits,
        }

# Example Usage
from transformers import BertTokenizer, BertConfig

# Load tokenizer and configuration
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir=my_cache_dir)
config = BertConfig.from_pretrained('bert-base-uncased', cache_dir=my_cache_dir, num_labels=num_labels)  # Example: 3 classes

# Initialize the custom model
model = CustomBERTClassifier(config)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Print the model architecture
print(model)

CustomBERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [None]:
model = CustomBERTClassifier.from_pretrained("./bert-base-uncased-finetuned")
tokenizer = BertTokenizer.from_pretrained("./bert-base-uncased-finetuned")

In [8]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler for learning rate
num_training_steps = 1 * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train()

epochs = 5
global_step = 0  # Counter for batches
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in train_loader:
        optimizer.zero_grad()

        # Move data to device
        batch = {k: v.to(device) for k, v in batch.items() if k in ["input_ids", "attention_mask", "label"]}

        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
        loss = outputs["loss"]
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        global_step += 1
        if global_step == 10:
            print(f"Step {global_step}, Loss: {loss.item():.4f}")
        if global_step % 1000 == 0:
            print(f"Step {global_step}, Loss: {loss.item():.4f}")

# Save the model
model.save_pretrained("./bert-base-uncased-finetuned")
tokenizer.save_pretrained("./bert-base-uncased-finetuned")




Epoch 1/5
Step 1000, Loss: 1.8501
Step 2000, Loss: 1.8343
Step 3000, Loss: 2.0113
Step 4000, Loss: 1.9432
Epoch 2/5
Step 5000, Loss: 1.8695
Step 6000, Loss: 1.8633
Step 7000, Loss: 1.9215
Step 8000, Loss: 1.8074
Epoch 3/5
Step 9000, Loss: 2.0940
Step 10000, Loss: 2.1281
Step 11000, Loss: 2.0272
Step 12000, Loss: 1.7167
Epoch 4/5
Step 13000, Loss: 1.8289
Step 14000, Loss: 1.9971
Step 15000, Loss: 1.9872
Step 16000, Loss: 1.7524
Epoch 5/5
Step 17000, Loss: 1.9444
Step 18000, Loss: 1.9463
Step 19000, Loss: 1.8997
Step 20000, Loss: 1.8624


('./bert-base-uncased-finetuned/tokenizer_config.json',
 './bert-base-uncased-finetuned/special_tokens_map.json',
 './bert-base-uncased-finetuned/vocab.txt',
 './bert-base-uncased-finetuned/added_tokens.json')

In [None]:
# Save the model
model.save_pretrained("./bert-base-uncased-finetuned")
tokenizer.save_pretrained("./bert-base-uncased-finetuned")

In [13]:
# Example usage of the model for inference
def predict_domain(abstract_text, model, tokenizer, label_to_id, id_to_label):
    # Tokenize the input text
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move data to device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted probabilities
    logits = outputs["logits"]
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()
    predicted_class_label = id_to_label[predicted_class_id]

    # Print predicted probabilities for all classes
    print("Predicted Probabilities:")
    for i, label in enumerate(id_to_label):
      print(f"{id_to_label[i]}: {probabilities[0][i]:.4f}")

    return predicted_class_label


# Move model to device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

model.eval()

# Example inference
abstract = "This is an example abstract about computer science."  # Replace with actual abstract text

predicted_domain = predict_domain(abstract, model, tokenizer, label_to_id, id_to_label)
print(f"Predicted domain: {predicted_domain}")

Predicted Probabilities:
CS: 0.1643
Civil: 0.1152
ECE: 0.1521
MAE: 0.1032
Medical: 0.1619
Psychology: 0.1636
biochemistry: 0.1397
Predicted domain: CS
