In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer

# Read the dataset and discard rows that lack either a text or a label
data = pd.read_csv('/content/d5.csv').dropna(subset=['text', 'label'])

# Show the distinct label values so that non-numeric entries are easy to spot
print(data['label'].unique())

# Assuming we want to remove non-numeric labels
# This function checks if a string can be converted to an integer
def is_integer(s):
    """Return True if ``int(s)`` succeeds, otherwise False.

    Args:
        s: The value to probe (typically a raw label read from the CSV).

    Returns:
        bool: True when ``int(s)`` does not raise, False otherwise.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric strings and float NaN.
        # TypeError: None and other objects int() cannot accept.
        # OverflowError: float infinity.
        return False
    return True

# Keep only the rows whose label can be parsed as an integer
integer_label_mask = data['label'].apply(is_integer)
data = data[integer_label_mask]

# The CSV is assumed to provide two columns: 'text' and 'label'.
# Coerce every text to str and every label to int while extracting them.
texts = [str(entry) for entry in data['text'].tolist()]
labels = [int(entry) for entry in data['label'].tolist()]

# Hold out 20% of the examples for validation (fixed seed for reproducibility)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Sanity check: both splits must contain nothing but strings
for split, message in (
    (train_texts, "train_texts contains non-string elements"),
    (val_texts, "val_texts contains non-string elements"),
):
    assert all(isinstance(entry, str) for entry in split), message

# Fetch the pre-trained XLNet tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Encode both splits with identical settings: truncate and pad every
# example to exactly max_length tokens
max_length = 128  # Specify a max length suitable for your use case
tokenizer_kwargs = dict(truncation=True, padding='max_length', max_length=max_length)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
val_encodings = tokenizer(val_texts, **tokenizer_kwargs)

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score

# Define a custom Dataset class
class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, values)`` pairs
    where ``values[idx]`` is the encoding of example ``idx``; ``labels`` must
    be indexable by example position and support ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one tensor per encoding field, then attach the label under
        # the 'labels' key as a long tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the tokenized splits in datasets and batch them; only the training
# data is shuffled
batch_size = 8
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained XLNet sequence classifier (two output classes) and
# place it on the selected device
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
model.to(device)

# Set up the optimizer and learning rate scheduler.
# One epoch count drives both the schedule length and the training loop so
# the linear decay reaches zero exactly when training ends. Previously the
# schedule assumed 3 epochs while the loop ran only 1.
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; the batch includes 'labels', so the model returns a loss
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The scheduler is stepped once per optimizer step (per batch)
        scheduler.step()

print("Training complete.")

# Evaluate the model
model.eval()
val_preds = []
# Collect the true labels in a separate list instead of rebinding
# `val_labels` / `labels`, so the variables produced by the data-preparation
# code stay intact and earlier steps can be re-run safely.
val_targets = []

with torch.no_grad():
    for batch in val_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        logits = outputs.logits

        # Predicted class = index of the largest logit for each example
        batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
        batch_targets = batch['labels'].cpu().numpy()

        val_preds.extend(batch_preds)
        val_targets.extend(batch_targets)

# Calculate accuracy and F1 score
accuracy = accuracy_score(val_targets, val_preds)
f1 = f1_score(val_targets, val_preds, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")


['1' '0' ' Derwent Innovations Index'
 ' as indexed in the Clarivate Analytics Web of Science Core Collection database. The list was cross-referenced against the number of patents filed by each organization during the same time period in the Derwent World Patents Index and the Derwent Innovations Index. Patent equivalents'
 ' it may not be possible to identify which of the various campuses was responsible for the initial research'
 ' since that indicator is closely related to demonstrating the continuing utility of an invention or process."']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training complete.
Accuracy: 0.5104166666666666
F1 Score: 0.34497126436781606
