In [None]:
import pandas as pd

# Load datasets.
# The fake-news frame has to come from its own file: reading the real-news CSV
# twice yields two identical copies of every text with opposite labels, which
# no classifier can separate.
# NOTE(review): assumes the fake-news export sits next to the real one under
# the name 'politifact_fake.csv' -- confirm the path.
true_df = pd.read_csv('/content/politifact_real.csv')
false_df = pd.read_csv('/content/politifact_fake.csv')

# Add labels (1 for the real-news frame, 0 for the fake-news frame)
true_df['label'] = 1
false_df['label'] = 0

# Combine the datasets into one frame with a fresh 0..n-1 index
df = pd.concat([true_df, false_df], ignore_index=True)

# Save the combined dataset so later cells can reload it
df.to_csv('combined.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer

# Load the combined dataset
df = pd.read_csv('combined.csv')

# Drop rows that have no text. Without this, astype(str) below turns a missing
# value into the literal string 'nan' and it is tokenized like a real sample.
df = df.dropna(subset=['text']).reset_index(drop=True)

# Split the data into train and test sets (stratified so both halves keep the label ratio)
train_df, test_df = train_test_split(df, test_size=0.5, random_state=42, stratify=df['label'])

# Load the tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Ensure all text entries are strings
train_texts = train_df['text'].astype(str).tolist()
test_texts = test_df['text'].astype(str).tolist()
train_labels = train_df['label'].tolist()
test_labels = test_df['label'].tolist()

# Tokenize the text
def tokenize_texts(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are enabled, sequences are capped at 512 tokens,
    and the encodings are requested as PyTorch tensors.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

# Encode the training split first, then the test split, with identical settings
train_encodings, test_encodings = (
    tokenize_texts(split_texts) for split_texts in (train_texts, test_texts)
)

In [None]:
import torch
from transformers import XLNetForSequenceClassification

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate XLNet with a two-label sequence-classification head
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

# Move the weights to the chosen device; as the last expression of the cell
# this also echoes the module structure in the notebook output
model.to(device)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score

# Convert labels to tensors.
# torch.as_tensor accepts either a list or an existing tensor, so this cell
# can be re-run after the labels have already been converted.
train_labels = torch.as_tensor(train_labels)
test_labels = torch.as_tensor(test_labels)

# Prepare DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Training parameters
optimizer = AdamW(model.parameters(), lr=2e-5)

# The scheduler is stepped once per batch, so its horizon must cover every
# epoch the training loop below runs. It was sized for 4 epochs while the loop
# runs 6, which left the learning rate at zero for the final two epochs.
epochs = 6  # keep in sync with the training loop
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train(model, train_loader):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Uses the module-level ``optimizer``, ``scheduler`` and ``device``; both the
    optimizer and the scheduler are stepped once per batch.
    """
    model.train()
    n_batches = len(train_loader)
    running_loss = 0
    for step_batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in step_batch)
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate the scalar value only, not the graph-attached tensor
        running_loss += loss.item()
    return running_loss / n_batches

# Evaluation function
def evaluate(model, test_loader):
    """Predict every batch of ``test_loader`` and return ``(accuracy, weighted F1)``.

    The model is put in eval mode and gradients are disabled; the predicted
    class is the argmax over the last dimension of the logits. Batches are
    moved to the module-level ``device``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for eval_batch in test_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in eval_batch)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(labels.cpu().numpy())
    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='weighted'),
    )

# Fine-tune for a fixed number of passes, reporting the mean loss after each one
epochs = 6
for epoch in range(epochs):
    train_loss = train(model, train_loader)
    print(f"Epoch {epoch + 1}, Training loss: {train_loss}")

# Score the fine-tuned model on the held-out split and report both metrics
accuracy, f1 = evaluate(model, test_loader)
print(f"Accuracy: {accuracy}", f"F1 Score: {f1}", sep="\n")


  train_labels = torch.tensor(train_labels)
  test_labels = torch.tensor(test_labels)


Epoch 1, Training loss: 0.7081964910030365
Epoch 2, Training loss: 0.703571999669075
Epoch 3, Training loss: 0.7098169362545014
Epoch 4, Training loss: 0.7003574883937835
Epoch 5, Training loss: 0.7048386180400849
Epoch 6, Training loss: 0.7068474835157394
Accuracy: 0.4993726474278545
F1 Score: 0.3326365082448302


In [None]:
!pip install nlpaug


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer
import nlpaug.augmenter.word as naw
import torch

# Read the combined dataset and coerce the text column to str in a single step
df = pd.read_csv('combined.csv').astype({'text': str})

# Data augmentation function
_synonym_augmenter = None  # built lazily on first use, then reused for every row


def augment_text(text):
    """Return a WordNet-synonym paraphrase of ``text`` as a single string.

    The augmenter is created once and cached instead of being rebuilt for
    every row. Recent nlpaug releases (this notebook installs 1.1.11) return a
    *list* of augmented strings from ``augment``; the first element is
    unwrapped so callers that expect a ``str`` -- such as the
    ``isinstance(x, str)`` row filter applied after augmentation -- keep the
    augmented sample instead of silently dropping it.

    Args:
        text: The input sentence or document.

    Returns:
        The augmented string, or ``text`` unchanged if the augmenter produced
        no output.
    """
    global _synonym_augmenter
    if _synonym_augmenter is None:
        _synonym_augmenter = naw.SynonymAug(aug_src='wordnet')

    augmented = _synonym_augmenter.augment(text)
    if isinstance(augmented, (list, tuple)):
        # Fall back to the untouched input when nothing was produced
        return augmented[0] if augmented else text
    return augmented

def _is_valid_text(value):
    """True for real strings other than the 'nan' that astype(str) makes of a missing value."""
    return isinstance(value, str) and value.strip().lower() != 'nan'


def _first_if_list(value):
    """Unwrap a non-empty list to its first element; pass anything else through."""
    return value[0] if isinstance(value, list) and value else value


# Filter out invalid text entries before anything else touches the data
df = df[df['text'].apply(_is_valid_text)].reset_index(drop=True)

# Split the data into train and test sets BEFORE augmenting, so paraphrases of
# test rows can never end up in the training set
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Augment the training portion only. augment_text may hand back a list rather
# than a string; unwrap it, otherwise the string filter below discards every
# augmented row.
augmented_texts = train_df['text'].apply(augment_text).apply(_first_if_list)
augmented_labels = train_df['label']

augmented_df = pd.DataFrame({'text': augmented_texts, 'label': augmented_labels})
augmented_df = augmented_df[augmented_df['text'].apply(_is_valid_text)]

# ignore_index avoids duplicate index labels in the enlarged training frame
train_df = pd.concat([train_df, augmented_df], ignore_index=True)

# Load the tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Tokenize the text
def tokenize_texts(texts):
    """Encode ``texts`` (a list of strings) with the module-level ``tokenizer``.

    Padding and truncation are enabled, sequences are capped at 512 tokens,
    and the encodings are requested as PyTorch tensors.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

# Pull texts and labels out of each split as plain Python lists
train_texts, train_labels = train_df['text'].tolist(), train_df['label'].tolist()
test_texts, test_labels = test_df['text'].tolist(), test_df['label'].tolist()

# Sanity check: every text must have exactly one label
assert len(train_texts) == len(train_labels), "Mismatch between number of training texts and labels."
assert len(test_texts) == len(test_labels), "Mismatch between number of testing texts and labels."

# Debug: show the first five texts of each split to check the format
print("Sample train texts:", train_texts[:5])
print("Sample test texts:", test_texts[:5])

# Encode the training split first, then the test split
train_encodings, test_encodings = (
    tokenize_texts(split_texts) for split_texts in (train_texts, test_texts)
)

# Labels as tensors, ready to be bundled with the encodings
train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)

from transformers import XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score

# Bundle the encodings with their labels
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels,
)

# Shuffle only the training batches; evaluation order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

# Fresh XLNet classifier, placed on the GPU when one is available
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer plus a linear decay schedule spanning 6 epochs of batches, no warmup
optimizer = AdamW(model.parameters(), lr=1e-5)
total_steps = 6 * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training function
def train(model, train_loader):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Uses the module-level ``optimizer``, ``scheduler`` and ``device``; both the
    optimizer and the scheduler are stepped once per batch.
    """
    model.train()
    n_batches = len(train_loader)
    running_loss = 0
    for step_batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in step_batch)
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate the scalar value only, not the graph-attached tensor
        running_loss += loss.item()
    return running_loss / n_batches

# Evaluation function
def evaluate(model, test_loader):
    """Predict every batch of ``test_loader`` and return ``(accuracy, weighted F1)``.

    The model is put in eval mode and gradients are disabled; the predicted
    class is the argmax over the last dimension of the logits. Batches are
    moved to the module-level ``device``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for eval_batch in test_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in eval_batch)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(labels.cpu().numpy())
    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='weighted'),
    )

# Fine-tune for a fixed number of passes, reporting the mean loss after each one
epochs = 6
for epoch in range(epochs):
    train_loss = train(model, train_loader)
    print(f"Epoch {epoch + 1}, Training loss: {train_loss}")

# Score the fine-tuned model on the held-out split and report both metrics
accuracy, f1 = evaluate(model, test_loader)
print(f"Accuracy: {accuracy}", f"F1 Score: {f1}", sep="\n")


Sample train texts: ['Remarks by President Trump in a Listening Session on Healthcare', 'The Committee on Energy and Commerce', 'Palin tries new tactic to unload hated jet: Gov. Sarah Palin background', 'The Des Moines Register', 'International Energy Statistics']
Sample test texts: ['Political Figures: C (2)', 'The CNN Miami Republican debate transcript, annotated', 'Analysis of the 2008 Presidential Candidates’ Tax Plans', 'Obama addresses key concerns for Floridians', 'The Democratic Debate in Cleveland']


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training loss: 0.7245282909226796
Epoch 2, Training loss: 0.7042736193490406
Epoch 3, Training loss: 0.704368982050154
Epoch 4, Training loss: 0.7114719133528452
Epoch 5, Training loss: 0.7001823461244977
Epoch 6, Training loss: 0.7046851203555152
Accuracy: 0.456
F1 Score: 0.39733408961996597
