In [1]:
# !pip install transformers
# !pip install datasets
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import re
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset
data_path = 'labelled_train_set_t3.csv'  # Path to your dataset
df = pd.read_csv(data_path)
df.dropna(inplace=True)
# View the first few rows of the dataset
print(df.head())

        ID                                               news  \
0  TRAIN_1  (Oct 5, 2011 12:11 PM CDT) Police are still hu...   
1  TRAIN_2  (Mar 4, 2014 11:30 AM) The New York Times foll...   
2  TRAIN_3  (Nov 4, 2008 3:19 PM) Stocks rallied on Electi...   
3  TRAIN_4  (Dec 24, 2014 11:19 AM) Turns out you won't ev...   
4  TRAIN_5  (Oct 16, 2014 3:02 AM CDT) Tristen Kurilla, th...   

                                            headline           method answer  
0  ____rd Victim Dead in Quarry Shooting; Manhunt...     Trans(three)      3  
1       NYT Corrects 1853 Piece After ____ Years Win         Copy(12)     12  
2                   Stocks Up ____ in Election Rally  Round(305.45,0)    305  
3              You Can Watch The Interview at ____pm          Copy(1)      1  
4      Murder Suspect, ____, Will Stay in Adult Jail         Copy(10)     10  


In [3]:
df = df[df['answer']!="11-Sep"]
df['answer']= df['answer'].str.replace(',', '')
# df['answer'] = pd.to_numeric(df['answer'],errors='coerce')

In [4]:
# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [5]:
def preprocess_text(text):

    # Remove patterns like "(Oct 5, 2011 12:11 PM CDT)"
    text = re.sub(r'\(.*?\)\s*', '', text)

    # Remove URLs (http, https)
    text = re.sub(r'http\S+', '', text)

    # Remove non-printable or non-ASCII characters if necessary
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    text = text.encode('latin1', errors='ignore').decode('utf-8', errors='ignore')

    # Convert to lowercase
    text = text.lower()

    return text

In [6]:
train_df = df.copy()
train_df['news'] = train_df['news'].apply(preprocess_text)

In [7]:
# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('google-t5/t5-small')
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
class NewsHeadlineDataset(Dataset):
    def __init__(self, data, tokenizer, include_method=True):
        self.data = data
        self.tokenizer = tokenizer
        self.include_method = include_method

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract news and headline
        news_text = self.data.iloc[idx].get('news')
        headline_text = self.data.iloc[idx].get('headline')

        # Handle missing or invalid data gracefully
        if pd.isna(news_text) or pd.isna(headline_text):
            return None  # Skip rows with missing news or headline

        # Replace '____' with <extra_id_0> token
        headline_text = headline_text.replace("____", "<extra_id_0>")

        # During training, include the method if available
        if self.include_method and 'method' in self.data.columns:
            method_text = self.data.iloc[idx].get('method', '')
            input_text = f"News: {news_text} Headline: {headline_text} Method: {method_text}"
        else:
            input_text = f"News: {news_text} Headline: {headline_text}"

        # Tokenize input
        inputs = self.tokenizer.encode(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)

        # Handle labels for training
        if 'answer' in self.data.columns:
            label = self.data.iloc[idx].get('answer')

            # Ensure label exists and is valid
            if pd.isna(label):
                return None  # Skip rows with missing labels

            labels = self.tokenizer.encode(f"<extra_id_0> {label} <extra_id_1>", return_tensors="pt", truncation=True, padding="max_length", max_length=8)
            labels = labels.squeeze()  # Remove unnecessary dimensions

            return inputs.squeeze(), labels
        else:
            return inputs.squeeze(), None  # For inference, only inputs are returned

In [9]:
def collate_fn(batch):
    # Filter out any None items
    batch = [item for item in batch if item is not None]

    if len(batch) == 0:
        return None, None  # Handle empty batch case

    inputs, labels = zip(*batch)
    inputs = torch.stack(inputs)

    if all(l is None for l in labels):  # Check if all labels are None
        labels = None
    else:
        labels = torch.stack(labels)

    return inputs, labels

In [10]:
train_dataset = NewsHeadlineDataset(train_df, tokenizer, include_method=True)

# DataLoader with custom collate function
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [11]:
from tqdm import tqdm

model.train()
for epoch in range(3):  # Train for 4 epochs
    total_loss = 0
    # Wrap train_dataloader with tqdm for epoch progress
    with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch + 1}') as pbar:
        for batch in train_dataloader:
            if batch is None:  # Skip empty batches
                continue

            inputs, labels = batch

            # If labels are None, continue to the next batch
            if labels is None:
                continue

            optimizer.zero_grad()

            inputs = inputs.to(model.device)
            labels = labels.to(model.device)

            # Forward pass (let T5 handle label shifting internally)
            outputs = model(input_ids=inputs, labels=labels)

            loss = outputs.loss
            total_loss += loss.item()

            # Backpropagation
            loss.backward()
            optimizer.step()

            # Update the progress bar
            pbar.set_postfix({'loss': loss.item()})
            pbar.update(1)

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

Epoch 1: 100%|██████████| 248/248 [11:16<00:00,  2.73s/it, loss=0.208]


Epoch 1, Loss: 1.5126644978299737


Epoch 2: 100%|██████████| 248/248 [10:29<00:00,  2.54s/it, loss=0.182] 


Epoch 2, Loss: 0.29039154173205456


Epoch 3: 100%|██████████| 248/248 [10:06<00:00,  2.45s/it, loss=0.154] 

Epoch 3, Loss: 0.23258301901120332





In [12]:
# Load test data (no method or label)
test_df = pd.read_csv('unlabelled_test_set_t3.csv')

test_df['news'] = test_df['news'].apply(preprocess_text)
test_dataset = NewsHeadlineDataset(test_df, tokenizer, include_method=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

# Inference loop
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        if batch[0] is None:  # Skip empty batches
            continue

        inputs, _ = batch  # No labels in test data
        generated_ids = model.generate(inputs)
        predicted_number = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        predictions.append(predicted_number)

# Output predictions
for i, prediction in enumerate(predictions):
    print(f"Test example {i + 1}: Predicted number: {prediction}")



Test example 1: Predicted number: 70
Test example 2: Predicted number: 15
Test example 3: Predicted number: 1
Test example 4: Predicted number: 61
Test example 5: Predicted number: 792
Test example 6: Predicted number: 45
Test example 7: Predicted number: 4
Test example 8: Predicted number: 2018
Test example 9: Predicted number: 500
Test example 10: Predicted number: 4
Test example 11: Predicted number: 10
Test example 12: Predicted number: 36
Test example 13: Predicted number: 2000
Test example 14: Predicted number: 15
Test example 15: Predicted number: 2
Test example 16: Predicted number: 76
Test example 17: Predicted number: 13
Test example 18: Predicted number: 1800
Test example 19: Predicted number: 50
Test example 20: Predicted number: 0
Test example 21: Predicted number: 69
Test example 22: Predicted number: 66
Test example 23: Predicted number: s 4
Test example 24: Predicted number: 8
Test example 25: Predicted number: 3
Test example 26: Predicted number: 1975
Test example 27: 

In [13]:
predictions_df = pd.DataFrame(predictions, columns=['Prediction'])
predictions_df['ID'] = test_df['ID'].values
predictions_df = predictions_df[['ID', 'Prediction']]

In [14]:
predictions_df['Prediction'] = pd.to_numeric(predictions_df['Prediction'], errors='coerce').fillna(0).astype(int)

In [15]:
predictions_df.to_csv('predictions3.csv',index=False)  # Save predictions to CSV file