In [None]:
pip install transformers




In [None]:
import pandas as pd

# Load the uploaded dataset to examine its structure and content
file_path = 'LABELLED_TRAIN.csv'
dataset = pd.read_csv(file_path)

# Display basic information about the dataset and a sample of its contents
dataset_info = dataset.info()
sample_data = dataset.head()

dataset_info, sample_data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            3000 non-null   object
 1   News Article  3000 non-null   object
 2   Caption       3000 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


(None,
           ID                                       News Article  \
 0  TRAIN_1_1  (Aug 30, 2019 10:52 AM CDT) The Democratic Nat...   
 1  TRAIN_1_2  (Sep 14, 2016 8:24 AM CDT) Authorities have fi...   
 2  TRAIN_1_3  (Aug 31, 2015 12:54 PM CDT) An Illinois mom wh...   
 3  TRAIN_1_4  (Oct 15, 2012 10:36 AM CDT) Brad Pitt's latest...   
 4  TRAIN_1_5  (Nov 21, 2012 12:01 PM) When Judd Apatow was a...   
 
                                              Caption  
 0            DNC Is Nervous About 2 Virtual Caucuses  
 1          43 Years After Girls' Slayings, 2 Arrests  
 2  Boy Still Missing 4 Years After Mom Killed Her...  
 3  In a First, Chanel No. 5 Hawked by a Guy: Brad...  
 4  Simpsons Episode Apatow Wrote 22 Years Ago to Air  )

In [None]:
# Preprocessing function to clean the "News Article" column
def clean_text(text):
    """
    Cleans the input text by removing timestamps, parentheticals, and normalizing it.
    """
    import re
    # Remove text inside parentheses and timestamps
    text = re.sub(r"\(.*?\)", "", text)
    # Remove leading and trailing whitespaces
    text = text.strip()
    return text

# Apply cleaning to the "News Article" column
dataset['Cleaned Article'] = dataset['News Article'].apply(clean_text)

# Display the first few rows of the cleaned dataset
cleaned_sample = dataset[['ID', 'Cleaned Article', 'Caption']].head()
cleaned_sample


Unnamed: 0,ID,Cleaned Article,Caption
0,TRAIN_1_1,The Democratic National Committee will recomme...,DNC Is Nervous About 2 Virtual Caucuses
1,TRAIN_1_2,Authorities have finally made arrests in the d...,"43 Years After Girls' Slayings, 2 Arrests"
2,TRAIN_1_3,An Illinois mom who killed herself four years ...,Boy Still Missing 4 Years After Mom Killed Her...
3,TRAIN_1_4,Brad Pitt's latest gig: selling perfume. The a...,"In a First, Chanel No. 5 Hawked by a Guy: Brad..."
4,TRAIN_1_5,"When Judd Apatow was a new comedy writer, the ...",Simpsons Episode Apatow Wrote 22 Years Ago to Air


In [None]:
# prompt: train test spilt the data set

from sklearn.model_selection import train_test_split

# Assuming 'Cleaned Article' is your feature and 'Caption' is your target
X = dataset['Cleaned Article']
y = dataset['Caption']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 80% train, 20% test

# Now you have X_train, X_test, y_train, y_test for model training and evaluation
print(f"Training data shape: X={X_train.shape}, y={y_train.shape}")
print(f"Testing data shape: X={X_test.shape}, y={y_test.shape}")

Training data shape: X=(2400,), y=(2400,)
Testing data shape: X=(600,), y=(600,)


In [None]:
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Step 1: Preprocessing
# Remove timestamps and unnecessary details in "News Article" for clarity
dataset['News Article'] = dataset['News Article'].str.replace(r"\(.*?\)", "", regex=True).str.strip()

# Step 2: Splitting data into training and validation sets
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Tokenizer and Model setup for T5
model_name = "t5-small"  # A lightweight T5 model for demonstration
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization function
def tokenize_data(data, tokenizer, max_input_length=512, max_target_length=50):
    inputs = tokenizer(
        data['News Article'].tolist(),
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    targets = tokenizer(
        data['Caption'].tolist(),
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return inputs, targets

# Tokenize training and validation data
train_inputs, train_targets = tokenize_data(train_data, tokenizer)
val_inputs, val_targets = tokenize_data(val_data, tokenizer)

# Convert tokenized data into PyTorch Dataset
class HeadlineDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs["input_ids"][idx]
        attention_mask = self.inputs["attention_mask"][idx]
        labels = self.targets["input_ids"][idx]
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

train_dataset = HeadlineDataset(train_inputs, train_targets)
val_dataset = HeadlineDataset(val_inputs, val_targets)

# Step 3: Training setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    fp16=True  # Use mixed precision if supported
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the trained model for inference
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# Test-ready system: Prepare a prediction function
def generate_headline(article, model, tokenizer, max_length=50):
    inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example prediction from the validation set
sample_article = val_data.iloc[0]["News Article"]
sample_prediction = generate_headline(sample_article, model, tokenizer)
sample_prediction


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re

# Step 5: Evaluating the Model - Function definition moved to the top
def evaluate_metrics(predictions, references):
    """
    Calculates BLEU and ROUGE-L scores for predictions and references.
    """
    bleu_scores = []
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_scores = []

    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        # BLEU Score
        bleu_score = sentence_bleu([ref_tokens], pred_tokens)
        bleu_scores.append(bleu_score)

        # ROUGE-L Score
        rouge_score = rouge_scorer_obj.score(ref, pred)['rougeL'].fmeasure
        rouge_l_scores.append(rouge_score)

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_bleu, avg_rouge_l

# Step 1: Loading the datasets
train_file = 'LABELLED_TRAIN.csv'
val_file = 'LABELLED_DEV.csv'
test_file = 'UNLABELLED_TEST.csv'  # To be used once the test set is released

train_data = pd.read_csv(train_file)
val_data = pd.read_csv(val_file)

# Load test data only if it exists
try:
    test_data = pd.read_csv(test_file)
except FileNotFoundError:
    print("Test file not found. Proceeding without test data.")
    test_data = None

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Removes special characters
    - Converts to lowercase
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Preprocess the articles and captions
train_data['News Article'] = train_data['News Article'].apply(preprocess_text)
val_data['News Article'] = val_data['News Article'].apply(preprocess_text)
train_data['Caption'] = train_data['Caption'].apply(preprocess_text)
val_data['Caption'] = val_data['Caption'].apply(preprocess_text)

if test_data is not None:
    test_data['News Article'] = test_data['News Article'].apply(preprocess_text)

# Convert captions to TF-IDF representations
y_vectorizer = TfidfVectorizer(max_features=5000)  # New TF-IDF vectorizer for the captions
y_train = train_data['Caption']
y_val = val_data['Caption']

y_train_tfidf = y_vectorizer.fit_transform(y_train).toarray()
y_val_tfidf = y_vectorizer.transform(y_val).toarray()

# Step 3: Feature Extraction using TF-IDF
# TF-IDF: Term Frequency-Inverse Document Frequency, a feature representation for text
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['News Article'])
X_val = vectorizer.transform(val_data['News Article'])

if test_data is not None:
    X_test = vectorizer.transform(test_data['News Article'])

# Training Ridge Regression with numeric targets
model = Ridge(alpha=1.0)
model.fit(X_train, y_train_tfidf)

# Predicting on validation data
val_predictions_tfidf = model.predict(X_val)

# Decoding predictions back to text (approximation)
# Using the feature names to decode predictions
def decode_tfidf_predictions(predictions, vectorizer):
    """Convert predicted TF-IDF values back to approximate text."""
    feature_names = vectorizer.get_feature_names_out()
    decoded_predictions = []
    for pred_vector in predictions:
        # Take the top N terms with the highest weights
        top_indices = pred_vector.argsort()[-10:][::-1]  # Top 10 terms
        words = [feature_names[i] for i in top_indices]
        decoded_predictions.append(" ".join(words))
    return decoded_predictions

# Decode predictions
val_predictions_text = decode_tfidf_predictions(val_predictions_tfidf, y_vectorizer)

# Evaluate the model
val_bleu, val_rouge_l = evaluate_metrics(val_predictions_text, y_val)
print(f"Validation Set Performance - BLEU: {val_bleu:.4f}, ROUGE-L: {val_rouge_l:.4f}")


# Step 6: Predicting on Test Data (if available)
if test_data is not None:
    test_predictions_tfidf = model.predict(X_test)
    test_predictions_text = decode_tfidf_predictions(test_predictions_tfidf, y_vectorizer)
    test_data['Prediction'] = test_predictions_text

    # Save predictions to CSV
    output_file = '{{265}}.csv'
    test_data[['ID', 'Prediction']].to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

print("Good to goooo!")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Validation Set Performance - BLEU: 0.0008, ROUGE-L: 0.1417
Predictions saved to {{265}}.csv
Good to goooo!


In [None]:
!pip install --upgrade keras tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer


# Step 1: Loading the datasets
train_file = 'LABELLED_TRAIN.csv'
val_file = 'LABELLED_DEV.csv'

train_data = pd.read_csv(train_file)
val_data = pd.read_csv(val_file)

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Removes special characters
    - Converts to lowercase
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Preprocess the articles and captions
train_data['News Article'] = train_data['News Article'].apply(preprocess_text)
val_data['News Article'] = val_data['News Article'].apply(preprocess_text)
train_data['Caption'] = train_data['Caption'].apply(preprocess_text)
val_data['Caption'] = val_data['Caption'].apply(preprocess_text)

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['News Article'])
X_train_seq = tokenizer.texts_to_sequences(train_data['News Article'])
X_val_seq = tokenizer.texts_to_sequences(val_data['News Article'])

# Padding sequences to ensure uniform input length
max_len = max([len(seq) for seq in X_train_seq])  # Could also be a fixed value
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Convert captions to TF-IDF representations
y_vectorizer = TfidfVectorizer(max_features=5000)  # New TF-IDF vectorizer for the captions
y_train = train_data['Caption']
y_val = val_data['Caption']

y_train_tfidf = y_vectorizer.fit_transform(y_train).toarray()
y_val_tfidf = y_vectorizer.transform(y_val).toarray()


# Building the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(5000, activation='linear')) # Output size must match the target size of the vectorizer

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error']) # Changed the loss

# Train the model
model.fit(X_train_pad, y_train_tfidf, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val_tfidf))





Epoch 1/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 2s/step - loss: 2.0008e-04 - mean_squared_error: 2.0008e-04 - val_loss: 1.9869e-04 - val_mean_squared_error: 1.9869e-04
Epoch 2/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 2s/step - loss: 1.9959e-04 - mean_squared_error: 1.9959e-04 - val_loss: 1.9871e-04 - val_mean_squared_error: 1.9871e-04
Epoch 3/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 2s/step - loss: 1.9961e-04 - mean_squared_error: 1.9961e-04 - val_loss: 1.9868e-04 - val_mean_squared_error: 1.9868e-04
Epoch 4/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2s/step - loss: 1.9953e-04 - mean_squared_error: 1.9953e-04 - val_loss: 1.9859e-04 - val_mean_squared_error: 1.9859e-04
Epoch 5/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 2s/step - loss: 1.9950e-04 - mean_squared_error: 1.9950e-04 - val_loss: 1.9862e-04 - val_mean_squared_error: 1.9862e-04


<keras.src.callbacks.history.History at 0x7e2dbec669b0>

In [None]:
!pip install --upgrade keras tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
import numpy as np

# Step 1: Loading the datasets
train_file = 'LABELLED_TRAIN.csv'
val_file = 'LABELLED_DEV.csv'

train_data = pd.read_csv(train_file)
val_data = pd.read_csv(val_file)

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Removes special characters
    - Converts to lowercase
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Preprocess the articles and captions
train_data['News Article'] = train_data['News Article'].apply(preprocess_text)
val_data['News Article'] = val_data['News Article'].apply(preprocess_text)
train_data['Caption'] = train_data['Caption'].apply(preprocess_text) #preprocess captions too
val_data['Caption'] = val_data['Caption'].apply(preprocess_text)


# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['News Article'])
X_train_seq = tokenizer.texts_to_sequences(train_data['News Article'])
X_val_seq = tokenizer.texts_to_sequences(val_data['News Article'])

# Padding sequences to ensure uniform input length
max_len = max([len(seq) for seq in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)


# Create binary labels - let's see if the caption contains 'man' or not
y_train = np.array([1 if 'man' in caption else 0 for caption in train_data['Caption']])
y_val = np.array([1 if 'man' in caption else 0 for caption in val_data['Caption']])



# Building the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val))

Epoch 1/5




[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - accuracy: 0.9083 - loss: 0.3921 - val_accuracy: 0.9260 - val_loss: 0.2638
Epoch 2/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 2s/step - accuracy: 0.9296 - loss: 0.2543 - val_accuracy: 0.9260 - val_loss: 0.2714
Epoch 3/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 2s/step - accuracy: 0.9422 - loss: 0.2187 - val_accuracy: 0.9250 - val_loss: 0.2621
Epoch 4/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 2s/step - accuracy: 0.9495 - loss: 0.1472 - val_accuracy: 0.9100 - val_loss: 0.3056
Epoch 5/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 2s/step - accuracy: 0.9814 - loss: 0.0598 - val_accuracy: 0.9140 - val_loss: 0.4117


<keras.src.callbacks.history.History at 0x7e2db7f77130>