# 1. Setup

In [None]:
# install packages
!pip install -q transformers

In [None]:
# import libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from transformers import TFPegasusForConditionalGeneration, PegasusTokenizer
from sklearn.model_selection import train_test_split
from google.colab import drive

In [None]:
# mount Colab to Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# verify data exists in Google Drive dir
!ls 'drive/My Drive/W266'

reddit_database_cleaned.csv  W266_Final_Project.ipynb
reddit_database.csv	     W266_Final_Project_Main.ipynb


# 2. Load Data

In [None]:
# load claensed Reddit ds
# load data
df = pd.read_csv('drive/My Drive/W266/reddit_database_cleaned.csv')
df.head(3)

Unnamed: 0,title,post
0,so what do you guys all do related to analytic...,theres a lot of reasons to want to know all th...
1,googles invasive nonanonymized ad targeting a ...,im cross posting this from rcyberlaw hopefully...
2,dotced functional web analytics tagging report...,dotceda functional analytics consultant offeri...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274243 entries, 0 to 274242
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   274238 non-null  object
 1   post    274115 non-null  object
dtypes: object(2)
memory usage: 4.2+ MB


# 3. Train/Test Split

In [None]:
# Initialize the tokenizer
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

In [None]:
# Tokenization function
def tokenize_function(input_data):
    # treat post and title as text strings
    post_text = str(input_data['post'])
    title_text = str(input_data['title'])

    model_inputs = tokenizer(
        post_text,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="tf")

    # Prepare labels
    labels = tokenizer(title_text,
                       max_length=128,
                       truncation=True,
                       padding="max_length",
                       return_tensors="tf").input_ids.numpy()

    # Set padding token labels to -100 to be ignored in loss computation
    labels[labels == 0] = -100

    # Ensure the output is compatible with pandas DataFrame operations
    model_inputs = {key: value.numpy().tolist()[0] for key, value in model_inputs.items()}  # Convert tensors to list
    model_inputs['labels'] = labels.tolist()[0]  # Convert numpy array to list

    return model_inputs

In [None]:
# Prepare the NLP dataset

# Subset data
df = df.head(1000)

# Apply Tokenize function to data
data = df.apply(tokenize_function, axis=1)

In [None]:
data_df.columns = ['input_ids', 'attention_mask', 'labels']
data_df.head(3)

Unnamed: 0,input_ids,attention_mask,labels
0,"[186, 116, 114, 367, 113, 1523, 112, 245, 112,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[167, 180, 171, 119, 2266, 149, 171, 985, 112,..."
1,"[7701, 1891, 3906, 136, 135, 6114, 68087, 5505...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6803, 116, 11906, 609, 1321, 24985, 3792, 425..."
2,"[171, 31485, 24703, 3819, 5832, 5705, 1192, 68...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[15525, 27774, 3819, 712, 5832, 28532, 3234, 1..."


In [27]:
# Update dict key names to columns names in data_df
data = {'input_ids': list(data_df['input_ids']), 'attention_mask': list(data_df['attention_mask']), 'labels': list(data_df['labels'])}

In [29]:
data = pd.DataFrame(data)
data.head(3)

Unnamed: 0,input_ids,attention_mask,labels
0,"[186, 116, 114, 367, 113, 1523, 112, 245, 112,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[167, 180, 171, 119, 2266, 149, 171, 985, 112,..."
1,"[7701, 1891, 3906, 136, 135, 6114, 68087, 5505...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6803, 116, 11906, 609, 1321, 24985, 3792, 425..."
2,"[171, 31485, 24703, 3819, 5832, 5705, 1192, 68...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[15525, 27774, 3819, 712, 5832, 28532, 3234, 1..."


In [30]:
# Split the data
train_data, val_data = train_test_split(data, test_size=0.1)


In [32]:
print(train_data.head())
print(train_data["attention_mask"].dtype)

                                             input_ids  \
523  [33181, 18228, 117, 458, 114, 10695, 380, 134,...   
724  [119, 133, 114, 27420, 821, 111, 119, 245, 112...   
678  [9800, 2266, 269, 8078, 190, 109, 712, 5832, 2...   
960  [167, 11022, 532, 192, 172, 112, 248, 109, 467...   
983  [9800, 532, 133, 1939, 17031, 124, 161, 387, 1...   

                                        attention_mask  \
523  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
724  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
678  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
960  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
983  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                                labels  
523  [1161, 115, 761, 199, 112, 2488, 109, 1591, 38...  
724  [349, 371, 27420, 11153, 118, 1619, 3059, 111,...  
678  [14684, 1574, 833, 6803, 5832, 1, -100, -100, ...  
960  [117, 186, 114, 230, 17325, 112, 33076, 6921, ...  
983  [906, 160, 1

In [37]:
def convert_attention_mask(data):
  for i in range(len(data)):
    if isinstance(data["attention_mask"][i], list):
      data["attention_mask"][i] = data["attention_mask"][i][0][0]
  return data


train_data = convert_attention_mask(train_data)
val_data = convert_attention_mask(val_data)

train_data["attention_mask"] = train_data["attention_mask"].astype("int64")
val_data["attention_mask"] = val_data["attention_mask"].astype("int64")

KeyError: 13

# 4. Abstractive Summarization Baseline Model: Pegasus

In [31]:
# set bactch size
batch_size = 16

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_data))).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_data))).batch(batch_size)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [None]:
# load PEGASUS
model = TFPegasusForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Fine-tune
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=model.compute_loss)