# 1. Setup

In [1]:
# install packages
!pip install -q transformers==4.37.2
!pip install -q sentencepiece
!pip install -q evaluate
!pip install -q tensorflow==2.15
!pip install -q rouge_score

In [2]:
# import libraries
import os
import re
import numpy as np
import pandas as pd
import evaluate
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [3]:
# mount Colab to Google Drive
# The data files and model checkpoints used below are addressed through
# 'drive/My Drive/...', i.e. relative to the mount point created here
# (assumes the working directory is /content -- TODO confirm).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# verify data exists in Google Drive dir
# (the train/valid/test CSVs loaded in the next section are read from this folder)
!ls 'drive/My Drive/W266/Final Project'

 fine_tuning		     t5_results.csv    W266_Final_Project_Data_Cleanup.ipynb
 model_checkpoints	     test_pairs.csv    W266_Final_Project_Pegasus_Ray.ipynb
'old dataset split'	     train_pairs.csv  ' W266_Final_Project_T5_Dipika.ipynb'
 reddit_subset_cleaned.csv   valid_pairs.csv   W266_Final_Project_T5_Dipika_t5_finetune.ipynb


# 2. Load Data

In [5]:
# Locations of the pre-split train / validation / test CSV files
train_file = 'drive/My Drive/W266/Final Project/train_pairs.csv'
valid_file = 'drive/My Drive/W266/Final Project/valid_pairs.csv'
test_file = 'drive/My Drive/W266/Final Project/test_pairs.csv'

# Read every split into a list of row dictionaries (one dict per CSV row)
train_pairs, valid_pairs, test_pairs = (
    pd.read_csv(csv_path).to_dict('records')
    for csv_path in (train_file, valid_file, test_file)
)

# Preview the first five records of each split
for heading, split_pairs in (
    ("Training pairs:", train_pairs),
    ("\nValidation pairs:", valid_pairs),
    ("\nTest pairs:", test_pairs),
):
    print(heading)
    print(split_pairs[:5])

Training pairs:
[{'post': 'my goal is to detect an object that is being flashed in front of a camera therefore the input is a video converted into images frames and the sequence matters i am trying to figure out how to go about training it usually video object detection algorithms detect objects in each frame my problem is that the objects i am trying to classify are similar and the object is not fully visible in any single frame because of a hand holding it in order to correctly tell what the object is you have to look at multiple frames i found which looks at the data sequentially but i am not sure how to map it to my situation anyone else knows about any implementations of a similar problem or potential ways of solving it', 'title': 'video object detection detecting object in the video frames sequentially'}, {'post': 'hi everyone for my master is thesis at the vrije universiteit amsterdam i am researching the environment for it professionals to report wrongdoing related to software 

In [26]:
# Modify test data to subset for evaluation

indexes = [408, 711, 1114, 699, 236, 262, 267, 951, 315, 647]  # adding the survey indices
combined_indexes = indexes + list(range(90))  # Adding the first 90 indices

# Re-read the full test split instead of indexing `test_pairs` itself: this
# cell replaces `test_pairs` with a 100-item subset, so running it a second
# time against the already-reduced list would raise IndexError (e.g. index 408).
test_pairs_full = pd.read_csv(test_file).to_dict('records')

# Pick the selected rows, survey examples first, then the first 90 rows
new_dataset = [test_pairs_full[index] for index in combined_indexes]

test_pairs = new_dataset

# Show the subset size and a short preview rather than dumping every post
print(len(test_pairs))
test_pairs[:2]

[{'post': 'suppose you are on a data science team and are given a dataset and problem statement after preliminary steps you know you need to train for example a classifier what does this process look like if you are trying to follow a principled data science workflow and organize yourself according to something like cookiecutter data science the goal is to have a reproducible workflow that is transparent so that anyone on your team could see how you arrived at your results and do it themselves this process naturally involves a lot of experimentation with pipeline steps and model selection as well as hyperparameter tuning one approach i could think of would be to do all of the experimentation with interleaved commentsdiscussion in a notebook say experimentsipynb and then once i pin down the best pipeline configuration for my problem reproduce the entire pipeline in a script modelpy which trains a model from the raw data and saves it are there better ways to do this it is hard to find ex

# 3. Preprocessor and Data Generator

In [7]:
def preprocess_data(pairs, tokenizer, model, max_length=128):
    """Tokenize (post, title) pairs into model inputs and labels.

    Args:
        pairs: iterable of two-item (post, title) entries.
        tokenizer: object exposing ``batch_encode_plus``; both posts and titles
            are padded/truncated to ``max_length`` tokens.
        model: object exposing ``_shift_right``, used to derive the decoder
            inputs from the label ids (presumably the T5 right-shift that
            prepends the decoder start token -- verify against the model).
        max_length: token length applied to both posts and titles.

    Returns:
        A tuple ``([post_input_ids, post_attention_masks, decoder_input_ids],
        label_ids)`` where the post arrays are int32 NumPy arrays and
        ``label_ids`` holds the encoded title ids.
    """
    # Tokenizer options shared by the post and the title encodings
    encode_options = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

    # Encoder side: the posts, with their attention masks
    posts = [pair_post for pair_post, _ in pairs]
    encoded_posts = tokenizer.batch_encode_plus(
        posts,
        return_attention_mask=True,
        **encode_options
    )
    post_input_ids = np.array(encoded_posts["input_ids"], dtype="int32")
    post_attention_masks = np.array(encoded_posts["attention_mask"], dtype="int32")

    # Decoder side: the titles are the labels; decoder inputs are derived from them
    titles = [pair_title for _, pair_title in pairs]
    encoded_titles = tokenizer.batch_encode_plus(titles, **encode_options)
    label_ids = np.array(encoded_titles['input_ids'])
    decoder_input_ids = model._shift_right(label_ids)

    model_inputs = [post_input_ids, post_attention_masks, decoder_input_ids]
    return model_inputs, label_ids

In [8]:
class SummarizeDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that builds tokenized batches directly from a CSV file.

    For every batch the CSV at ``data_filename`` is re-read with all rows that
    do not belong to the batch skipped; the ``post`` and ``title`` columns of
    the remaining rows are passed to ``preprocess_data``.

    Args:
        tokenizer: tokenizer forwarded to ``preprocess_data``.
        model: model forwarded to ``preprocess_data``.
        n_examples: number of data rows in the CSV (header excluded).
        data_filename: path of the CSV file with ``post`` and ``title`` columns.
        max_length: token length forwarded to ``preprocess_data``.
        batch_size: number of rows per batch.
        shuffle: when True the row order is re-permuted at construction and at
            the end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=16,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Initialize row order, call on_epoch_end to shuffle row indices.
        # Numbering starts at 1 so that line 0 (the CSV header) is never skipped.
        # Stored as a list so that slicing + ``+`` concatenates in __getitem__
        # whether or not shuffling is enabled.
        self.row_order = list(np.arange(1, self.n_examples+1))
        self.on_epoch_end()

    def __len__(self):
        # Return the number of full batches; a trailing partial batch is dropped
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as produced by ``preprocess_data``."""
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size

        # Indices to skip are the ones in the (shuffled) row order before and
        # after the chunk we'll use for this batch. Coerce to a list first:
        # on a NumPy array ``+`` would add element-wise instead of concatenating,
        # which is what used to happen with shuffle=False.
        row_order = list(self.row_order)
        batch_idx_skip = row_order[:batch_start] + row_order[batch_end:]
        df = pd.read_csv(self.data_filename, skiprows=batch_idx_skip)

        pairs = df[['post', 'title']].values.astype(str).tolist()

        batch_data = preprocess_data(
            pairs,
            self.tokenizer,
            self.model,
            self.max_length
        )

        return batch_data

    def __call__(self):
        """Yield every batch of one epoch, then trigger the end-of-epoch hook."""
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Re-permute the row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

# 4. Pretrained model

---

In [9]:
# Load the pretrained tensorflow model
# Both the tokenizer and the model are fetched by checkpoint name; the same
# `t5_tokenizer` / `t5_model` objects are reused for training and generation below.

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior w

In [10]:
# Build the Keras data generators that feed training and validation batches

max_length = 64
batch_size = 16

# Arguments that are identical for both generators
shared_generator_args = dict(
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=max_length,
    batch_size=batch_size,
)

# Training split
train_data_generator = SummarizeDataGenerator(
    n_examples=len(train_pairs),
    data_filename=train_file,
    **shared_generator_args
)

# Validation split
valid_data_generator = SummarizeDataGenerator(
    n_examples=len(valid_pairs),
    data_filename=valid_file,
    **shared_generator_args
)

In [11]:
def build_t5_training_wrapper_model(t5_model, max_length, learning_rate=0.00005):
    """Wrap a T5 model in a compiled Keras functional model for use with ``.fit``.

    Args:
        t5_model: callable model taking ``input_ids`` plus ``attention_mask`` and
            ``decoder_input_ids`` keyword arguments; element 0 of its output is
            used as the logits.
        max_length: sequence length of all three integer inputs.
        learning_rate: learning rate given to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model with inputs
        ``[input_ids, attention_mask, decoder_input_ids]`` and the logits as its
        single output, using sparse categorical cross-entropy on logits and an
        accuracy metric.
    """
    # shape must be a tuple; ``(max_length)`` without the comma is just an int
    input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    # This input carries the decoder input ids; the layer name 'labels' is kept
    # unchanged so the wrapper's input names stay the same for existing callers.
    decoder_input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    # Pass the requested learning rate through: previously Adam() was created
    # without arguments, so the `learning_rate` parameter was silently ignored.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [12]:
# Wrap the pretrained T5 model in a compiled Keras model so it can be trained
# with .fit(); `max_length` must match the length used by the data generators.
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length, learning_rate=0.00005)

In [13]:
# Checkpoint callback: writes the model weights (weights only, not the full
# model) to Google Drive. The file name is a template with {epoch} and
# {val_accuracy} placeholders.

checkpoint_dir = 'drive/My Drive/W266/Final Project/model_checkpoints/'
checkpoint_filename = 't5_reddit_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
checkpoint_filepath = os.path.join(checkpoint_dir, checkpoint_filename)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    save_weights_only=True,
)


In [14]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback.
# Training runs for 3 epochs; the validation generator is evaluated alongside,
# which provides the val_accuracy value used in the checkpoint file name.

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=3,
                  callbacks=[model_checkpoint_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f91e854a5c0>

# 5. Evaluation

In [15]:
# Peek at one test record (a dict with 'post' and 'title' keys)
test_pairs[0]

{'post': 'some background i have been learning python for over a year now and i know some sql a bit of r and i have completed some small projects using data science data engineering practices i also know how to work in excel but i do not really have experience using databases i am totally willing to make the time and money investment in something like a bootcamp and i have the means to do fulltime training but i do not want to do this if there is a better faster way to get into the industry what i really want to know is what can i do that will get me a job in the field asap is there some specific bootcamp that will make this happen if so what are the best bootcamps or some particular tech skill i could learn that would basically guarantee that i am hireable very soon if i something like learned microsoft sql server or tableau and given my other skills would this be likely to get me hired i have been looking into bootcamps like thinkful springboard and data application lab the concern i

In [18]:
# Task prefix prepended to the raw post text before tokenization
prefix = 'create headline for post: '

# Sanity check: generate a headline for one hand-written post
for test_input_text in ['Hello all - I have an upcoming live case interview at CVS for their data science role. Can yall please share your experience of how the interview went? \
                        Did it involve quantitative analysis or was it just qualitative in nature? \
                        Did they supplement the discussion with some data? \
                        Were they expecting a technical ML solution? Or did they only want to guage the candidates thought process and structured communication?']:

    # Tokenize the prefixed post and generate with the default generation settings
    test_inputs = t5_tokenizer([prefix + test_input_text], return_tensors='tf')
    test_output_ids = t5_model.generate(test_inputs['input_ids'])

    # Turn every generated id sequence back into text and show the list
    decoded_outputs = []
    for out_ids in test_output_ids:
        decoded_outputs.append(
            t5_tokenizer.decode(out_ids,
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=False)
        )
    print(decoded_outputs)



['upcoming live case interview']


In [21]:
# Load the saved model weights.
# The file name is one concrete checkpoint (epoch 03, val_accuracy 0.88) written
# by the checkpoint callback's naming template; adjust it if training is re-run.
checkpoint_dir = 'drive/My Drive/W266/Final Project/model_checkpoints/'

checkpoint_filepath = checkpoint_dir + 't5_reddit_weights.03-0.88.hdf5'
# Weights are loaded into the Keras wrapper, which was built around `t5_model`.
model_wrapper.load_weights(checkpoint_filepath)

In [27]:
# Still works? Repeat the single-post sanity check after loading the checkpoint
# weights. Relies on `prefix` defined in the earlier sanity-check cell.
for test_input_text in ['Hello all - I have an upcoming live case interview at CVS for their data science role. Can yall please share your experience of how the interview went? \
                        Did it involve quantitative analysis or was it just qualitative in nature? \
                        Did they supplement the discussion with some data? \
                        Were they expecting a technical ML solution? Or did they only want to guage the candidates thought process and structured communication?']:
    # Tokenize the prefixed post and generate with default generation settings
    test_inputs = t5_tokenizer([prefix + test_input_text], return_tensors='tf')
    test_output_ids = t5_model.generate(test_inputs['input_ids'])

    # Decode each generated sequence to text and print the resulting list
    print([t5_tokenizer.decode(out_ids, skip_special_tokens=True,
                               clean_up_tokenization_spaces=False) for out_ids in test_output_ids])



['upcoming live case interview']


In [28]:
# Candidates: titles generated by the model for the first few test posts
# (this cell only produces the output ids; nothing is decoded here)
EXAMPLES_NUM = 10

# Tokenize each post separately, truncated to 512 tokens
test_posts = []
for item in test_pairs[:EXAMPLES_NUM]:
    test_posts.append(
        t5_tokenizer(item['post'], max_length=512, truncation=True, return_tensors='tf')
    )

# Generate output ids for each tokenized post using beam search
test_output_ids = []
for post in test_posts:
    test_output_ids.append(
        t5_model.generate(post['input_ids'],
                          num_beams=3,
                          no_repeat_ngram_size=2,
                          num_return_sequences=1,  # returns the # of sequences for each post
                          max_length=128)
    )

In [29]:
# Print the layer / parameter-count summary of the T5 model
t5_model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222903552 (850.31 MB)
Trainable params: 222903552 (850.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
# Generate a candidate title for every post in the evaluation subset
EXAMPLES_NUM = len(test_pairs)

print(f"Generating {EXAMPLES_NUM} samples ... ")

# Tokenize each post separately, truncated to 512 tokens
test_posts = [
    t5_tokenizer(item['post'], max_length=512, truncation=True, return_tensors='tf')
    for item in test_pairs[:EXAMPLES_NUM]
]

# Beam-search settings shared by every generate() call
generation_kwargs = dict(
    num_beams=3,
    no_repeat_ngram_size=2,
    num_return_sequences=1,  # returns the # of sequences for each post
    max_length=128,
)

# Time only the generation step
start_time = time.time()
test_output_ids = [
    t5_model.generate(post['input_ids'], **generation_kwargs)
    for post in test_posts
]
end_time = time.time()
run_time = end_time - start_time

print(f"Generate Time elapsed: {run_time} seconds.\n")

# Decode every generated sequence of every post into one flat list of titles
candidates = [
    t5_tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for out_ids in test_output_ids
    for ids in out_ids
]

# Inspect the generated candidates
for idx, candidate in enumerate(candidates):
    print("Candidate #",idx,":\t ",candidate)


Generating 100 samples ... 
Generate Time elapsed: 576.8904876708984 seconds.

Candidate # 0 :	  how do you organize your results
Candidate # 1 :	  request flickmatrix movie score data
Candidate # 2 :	  should i do a phd in data science
Candidate # 3 :	  what are the job prospects of a data scientist
Candidate # 4 :	  data analyst job or not
Candidate # 5 :	  what is the best way to build an api for describing and defining entities in a video
Candidate # 6 :	  is it normal to have a bad working environment for an analytics manager
Candidate # 7 :	  what is the best way to get a masters in ds
Candidate # 8 :	  is it a good idea to learn python without using other languages
Candidate # 9 :	  looking for crime scene datasets
Candidate # 10 :	  what is the best bootcamp for getting a job in the field asap
Candidate # 11 :	  python sngram objs
Candidate # 12 :	  what is the name of this model
Candidate # 13 :	  how to set up a recurring job in retraining
Candidate # 14 :	  question about ma

In [31]:
# References: the actual titles (and their posts) from the test subset that the
# generated titles are compared against
start_time = time.time()

eval_items = test_pairs[:EXAMPLES_NUM]

# reference titles and the posts they belong to, in the same order
references = [eval_item['title'] for eval_item in eval_items]
references_post = [eval_item['post'] for eval_item in eval_items]

# Inspect references list
for idx, reference in enumerate(references):
    print("Reference #",idx,":\t ",reference)

end_time = time.time()
run_time = end_time - start_time
print(f"Generate Time elapsed: {run_time} seconds.\n")

Reference # 0 :	  what does a good scikitlearn workflow look like
Reference # 1 :	  dataset request ranking best films of all time
Reference # 2 :	  ds masters subsequent phd studies
Reference # 3 :	  career change to data science
Reference # 4 :	  how should i start
Reference # 5 :	  building an api query language for rich data like images and video
Reference # 6 :	  joining firm company culture concern
Reference # 7 :	  is a pricey masters degree worth it
Reference # 8 :	  seeking advice for learning path
Reference # 9 :	  crime scene datasetphoto and video database
Reference # 10 :	  i want to transition into the data scienceanalyst or related field rather than asking whether i should choose a particular boootcamp or learn some language i would like to hear opinions on what path should i choose that will land me a job of some kind in the field as soon as possible
Reference # 11 :	  psngram linguistic features for improving machine learning and deep learning model accuracy for the fi

In [32]:
# Score the generated titles against the reference titles with ROUGE and show
# the resulting metrics as a one-row table
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=candidates, references=references)

pd.DataFrame([rouge_results])

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,0.239797,0.08335,0.219164,0.220178


In [33]:
# Score the generated titles against the reference titles with BLEU and show
# the resulting metrics as a one-row table
bleu = evaluate.load("bleu")
bleu_results = bleu.compute(predictions=candidates, references=references)

pd.DataFrame([bleu_results])

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.059214,"[0.2523041474654378, 0.09244791666666667, 0.03...",0.903587,0.90795,868,956


In [34]:
# Put each post, its real title and its generated title side by side

results_columns = {
    'ref_post': references_post,
    'ref_title': references,
    'candidate_title': candidates,
}
df = pd.DataFrame(results_columns)

# Inspect the first rows
print(df.head())

# Write the table to Google Drive, keeping the row index as an 'Index_ID' column
df.to_csv(
    'drive/My Drive/W266/t5_fine_tune_results.csv',
    index_label='Index_ID',
    index=True,
)

                                            ref_post  \
0  suppose you are on a data science team and are...   
1  i would like to request a datasite that ranks ...   
2  i have been considering a midcareer switch fro...   
3  hi everyone i am a practicing attorney conside...   
4  hey everyone i want to become a data analyst a...   

                                         ref_title  \
0  what does a good scikitlearn workflow look like   
1   dataset request ranking best films of all time   
2                ds masters subsequent phd studies   
3                    career change to data science   
4                               how should i start   

                                  candidate_title  
0                how do you organize your results  
1            request flickmatrix movie score data  
2               should i do a phd in data science  
3  what are the job prospects of a data scientist  
4                         data analyst job or not  
