# 1. Setup

In [1]:
# Install the packages this notebook needs (-q keeps pip output quiet).
# transformers and tensorflow are pinned to exact versions; sentencepiece,
# evaluate and rouge_score are installed at whatever version pip resolves.
!pip install -q transformers==4.37.2
!pip install -q sentencepiece
!pip install -q evaluate
!pip install -q tensorflow==2.15
!pip install -q rouge_score

In [2]:
# import libraries
import os
import re
import numpy as np
import pandas as pd
import evaluate
import time

import tensorflow as tf
print(f"Tensorflow v{tf.__version__}")
from tensorflow import keras
from tensorflow.keras import layers


from transformers import AutoTokenizer, TFBartForConditionalGeneration


from google.colab import drive

Tensorflow v2.15.0


In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE: `drive` is also imported in the imports cell, so this import is redundant.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Verify the project data exists in the Google Drive dir.
# The path is relative, so it is resolved against the current working directory.
!ls 'drive/My Drive/W266'

bart_results.csv  model_checkpoints    reddit_subset_cleaned.csv  train_pairs.csv
gpt_results.csv   opt350m_results.csv  test_pairs.csv		  valid_pairs.csv


# 2. Load Data

In [5]:
# Load the cleaned Reddit dataset.
# max_colwidth=None makes pandas display full cell text instead of truncating it.
pd.set_option('display.max_colwidth', None)

df = pd.read_csv('drive/My Drive/W266/reddit_subset_cleaned.csv')

# Show the first three rows as the cell output
df.head(3)

Unnamed: 0,title,post
0,google is invasive nonanonymized ad targeting a quick confirmation of previously suspected privacy issues,i am cross posting this from rcyberlaw hopefully you guys find it as interesting as i didit deals with google analytics so quite awhile ago i ordered a papa john is pizza online my job largely involves looking at ads that appear online so afterwards i was quick to notice i was getting a lot of papa johns ads especially at night being served through a google owned company doubleclick media yesterday one of these ads popped up again on youtube a place that typically serves using the adwords program not doubleclick so i decided to copy the url for those not in the advertising field making full use of google is analytics tool means that certain information about the advertising campaign is leaked in the url so let is break it apart gt heresscs hereampadurl first off we see sscs sscs is doubleclick is redirect variable so rather than directly serving adwords ads they overrode it to serve through doubleclick then redirect through what would otherwise be an adwords link this is tighter integration than is generally seen with adwordsdoubleclick the interesting part is the end variables utm_sourcegooglenetworkamputm_mediumdisplaycpcamputm_campaigngoogleremarketing displaycpcgooglenetwork confirmation that doubleclick is now more finely integrated with adwords googleremarketing huh let is take a look at the definition for remarketing gtusing past campaign information to target a particular message to an audience while in the past behavioral targetting has largely been based on the sum of your use this is an interestingthough no doubt more widespread than is known change in that explicitly targeting old customers though a massive network of sites just thought i would put this out there i am sure it is not new to a lot of people but at least to me it was interesting to see concepts like this actually put into practice on such a large scale ps i did a quick survey across 
several thousand domains and for the record right now the most common external resource locations on the internet aregoogle owned is bolded pageadgooglesyndicationcom googleadsgdoubleclicknet edgequantservecom addoubleclicknet bscorecardresearchcom smdnnet dgspecificclicknet viewatdmtcom ajaxgoogleapiscom partnergoogleadservicescom that is a lot of data
1,potential job in web analytics need to analyze some data what are they looking for,i decided grad school physics was not for me and i am branching out into the job market a web analytics place is interested in me and i am interested in any kind of data analysis the exercise is to use a comparison of three or more months of data to prepare a to slide powerpoint presentation of any significant information about site visitors what they are doing how they arrive at our site that we could use to improve site performance as an acquisition source he said i should notell a story this is a field i am unfamiliar with so i am looking for any basic tips common pitfalls and expectations thanks i am quite familiar with data analysis in general
2,how to identify which google analytics account is tracking my site,hey all my gf is having trouble with ga and has not gotten any response in days from posting in the google help forums so i figure i would try here question as follows i have a client that we coded a website for and google analytics was plugged into it we would like to look at the statistics for the site and no one can identify what account is associated with the tracking code that is embedded i have pulled the user account number from the source code i am just not sure how to identify what the login associated with it is can anyone help this is a fairly urgent request thanks in advance for any help


In [6]:
# Split data: 80% Train, 15% Validation, 5% Test
# NOTE(review): `pairs` starts empty and nothing in this cell fills it, so
# turning `generate_pairs` on as-is would produce empty splits -- confirm where
# the pairs are supposed to be built from before regenerating.
pairs = []
generate_pairs = False

if generate_pairs:

  np.random.shuffle(pairs)
  # Despite its name this is a 10% sizing unit, not the final validation size
  num_valid_samples = int(0.10 * len(pairs))
  num_train_samples = len(pairs) - 2 * num_valid_samples #allocating 80% of dataset for training

  train_pairs = pairs[:num_train_samples]
  # Validation takes 1.5 units (15%); test gets whatever remains (5%)
  valid_pairs = pairs[num_train_samples : int(num_train_samples + num_valid_samples * 1.5)]
  test_pairs = pairs[int(num_train_samples + num_valid_samples * 1.5):]

In [7]:
# Regenerate train, validation, test files
generate_files = False

# Save splits to separate csv files, to load only part at a time later
train_file = 'drive/My Drive/W266/train_pairs.csv'
valid_file = 'drive/My Drive/W266/valid_pairs.csv'
test_file = 'drive/My Drive/W266/test_pairs.csv'

# Only rewrite the CSVs on request. This branch reads train_pairs / valid_pairs /
# test_pairs, so those names must already be defined when the flag is enabled.
if generate_files:
  pd.DataFrame(train_pairs).to_csv(train_file, index=False)
  pd.DataFrame(valid_pairs).to_csv(valid_file, index=False)
  pd.DataFrame(test_pairs).to_csv(test_file, index=False)

# Load the CSV files into lists of dictionaries (one dict per row, keyed by column name)
train_pairs = pd.read_csv(train_file).to_dict('records')
valid_pairs = pd.read_csv(valid_file).to_dict('records')
test_pairs = pd.read_csv(test_file).to_dict('records')

In [8]:
# Report how many records each collection holds, one line per collection
for split_label, split_records in (
    ("total", pairs),
    ("training", train_pairs),
    ("validation", valid_pairs),
    ("test", test_pairs),
):
    print(f"{len(split_records)} {split_label} pairs")

0 total pairs
20000 training pairs
3750 validation pairs
1250 test pairs


In [9]:
# Preview the first five records of every split
for split_header, split_records in (
    ("Training pairs:", train_pairs),
    ("\nValidation pairs:", valid_pairs),
    ("\nTest pairs:", test_pairs),
):
    print(split_header)
    print(split_records[:5])

Training pairs:
[{'post': 'my goal is to detect an object that is being flashed in front of a camera therefore the input is a video converted into images frames and the sequence matters i am trying to figure out how to go about training it usually video object detection algorithms detect objects in each frame my problem is that the objects i am trying to classify are similar and the object is not fully visible in any single frame because of a hand holding it in order to correctly tell what the object is you have to look at multiple frames i found which looks at the data sequentially but i am not sure how to map it to my situation anyone else knows about any implementations of a similar problem or potential ways of solving it', 'title': 'video object detection detecting object in the video frames sequentially'}, {'post': 'hi everyone for my master is thesis at the vrije universiteit amsterdam i am researching the environment for it professionals to report wrongdoing related to software 

# 3. Preprocessor and Data Generator

In [10]:

def shift_right(input_ids, sos_token_id=0):
    """Build decoder inputs by shifting token ids one position to the right.

    Every row gets `sos_token_id` in its first position, followed by the
    original row without its last token, so the sequence length is unchanged.

    The work is done in NumPy, so the result keeps the dtype of `input_ids`
    and no TensorFlow round-trip is needed.

    Args:
        input_ids: 2-D array-like of token ids, one row per sequence.
        sos_token_id: id written into the first position of every row.
            NOTE(review): the default of 0 is kept for backward compatibility;
            confirm it matches the decoder start token the model expects.

    Returns:
        A new `numpy.ndarray` with the same shape and dtype as `input_ids`.
        The input is not modified.

    Raises:
        ValueError: if `input_ids` is not 2-D.
    """
    ids = np.asarray(input_ids)
    if ids.ndim != 2:
        raise ValueError(
            f"input_ids must be 2-D (batch, sequence), got shape {ids.shape}"
        )

    shifted_input_ids = np.empty_like(ids)
    # Guard the zero-length case so the output never grows beyond the input
    if ids.shape[1] > 0:
        shifted_input_ids[:, 0] = sos_token_id
        shifted_input_ids[:, 1:] = ids[:, :-1]

    return shifted_input_ids

def preprocess_data(pairs, tokenizer, max_length=128):
    """Tokenize (post, title) pairs into encoder inputs, decoder inputs and labels.

    Args:
        pairs: sequence of two-item (post, title) entries.
        tokenizer: object exposing `batch_encode_plus`; it is asked for
            TensorFlow tensors padded/truncated to `max_length`.
        max_length: fixed length used for both posts and titles.

    Returns:
        A tuple `(features, label_ids)` where `features` is the list
        `[post_input_ids, post_attention_masks, decoder_input_ids]`.
        The post arrays and `label_ids` are int32 NumPy arrays;
        `decoder_input_ids` is `shift_right(label_ids)`.
    """
    # Settings shared by the post and the title encoding calls
    encode_kwargs = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

    # Encoder side: the Reddit posts
    source_texts = [source for source, _ in pairs]
    encoded_posts = tokenizer.batch_encode_plus(
        source_texts,
        return_attention_mask=True,
        **encode_kwargs
    )
    post_input_ids = np.array(encoded_posts["input_ids"], dtype="int32")
    post_attention_masks = np.array(encoded_posts["attention_mask"], dtype="int32")

    # Decoder side: the titles act as labels
    target_texts = [target for _, target in pairs]
    encoded_titles = tokenizer.batch_encode_plus(target_texts, **encode_kwargs)
    label_ids = np.array(encoded_titles['input_ids'], dtype='int32')

    # Decoder inputs are the labels shifted one step to the right
    decoder_input_ids = shift_right(label_ids)

    features = [post_input_ids, post_attention_masks, decoder_input_ids]
    return features, label_ids

In [11]:
class SummarizeDataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that reads (post, title) batches from a CSV file on demand.

    Only the rows belonging to the requested batch are parsed from
    `data_filename`, so the full dataset never has to sit in memory. The CSV
    must have a header row and `post` and `title` columns.

    Args:
        tokenizer: tokenizer forwarded to `preprocess_data`.
        model: stored on the instance; not used by the generator itself.
        n_examples: number of data rows (header excluded) to draw from.
        data_filename: path of the CSV file to read.
        max_length: sequence length forwarded to `preprocess_data`.
        batch_size: number of rows per batch.
        shuffle: if True, the row order is re-permuted on construction and at
            the end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=16,
                 shuffle=True):
        # Let the Keras base class initialise any state of its own
        super().__init__()

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # File row numbers of the data rows: row 0 is the CSV header, so the
        # examples live in rows 1..n_examples. on_epoch_end shuffles them.
        self.row_order = np.arange(1, self.n_examples + 1)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Read, tokenize and return batch number `idx`.

        Returns:
            `(inputs, labels)` where `inputs` is a dict with the keys
            `input_ids`, `attention_mask` and `decoder_input_ids`, and
            `labels` are the label ids produced by `preprocess_data`.
        """
        batch_start = idx * self.batch_size
        batch_end = batch_start + self.batch_size

        # A set gives O(1) membership checks. The previous version tested every
        # CSV row against an array of all rows to skip, which made each batch
        # cost O(n_examples ** 2) comparisons.
        keep_rows = {int(row) for row in self.row_order[batch_start:batch_end]}

        # Keep the header (row 0) and this batch's rows; skip everything else,
        # including any rows beyond n_examples. Rows come back in file order.
        df = pd.read_csv(
            self.data_filename,
            skiprows=lambda row: row != 0 and row not in keep_rows
        )

        pairs = df[['post', 'title']].values.astype(str).tolist()

        batch_data, labels = preprocess_data(
            pairs,
            self.tokenizer,
            self.max_length
        )

        inputs = {
            'input_ids': batch_data[0],
            'attention_mask': batch_data[1],
            'decoder_input_ids': batch_data[2]
        }

        return inputs, labels

    def __call__(self):
        """Yield every batch once, then reshuffle for the next pass."""
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__() - 1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Re-permute the row order when shuffling is enabled."""
        if self.shuffle:
            # Stays an ndarray so row_order has one type for the object's lifetime
            self.row_order = np.random.permutation(self.row_order)

# 4. Pretrained model

---

In [12]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Load the pretrained BART checkpoint and the tokenizer that belongs to it.
# NOTE: the model is created with TFBartForConditionalGeneration (imported in the
# imports cell); TFAutoModelForSeq2SeqLM imported just above is not used here.
checkpoint = 'facebook/bart-large-cnn'
model = TFBartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [13]:
# Create the data generators for train, validation and test data, tensorflow version.
# Each generator reads its own CSV file and is sized from the matching list of pairs.
# NOTE: `shuffle` is left at its default (True) for all three, so the validation
# and test generators shuffle their row order as well.

max_length = 64 #32
batch_size = 16

train_data_generator = SummarizeDataGenerator(
    tokenizer=tokenizer,
    model=model,
    n_examples=len(train_pairs),
    data_filename=train_file,
    max_length=max_length,
    batch_size=batch_size
)

valid_data_generator = SummarizeDataGenerator(
    tokenizer=tokenizer,
    model=model,
    n_examples=len(valid_pairs),
    data_filename=valid_file,
    max_length=max_length,
    batch_size=batch_size
)

test_data_generator = SummarizeDataGenerator(
    tokenizer=tokenizer,
    model=model,
    n_examples=len(test_pairs),
    data_filename=test_file,
    max_length=max_length,
    batch_size=batch_size
)

In [14]:
from transformers import TFBartForConditionalGeneration
import tensorflow as tf
from tensorflow.keras import layers

def build_bart_training_model(max_length=64, learning_rate=0.0005):
    """Wrap the global pretrained `model` in a compiled Keras training model.

    The wrapper takes three fixed-length int32 inputs (`input_ids`,
    `attention_mask`, `decoder_input_ids`) and outputs the logits of `model`.

    Args:
        max_length: sequence length of every input.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        A `tf.keras.Model` compiled with Adam, sparse categorical
        cross-entropy on logits, and an accuracy metric.
    """
    def _token_input(name):
        # All three inputs share the same shape and dtype; only the name differs
        return layers.Input(shape=(max_length,), dtype=tf.int32, name=name)

    encoder_ids = _token_input('input_ids')
    encoder_mask = _token_input('attention_mask')
    decoder_ids = _token_input('decoder_input_ids')

    # Run the pretrained network symbolically and keep only its logits
    logits = model(
        encoder_ids,
        attention_mask=encoder_mask,
        decoder_input_ids=decoder_ids,
    ).logits

    training_model = tf.keras.models.Model(
        inputs=[encoder_ids, encoder_mask, decoder_ids],
        outputs=[logits],
    )

    # Optimizer, loss and metric for fine-tuning
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    token_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    training_model.compile(optimizer=adam, loss=token_loss, metrics=['accuracy'])

    return training_model


In [15]:
# Build the compiled training wrapper around the pretrained BART model.
# NOTE(review): 64 is hard-coded here rather than reusing `max_length` -- keep the two in sync.
bart_model = build_bart_training_model(max_length=64, learning_rate=0.0005)


In [16]:
# Add a model checkpoint callback that saves the trained model weights.
# With save_best_only=True a file is only written for epochs where
# val_accuracy improves, not unconditionally after every epoch.
# NOTE(review): this path uses 'drive/MyDrive' while the data paths in this
# notebook use 'drive/My Drive' -- confirm both resolve to the same folder.

checkpoint_dir = 'drive/MyDrive/W266/model_checkpoints/'
# The epoch number and validation accuracy are filled into the file name
checkpoint_filepath = checkpoint_dir + 'bart2_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True
    )


In [None]:
NUM_EPOCHS = 1

# Fine-tune on the training generator and validate on the validation generator.
# `batch_size` is deliberately not passed: the generators already yield complete
# batches of their own size, and the Keras `fit` documentation says not to
# specify it for `keras.utils.Sequence` inputs.
bart_model.fit(train_data_generator,
               validation_data=valid_data_generator,
               epochs=NUM_EPOCHS,
               callbacks=[model_checkpoint_callback]
)

In [None]:
# Smoke test: generate a title for a single hand-written post.
# The backslash continuations sit inside the string literal, so the leading
# whitespace of the continuation lines is part of the text fed to the tokenizer.
for test_input_text in ['Hello all - I have an upcoming live case interview at CVS for their data science role. Can yall please share your experience of how the interview went? \
                        Did it involve quantitative analysis or was it just qualitative in nature? \
                        Did they supplement the discussion with some data? \
                        Were they expecting a technical ML solution? Or did they only want to guage the candidates thought process and structured communication?']:

    # Tokenize the post (truncated to 512 tokens) and generate with beam search
    test_inputs = tokenizer(test_input_text, max_length=512, truncation=True, return_tensors='tf')
    test_output_ids = model.generate(test_inputs['input_ids'], num_beams=4, max_length=56)

    # Decode every generated sequence back to text, dropping special tokens
    print([tokenizer.decode(out_ids, skip_special_tokens=True,
                             clean_up_tokenization_spaces=False) for out_ids in test_output_ids])

# Actual Title: CVS Data scientist case interview


# 5. Evaluation

In [None]:
# Show the first test record as the cell output
test_pairs[0]

In [None]:
# Candidates: titles generated by the model for posts taken from the test set
EXAMPLES_NUM = 10
# Start Compute Units: 114.38 @ 4.91 per hour
# Start Compute Units: 100.77 @ 4.91 per hour

#EXAMPLES_NUM = len(test_pairs)
print(f"Generating {EXAMPLES_NUM} samples ... ")

# Tokenize each post on its own (one encoding per post, truncated to 512 tokens)
test_posts = [tokenizer(item['post'], max_length=512, truncation=True, return_tensors='tf') for item in test_pairs[:EXAMPLES_NUM]]
#test_posts = [p_tokenizer(item['post'], max_length=512, truncation=True, return_tensors='tf') for item in test_pairs[:]]

# Generate output ids for each tokenized post, one generate() call per post, and time it
start_time = time.time()
test_output_ids = [model.generate(post['input_ids'],
                                    num_beams=3,
                                    no_repeat_ngram_size=2,
                                    num_return_sequences=1,  # number of sequences returned for each post
                                    max_length=128) for post in test_posts]

end_time = time.time()
run_time = end_time - start_time

print(f"Generate Time elapsed: {run_time} seconds.\n")
# Ending Compute Units: 114.38

# initialize list of candidates
candidates = []

# Each generate() result is a batch of sequences; decode every sequence in it
for out_ids in test_output_ids:
    # Turn the token ids back into text, dropping special tokens
    candidates_batch = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for ids in out_ids]
    candidates.extend(candidates_batch)  # Extend the main candidates list with the batch

# Inspect candidates list
for idx, candidate in enumerate(candidates):
  print("Candidate #",idx,":\t ",candidate)
  #print("\t",candidate)

In [None]:
# References: we will compare the generated titles against these actual test values
start_time = time.time()

# reference list: the actual titles of the first EXAMPLES_NUM test records
references = [item['title'] for item in test_pairs[:EXAMPLES_NUM]]

# original posts the titles belong to (kept for the exported results table)
references_post = [item['post'] for item in test_pairs[:EXAMPLES_NUM]]

# Inspect references list
for idx, reference in enumerate(references):
    print("Reference #",idx,":\t ",reference)

# NOTE: nothing is generated in this cell; the time printed below only covers
# building and printing the reference lists.
end_time = time.time()
run_time = end_time - start_time
print(f"Generate Time elapsed: {run_time} seconds.\n")

In [None]:
# Score the generated titles against the actual titles with ROUGE
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=candidates, references=references)

# Show the scores as a one-row table
pd.DataFrame([rouge_results])

In [None]:
# Score the generated titles against the actual titles with BLEU
bleu = evaluate.load("bleu")
bleu_results = bleu.compute(predictions=candidates, references=references)

# Show the scores as a one-row table
pd.DataFrame([bleu_results])

In [None]:
# Combine Candidates and References into one table: original post, actual
# title, and the title generated by the model.

df = pd.DataFrame({'ref_post': references_post,
                   'ref_title': references,
                   'candidate_title': candidates
                   })
# Inspect DF
print(df.head())

# Export to CSV.
# These are BART outputs, so they go to bart_results.csv. The previous target,
# opt350m_results.csv, would have overwritten another model's results.
# NOTE(review): confirm bart_results.csv is the intended destination.
df.to_csv('drive/My Drive/W266/bart_results.csv', index=True, index_label='Index_ID')