# 1. Setup

In [1]:
# install packages
!pip install -q transformers==4.37.2
!pip install -q sentencepiece
!pip install -q evaluate
!pip install -q tensorflow==2.15
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
# import libraries
import os
import re
import numpy as np
import pandas as pd
import evaluate

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [32]:
import time

In [3]:
# mount Colab to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# verify data exists in Google Drive dir
!ls 'drive/My Drive/W266/Final Project'

 model_checkpoints	     test_pairs.csv    W266_Final_Project_Data_Cleanup.ipynb
'old dataset split'	     train_pairs.csv   W266_Final_Project_Pegasus_Ray.ipynb
 reddit_subset_cleaned.csv   valid_pairs.csv  ' W266_Final_Project_T5_Dipika.ipynb'


# 2. Load Data

In [5]:
# load clean Reddit ds
pd.set_option('display.max_colwidth', None)

df = pd.read_csv('drive/My Drive/W266/Final Project/reddit_subset_cleaned.csv')

df.head(3)

Unnamed: 0,title,post
0,google is invasive nonanonymized ad targeting a quick confirmation of previously suspected privacy issues,i am cross posting this from rcyberlaw hopefully you guys find it as interesting as i didit deals with google analytics so quite awhile ago i ordered a papa john is pizza online my job largely involves looking at ads that appear online so afterwards i was quick to notice i was getting a lot of papa johns ads especially at night being served through a google owned company doubleclick media yesterday one of these ads popped up again on youtube a place that typically serves using the adwords program not doubleclick so i decided to copy the url for those not in the advertising field making full use of google is analytics tool means that certain information about the advertising campaign is leaked in the url so let is break it apart gt heresscs hereampadurl first off we see sscs sscs is doubleclick is redirect variable so rather than directly serving adwords ads they overrode it to serve through doubleclick then redirect through what would otherwise be an adwords link this is tighter integration than is generally seen with adwordsdoubleclick the interesting part is the end variables utm_sourcegooglenetworkamputm_mediumdisplaycpcamputm_campaigngoogleremarketing displaycpcgooglenetwork confirmation that doubleclick is now more finely integrated with adwords googleremarketing huh let is take a look at the definition for remarketing gtusing past campaign information to target a particular message to an audience while in the past behavioral targetting has largely been based on the sum of your use this is an interestingthough no doubt more widespread than is known change in that explicitly targeting old customers though a massive network of sites just thought i would put this out there i am sure it is not new to a lot of people but at least to me it was interesting to see concepts like this actually put into practice on such a large scale ps i did a quick survey across several thousand domains and for the record right now the most common external resource locations on the internet aregoogle owned is bolded pageadgooglesyndicationcom googleadsgdoubleclicknet edgequantservecom addoubleclicknet bscorecardresearchcom smdnnet dgspecificclicknet viewatdmtcom ajaxgoogleapiscom partnergoogleadservicescom that is a lot of data
1,potential job in web analytics need to analyze some data what are they looking for,i decided grad school physics was not for me and i am branching out into the job market a web analytics place is interested in me and i am interested in any kind of data analysis the exercise is to use a comparison of three or more months of data to prepare a to slide powerpoint presentation of any significant information about site visitors what they are doing how they arrive at our site that we could use to improve site performance as an acquisition source he said i should notell a story this is a field i am unfamiliar with so i am looking for any basic tips common pitfalls and expectations thanks i am quite familiar with data analysis in general
2,how to identify which google analytics account is tracking my site,hey all my gf is having trouble with ga and has not gotten any response in days from posting in the google help forums so i figure i would try here question as follows i have a client that we coded a website for and google analytics was plugged into it we would like to look at the statistics for the site and no one can identify what account is associated with the tracking code that is embedded i have pulled the user account number from the source code i am just not sure how to identify what the login associated with it is can anyone help this is a fairly urgent request thanks in advance for any help


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   25000 non-null  object
 1   post    25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [7]:
prefix = 'create headline for post: '
pairs = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    post = row['post']
    title = row['title']
    pairs.append({'post': prefix + post, 'title': title})

print(pairs[0])

{'post': 'create headline for post: i am cross posting this from rcyberlaw hopefully you guys find it as interesting as i didit deals with google analytics so quite awhile ago i ordered a papa john is pizza online my job largely involves looking at ads that appear online so afterwards i was quick to notice i was getting a lot of papa johns ads especially at night being served through a google owned company doubleclick media yesterday one of these ads popped up again on youtube a place that typically serves using the adwords program not doubleclick so i decided to copy the url for those not in the advertising field making full use of google is analytics tool means that certain information about the advertising campaign is leaked in the url so let is break it apart gt heresscs hereampadurl first off we see sscs sscs is doubleclick is redirect variable so rather than directly serving adwords ads they overrode it to serve through doubleclick then redirect through what would otherwise be an

In [9]:
# Let's create some splits
np.random.shuffle(pairs)
num_valid_samples = int(0.10 * len(pairs))
num_train_samples = len(pairs) - 2 * num_valid_samples #allocating 80% of dataset for training

train_pairs = pairs[:num_train_samples]
valid_pairs = pairs[num_train_samples : int(num_train_samples + num_valid_samples * 1.5)]
test_pairs = pairs[int(num_train_samples + num_valid_samples * 1.5):]


print(f"{len(pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


25000 total pairs
20000 training pairs
3750 validation pairs
1250 test pairs


In [None]:
# Save splits to separate csv files, to load only part at a time later
train_file = 'drive/My Drive/W266/Final Project/train_pairs.csv'
valid_file = 'drive/My Drive/W266/Final Project/valid_pairs.csv'
test_file = 'drive/My Drive/W266/Final Project/test_pairs.csv'

pd.DataFrame(train_pairs).to_csv(train_file, index=False)
pd.DataFrame(valid_pairs).to_csv(valid_file, index=False)
pd.DataFrame(test_pairs).to_csv(test_file, index=False)

In [None]:
# Display the first few items of each list
print("Training pairs:")
print(train_pairs[:5])

print("\nValidation pairs:")
print(valid_pairs[:5])

print("\nTest pairs:")
print(test_pairs[:5])

Training pairs:
[{'post': 'create headline for post: hello i am working with classification and regression using a dataset about movies the problem is i have too many features in my dataset and it makes my neural network overfit pretty fast but i cannot remove these features because i am trying to study their influence in my classificationregression methods one thing i am trying to do is to use autoencoders to try to create a new representation of my data in a smaller dimension the problem is no matter how big i have tried some configuration of networks my network is my autoencoder cannot seem to learn even in the train data as i was testing how to build this autoencoder i was trying to make it overfit so i could reduce complexity later and try to balance the training accuracy with the validation accuracy but i could not do this i would like to know what am i doing wrong or if there are any tips that you could give me to make it work better describing my dataset my dataset is about mov

In [10]:
# File paths for the split CSV files
train_file = 'drive/My Drive/W266/Final Project/train_pairs.csv'
valid_file = 'drive/My Drive/W266/Final Project/valid_pairs.csv'
test_file = 'drive/My Drive/W266/Final Project/test_pairs.csv'

# Load the CSV files into lists of dictionaries
train_pairs = pd.read_csv(train_file).to_dict('records')
valid_pairs = pd.read_csv(valid_file).to_dict('records')
test_pairs = pd.read_csv(test_file).to_dict('records')

# Display the first few items of each list
print("Training pairs:")
print(train_pairs[:5])

print("\nValidation pairs:")
print(valid_pairs[:5])

print("\nTest pairs:")
print(test_pairs[:5])

Training pairs:
[{'post': 'my goal is to detect an object that is being flashed in front of a camera therefore the input is a video converted into images frames and the sequence matters i am trying to figure out how to go about training it usually video object detection algorithms detect objects in each frame my problem is that the objects i am trying to classify are similar and the object is not fully visible in any single frame because of a hand holding it in order to correctly tell what the object is you have to look at multiple frames i found which looks at the data sequentially but i am not sure how to map it to my situation anyone else knows about any implementations of a similar problem or potential ways of solving it', 'title': 'video object detection detecting object in the video frames sequentially'}, {'post': 'hi everyone for my master is thesis at the vrije universiteit amsterdam i am researching the environment for it professionals to report wrongdoing related to software 

# 3. Preprocessor and Data Generator

In [11]:
def preprocess_data(pairs, tokenizer, model, max_length=128):
    post_text = [post for post, title in pairs]
    post_encoded = tokenizer.batch_encode_plus(
        post_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    post_input_ids = np.array(post_encoded["input_ids"], dtype="int32")
    post_attention_masks = np.array(post_encoded["attention_mask"], dtype="int32")

    title_text = [title for post, title in pairs]
    title_encoded = tokenizer.batch_encode_plus(
        title_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    label_ids = np.array(title_encoded['input_ids'])
    decoder_input_ids = model._shift_right(label_ids)

    return [post_input_ids, post_attention_masks, decoder_input_ids], label_ids

In [12]:
class SummarizeDataGenerator(tf.keras.utils.Sequence):

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=16,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Initialize row order, call on_epoch_end to shuffle row indices
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

    def __len__(self):
        # Return the number of batches in the full dataset
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size

        # Indices to skip are the ones in the shuffled row_order before and
        # after the chunk we'll use for this batch
        batch_idx_skip = self.row_order[:batch_start] + self.row_order[batch_end:]
        df = pd.read_csv(self.data_filename, skiprows=batch_idx_skip)

        pairs = df[['post', 'title']].values.astype(str).tolist()

        batch_data = preprocess_data(
            pairs,
            self.tokenizer,
            self.model,
            self.max_length
        )

        return batch_data

    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

# 4. Pretrained model

---

In [13]:
# Load the pretrained tensorflow model

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [14]:
# Create the data generators for train and validation data, tensorflow version

max_length = 32
batch_size = 16

train_data_generator = SummarizeDataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=len(train_pairs),
    data_filename=train_file,
    max_length=max_length,
    batch_size=batch_size
)

valid_data_generator = SummarizeDataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=len(valid_pairs),
    data_filename=valid_file,
    max_length=max_length,
    batch_size=batch_size
)

In [18]:
def build_t5_training_wrapper_model(t5_model, max_length, learning_rate=0.0005):
    input_ids = layers.Input(shape=(max_length), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(max_length), dtype=tf.int32, name='attention_mask')
    decoder_input_ids = layers.Input(shape=(max_length), dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [23]:
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length, learning_rate=0.0005)

In [24]:
# add a model checkpoint callback to save
# the trained model weights after each epoch.

checkpoint_dir = 'drive/My Drive/W266/Final Project/model_checkpoints/'
checkpoint_filepath = checkpoint_dir + 't5_reddit_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True)


In [25]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=3,
                  callbacks=[model_checkpoint_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7af0902da8f0>

# 5. Evaluation

In [26]:
test_pairs[0]

{'post': 'some background i have been learning python for over a year now and i know some sql a bit of r and i have completed some small projects using data science data engineering practices i also know how to work in excel but i do not really have experience using databases i am totally willing to make the time and money investment in something like a bootcamp and i have the means to do fulltime training but i do not want to do this if there is a better faster way to get into the industry what i really want to know is what can i do that will get me a job in the field asap is there some specific bootcamp that will make this happen if so what are the best bootcamps or some particular tech skill i could learn that would basically guarantee that i am hireable very soon if i something like learned microsoft sql server or tableau and given my other skills would this be likely to get me hired i have been looking into bootcamps like thinkful springboard and data application lab the concern i

In [27]:
for test_input_text in ['Hello all - I have an upcoming live case interview at CVS for their data science role. Can yall please share your experience of how the interview went? \
                        Did it involve quantitative analysis or was it just qualitative in nature? \
                        Did they supplement the discussion with some data? \
                        Were they expecting a technical ML solution? Or did they only want to guage the candidates thought process and structured communication?']:

    test_inputs = t5_tokenizer([prefix + test_input_text], return_tensors='tf')
    test_output_ids = t5_model.generate(test_inputs['input_ids'])

    print([t5_tokenizer.decode(out_ids, skip_special_tokens=True,
                               clean_up_tokenization_spaces=False) for out_ids in test_output_ids])



['upcoming live case interview at CVS']


In [28]:
#load the saved model weights
checkpoint_dir = 'drive/My Drive/W266/Final Project/model_checkpoints/'

checkpoint_filepath = checkpoint_dir + 't5_reddit_weights.03-0.75.hdf5'
model_wrapper.load_weights(checkpoint_filepath)

In [29]:
# Still works?
for test_input_text in ['Hello all - I have an upcoming live case interview at CVS for their data science role. Can yall please share your experience of how the interview went? \
                        Did it involve quantitative analysis or was it just qualitative in nature? \
                        Did they supplement the discussion with some data? \
                        Were they expecting a technical ML solution? Or did they only want to guage the candidates thought process and structured communication?']:
    test_inputs = t5_tokenizer([prefix + test_input_text], return_tensors='tf')
    test_output_ids = t5_model.generate(test_inputs['input_ids'])

    print([t5_tokenizer.decode(out_ids, skip_special_tokens=True,
                               clean_up_tokenization_spaces=False) for out_ids in test_output_ids])

['upcoming live case interview at CVS']


In [None]:
# Candidates: these are the actual Reddit titles from the test set
EXAMPLES_NUM = 10

test_posts = [t5_tokenizer(item['post'], max_length=512, truncation=True, return_tensors='tf') for item in test_pairs[:EXAMPLES_NUM]]

# Generating output ids for each tokenized post
test_output_ids = [t5_model.generate(post['input_ids'],
                                    num_beams=3,
                                    no_repeat_ngram_size=2,
                                    num_return_sequences=1,  # returns the # of sequences for each post
                                    max_length=128) for post in test_posts]

In [30]:
t5_model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222903552 (850.31 MB)
Trainable params: 222903552 (850.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
EXAMPLES_NUM = len(test_pairs)

print(f"Generating {EXAMPLES_NUM} samples ... ")

# Generate input ids for each tokenized post
test_posts = [t5_tokenizer(item['post'], max_length=512, truncation=True, return_tensors='tf') for item in test_pairs[:EXAMPLES_NUM]]

# Generating output ids for each tokenized post
start_time = time.time()
test_output_ids = [t5_model.generate(post['input_ids'],
                                    num_beams=3,
                                    no_repeat_ngram_size=2,
                                    num_return_sequences=1,  # returns the # of sequences for each post
                                    max_length=128) for post in test_posts]

end_time = time.time()
run_time = end_time - start_time

print(f"Generate Time elapsed: {run_time} seconds.\n")

# initialize list of candidates
candidates = []

# Decode each output in the batch of generated outputs
for out_ids in test_output_ids:
    # Decode each output in the batch of generated outputs
    candidates_batch = [t5_tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for ids in out_ids]
    candidates.extend(candidates_batch)  # Extend the main candidates list with the batch

# Inspect references list
for idx, candidate in enumerate(candidates):
  print("Candidate #",idx,":\t ",candidate)
  #print("\t",candidate)


Generating 1250 samples ... 
Generate Time elapsed: 7685.047694444656 seconds.

Candidate # 0 :	  what is the best way to get a job in the field of data science
Candidate # 1 :	  python spacy for ngrams
Candidate # 2 :	  what is the best way of calculating the weights for a neural network
Candidate # 3 :	  what is the best way of storing a large amount of data in the retraining process
Candidate # 4 :	  how do i get the best results out of the mahalanobis dataset
Candidate # 5 :	  i am a data science intern and ive been looking for feedbacks on my internship
Candidate # 6 :	  what is the average salary for a data scientist
Candidate # 7 :	  ms in statistics vs cs for data science
Candidate # 8 :	  what is the difference between a phd and data science
Candidate # 9 :	  how do i prepare for a phd in data science
Candidate # 10 :	  python for data science
Candidate # 11 :	  what is the best way to get a job in data science
Candidate # 12 :	  introducing cafi for college sports
Candidate #

In [36]:
# References: we will compare the generated titles against these actual test values
start_time = time.time()

# reference list
references = [item['title'] for item in test_pairs[:EXAMPLES_NUM]]

# original post
references_post = [item['post'] for item in test_pairs[:EXAMPLES_NUM]]

# Inspect references list
for idx, reference in enumerate(references):
    print("Reference #",idx,":\t ",reference)

end_time = time.time()
run_time = end_time - start_time
print(f"Generate Time elapsed: {run_time} seconds.\n")

Reference # 0 :	  i want to transition into the data scienceanalyst or related field rather than asking whether i should choose a particular boootcamp or learn some language i would like to hear opinions on what path should i choose that will land me a job of some kind in the field as soon as possible
Reference # 1 :	  psngram linguistic features for improving machine learning and deep learning model accuracy for the first time in python new release
Reference # 2 :	  does anyone use any type of gradient smoothing for binned data
Reference # 3 :	  setting up a model for retraining in production
Reference # 4 :	  is mahalanobisdistance matching between points not compatible with onehotencoded datasets
Reference # 5 :	  education level on applications
Reference # 6 :	  how many yoe should i expect to have before i hit kk salary range
Reference # 7 :	  a potential blind spot for new data science degrees
Reference # 8 :	  should i get a mastersphd
Reference # 9 :	  two years until i apply f

In [37]:
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=candidates, references=references)

pd.DataFrame([rouge_results])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,0.184472,0.051011,0.165052,0.165165


In [38]:
bleu = evaluate.load("bleu")
bleu_results = bleu.compute(predictions=candidates, references=references)

pd.DataFrame([bleu_results])

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.029681,"[0.18707643814026792, 0.05043706293706294, 0.015996074582924437, 0.005141962888441762]",1.0,1.023883,12690,12394


In [39]:
# Combine Candidates and References

df = pd.DataFrame({'ref_post': references_post,
                   'ref_title': references,
                   'candidate_title': candidates
                   })
# Inspect DF
print(df.head())

# Export to CSV
df.to_csv('drive/My Drive/W266/t5_results.csv', index=True, index_label='Index_ID')

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        