# Initial Commit
First Python notebook for a full run-through of the data pipeline.

In [None]:
try:
    !pip install transformers
    !pip install bertviz
    !pip install tensorflow
    !pip install seaborn
except:
    pass

In [None]:
#@title Create generator for gpt2 pipeline {display-mode: "form"}

import os
from transformers import pipeline, set_seed


# The prompt text is expected at ./Train/<training_text_filename>, relative to
# the current working directory. The path is built from the filename variable
# so that changing the name in one place is enough.
training_text_filename = "method-generation.txt"
training_text_file_location = os.path.join(os.getcwd(), "Train", training_text_filename)

# GPT-2 text-generation pipeline; the seed is fixed so sampling is repeatable.
generator = pipeline('text-generation', model='gpt2', tokenizer='gpt2', output_attentions=True)
set_seed(42)

# Read the whole prompt file into memory. The encoding is stated explicitly so
# the text is decoded the same way on every platform.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(training_text_file_location, encoding="utf-8") as f:
    data = f.read()

In [None]:
#@title Generate new words, tokenize said words {display-mode: "form"}

from transformers import AutoTokenizer, AutoModel

# Request a single continuation of the prompt text, capped at 500 new tokens.
generated_text = generator(data, max_new_tokens=500, num_return_sequences=1)

# Replace each result record with its 'generated_text' string and echo it.
for position, record in enumerate(generated_text):
    generated_text[position] = record['generated_text']
    print(generated_text[position])

# Keep only what follows the first len(data) characters of the first result.
# NOTE(review): presumably the returned string starts with the prompt itself,
# so this slice isolates the newly generated text -- confirm.
new_words = generated_text[0][len(data):]

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Encode the new text, then map the ids of the first (only) sequence back to
# their token strings for the frequency analysis.
tokens = tokenizer(new_words, return_tensors='pt')
tokens_out = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])


In [None]:
#@title Visualize frequency count of output {display-mode: "form"}

import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Tally how many times each generated token occurs.
counts = Counter(tokens_out)

# One row per distinct token, most frequent first, limited to the top 20.
token_freq = (
    pd.DataFrame(counts.items(), columns=['token', 'frequency'])
    .sort_values(by='frequency', ascending=False)
    .head(20)
)

# Horizontal bar chart: tokens on the y-axis, occurrence counts on the x-axis.
sns.barplot(x='frequency', y='token', data=token_freq)

In [None]:
#@title Prepare by running model, getting attention and input token ids {display-mode: "form"}

from transformers import AutoTokenizer, AutoModel

# Character window of the prompt text that is fed to the model; a short
# excerpt keeps the attention visualisations readable.
ATTENTION_SAMPLE_START = 120
ATTENTION_SAMPLE_END = 170

# Load the tokenizer first, then reuse its end-of-sequence token for padding.
# Doing this in two steps means the cell no longer reads `tokenizer` before
# assigning it, so it does not depend on an earlier cell having defined it.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained("gpt2", output_attentions=True)

inputs = tokenizer(data[ATTENTION_SAMPLE_START:ATTENTION_SAMPLE_END], return_tensors='pt')
print(inputs)
outputs = model(**inputs)
attention = outputs.attentions  # Output includes attention weights when output_attentions=True

# Token strings for the first (only) input sequence, used to label the views.
tokens_input = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(tokens_input)


In [None]:
#@title Demo BertViz head view and model view {display-mode: "form"}

from bertviz import head_view, model_view

# Render the BertViz head view for the attention weights and token labels
# produced by the preparation cell.
head_view(attention=attention, tokens=tokens_input)

In [None]:
# Render the BertViz model view for the same attention weights and tokens.
model_view(attention=attention, tokens=tokens_input)