In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install the transformers package into the kernel's environment; the
# %%capture cell magic above hides the installer output.
# NOTE(review): the version is not pinned, so a rerun may install a different
# release than the one these outputs were produced with — consider pinning.
%pip install transformers

In [None]:
import tensorflow as tf
from threading import Thread
from time import perf_counter
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

import os
import csv
import random

# --- Run configuration -------------------------------------------------------
SEED = 42                         # passed to tf.random.set_seed before the run
ITERS = 10                        # number of samples generated per topic
MIN_TOKENS, MAX_TOKENS = 45, 60   # bounds for the randomly drawn max_length
MODEL_NAME = "gpt2-medium"        # checkpoint name given to from_pretrained

# Output root on the mounted Drive; each topic gets its own sub-folder below it.
DATASET_PATH = (
  '/content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/'
  f'data-{MODEL_NAME}'
)

# Prompts fed to the model; each prompt also names its output folder.
TOPICS = [
  "Indian climate", "Indian defense",
  "Indian economy", "Indian infrastructure",
]

def get_transformer() -> tuple:
  """Load the pretrained tokenizer and language model named by MODEL_NAME.

  The model is created with its pad token id set to the tokenizer's
  end-of-sequence token id.

  Returns:
    A `(tokenizer, model)` pair: a GPT2Tokenizer and a TFGPT2LMHeadModel.
  """
  gpt2_tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
  eos_id = gpt2_tokenizer.eos_token_id
  lm_model = TFGPT2LMHeadModel.from_pretrained(MODEL_NAME, pad_token_id=eos_id)
  return gpt2_tokenizer, lm_model

def get_encoding(tokenizer: GPT2Tokenizer, prompt: str) -> tf.Tensor:
  """Tokenize `prompt` and return its token ids as a TensorFlow tensor.

  Args:
    tokenizer: tokenizer used to encode the prompt.
    prompt: text to encode.

  Returns:
    Whatever `tokenizer.encode` produces with `return_tensors='tf'`, i.e. a
    TensorFlow tensor of token ids (not a dict, as previously annotated).
  """
  return tokenizer.encode(prompt, return_tensors='tf')

def get_text(tokenizer: GPT2Tokenizer, embedding: tf.Tensor) -> str:
  """Decode a batch of generated token ids and return the first sequence as text.

  Args:
    tokenizer: tokenizer used to decode the ids.
    embedding: batch of token ids; in this notebook it is the value returned
      by `model.generate` (token ids rather than embeddings — the parameter
      name is kept for compatibility with existing callers).

  Returns:
    The decoded text of the first sequence in the batch, with special tokens
    skipped and tokenization spaces cleaned up.
  """
  decoded_batch = tokenizer.batch_decode(
    embedding,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
  )
  # Only the first sequence of the batch is returned.
  return decoded_batch[0]

def generate_text(tokenizer: GPT2Tokenizer, model: TFGPT2LMHeadModel, prompt: str) -> None:
  """Generate ITERS text samples for `prompt` and log each one to its CSV file.

  The prompt's directory and CSV file (with header) are created first, then
  each iteration draws a maximum length between MIN_TOKENS and MAX_TOKENS,
  samples one continuation from the model, decodes it and appends a row.

  Args:
    tokenizer: tokenizer used to encode the prompt and decode the output.
    model: language model whose `generate` method produces the samples.
    prompt: text prompt; also used as the name of the output sub-directory.
  """
  # Prepare <DATASET_PATH>/<prompt>/data.csv with a fresh header row.
  create_directory(prompt)
  create_csv_file(prompt)

  # The prompt is encoded once and reused for every iteration.
  input_embedding = get_encoding(tokenizer, prompt)

  # Private RNG seeded from SEED and the prompt. The module-level `random`
  # functions are unseeded and shared by every worker thread, so lengths drawn
  # from them would differ from run to run and depend on thread scheduling.
  length_rng = random.Random(f'{SEED}:{prompt}')

  for i in range(ITERS):
    max_len = length_rng.randint(MIN_TOKENS, MAX_TOKENS)
    print(f'Generating text for "{prompt}"... ({i + 1}/{ITERS})')

    # Sample one continuation (sampling settings unchanged from the original).
    output_embedding = model.generate(
        input_embedding,
        no_repeat_ngram_size=2,
        do_sample=True,
        max_length=max_len,
        temperature=0.9,
        top_k=40,
        top_p=0.95,
    )

    # Decode the generated ids and record them together with the loop index
    # and the length limit that was used.
    generated_text = get_text(tokenizer, output_embedding)
    append_to_csv_file(prompt, generated_text, i, max_len)

def create_directory(prompt: str) -> None:
  """Ensure the output directory `<DATASET_PATH>/<prompt>` exists.

  Args:
    prompt: prompt text, used verbatim as the directory name.
  """
  # exist_ok=True makes this idempotent and removes the check-then-create
  # window of the former `os.path.exists` test.
  os.makedirs(f'{DATASET_PATH}/{prompt}', exist_ok=True)

  # The message is printed whether or not the directory already existed.
  print(f'Created directory for "{prompt}" at {DATASET_PATH}/{prompt}...')

def create_csv_file(prompt: str) -> None:
  """Create (or truncate) `<DATASET_PATH>/<prompt>/data.csv` and write its header.

  The prompt's directory must already exist.

  Args:
    prompt: prompt text, used verbatim as the sub-directory name.
  """
  fields = ['Prompt', 'Generated Text', 'i', 'MAX_LEN']

  # newline='' hands line-ending control to the csv module, as its
  # documentation asks for; the explicit encoding keeps the file independent
  # of the platform's default locale.
  with open(f'{DATASET_PATH}/{prompt}/data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fields, lineterminator='\n')
    writer.writeheader()

  print(f'Created CSV file for "{prompt}" at {DATASET_PATH}/{prompt}/data.csv...')

def append_to_csv_file(prompt: str, generated_text: str, i: int, MAX_LEN: int) -> None:
  """Append one generated sample as a row of `<DATASET_PATH>/<prompt>/data.csv`.

  Args:
    prompt: prompt text; selects the sub-directory and fills the first column.
    generated_text: decoded model output for this sample.
    i: zero-based index of the sample within the run.
    MAX_LEN: maximum length that was passed to the generator for this sample.
  """
  # newline='' lets the csv module control line endings, and the explicit
  # encoding avoids depending on the platform's default locale.
  with open(f'{DATASET_PATH}/{prompt}/data.csv', 'a', newline='', encoding='utf-8') as f:
    # Use '\n' like the header row does; the writer's default '\r\n' would
    # leave the file with mixed line endings.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow([prompt, generated_text, i, MAX_LEN])

def main() -> None:
  """Run `generate_text` for every topic in TOPICS, one thread per topic.

  A single tokenizer/model pair is loaded and shared by all threads. The
  function returns once every thread has finished.
  """
  tokenizer, model = get_transformer()

  # One worker per topic, all sharing the same tokenizer and model.
  workers = [
    Thread(target=generate_text, args=(tokenizer, model, topic))
    for topic in TOPICS
  ]

  for worker in workers:
    worker.start()

  # Block until every worker has completed.
  for worker in workers:
    worker.join()

In [None]:
# Seed both random sources before the run. tf.random.set_seed does not touch
# Python's `random` module, which this notebook also uses, so seed it too.
tf.random.set_seed(SEED) # for reproducibility
random.seed(SEED)

# Time the whole generation run (model loading included) with a monotonic clock.
start_time = perf_counter()
print("Starting...")
main()
print("Done!")
end_time = perf_counter()

print(f"Time Taken: {end_time - start_time:0.4f} second(s) elapsed")

Starting...


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Created directory for "Indian climate" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-gpt2-medium/Indian climate...
Created CSV file for "Indian climate" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-gpt2-medium/Indian climate/data.csv...
Created directory for "Indian defense" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-gpt2-medium/Indian defense...
Created CSV file for "Indian defense" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-gpt2-medium/Indian defense/data.csv...
Generating text for "Indian defense"... (1/10)
Generating text for "Indian climate"... (1/10)
Created directory for "Indian infrastructure" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-gpt2-medium/Indian infrastructure...
Created CSV file for "Indian infrastructure" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-gpt2-medium/Indian infrastructure/data.csv...
Ge

### Exploration

In [None]:
import pandas as pd

In [None]:
# Read each topic's data.csv into a DataFrame and index the frames by topic.
dfs = dict()

for topic in TOPICS:
  df = dfs[topic] = pd.read_csv(f"{DATASET_PATH}/{topic}/data.csv")

In [None]:
# Display the samples loaded for the "Indian economy" prompt.
dfs['Indian economy']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,Indian economy,"Indian economy,"" as many of them are of Pakist...",0,50
1,Indian economy,Indian economy is the only country where growt...,1,55
2,Indian economy,Indian economy will survive unless you reduce ...,2,48
3,Indian economy,Indian economy.\n\nThe economy of Kerala is gr...,3,53
4,Indian economy,"Indian economy to recover.""\n\nThe report, whi...",4,47
5,Indian economy,"Indian economy will suffer, especially in the ...",5,58
6,Indian economy,Indian economy is based on exports and investm...,6,49
7,Indian economy,Indian economy is becoming smaller and smaller...,7,50
8,Indian economy,Indian economy has not been the most stable of...,8,49
9,Indian economy,"Indian economy,"" the report said.\n\nMr. Pang ...",9,60


In [None]:
# Display the samples loaded for the "Indian climate" prompt.
dfs['Indian climate']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,Indian climate,Indian climate is very different from the typi...,0,45
1,Indian climate,Indian climate is a very good guide for how th...,1,56
2,Indian climate,Indian climate is known for its temperate temp...,2,57
3,Indian climate,"Indian climate is very different from Europe, ...",3,57
4,Indian climate,"Indian climate, which includes warmer water te...",4,49
5,Indian climate,"Indian climate, as a whole, has the potential ...",5,49
6,Indian climate,Indian climate is in a much better state than ...,6,47
7,Indian climate,Indian climate may be one of the strongest nat...,7,52
8,Indian climate,"Indian climate"" which is not so hot, nor cold ...",8,55
9,Indian climate,Indian climate and weather patterns also cause...,9,55


In [None]:
# Display the samples loaded for the "Indian defense" prompt.
dfs['Indian defense']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,Indian defense,Indian defense minister Motti Yahya said the a...,0,58
1,Indian defense,Indian defense minister confirmed in a televis...,1,52
2,Indian defense,Indian defense officials say the attack was ca...,2,46
3,Indian defense,Indian defense officials have confirmed the de...,3,46
4,Indian defense,Indian defense minister Fikri Ismaili said Isr...,4,55
5,Indian defense,Indian defense ministry spokesman Maj. Gen. Ig...,5,56
6,Indian defense,"Indian defense minister, Major-General Binyami...",6,46
7,Indian defense,Indian defense minister Avigdor Lieberman told...,7,49
8,Indian defense,"Indian defense minister, Avigdor Lieberman, sa...",8,51
9,Indian defense,Indian defense minister's comment was taken as...,9,52


In [None]:
# Display the samples loaded for the "Indian infrastructure" prompt.
dfs['Indian infrastructure']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,Indian infrastructure,"Indian infrastructure, including the railway a...",0,46
1,Indian infrastructure,"Indian infrastructure,"" said Ajay Singh, a sen...",1,50
2,Indian infrastructure,Indian infrastructure investment in the form o...,2,57
3,Indian infrastructure,"Indian infrastructure to improve mobility, esp...",3,50
4,Indian infrastructure,"Indian infrastructure, including railways, air...",4,53
5,Indian infrastructure,Indian infrastructure. We have invested billio...,5,49
6,Indian infrastructure,Indian infrastructure is in dire straits. Whil...,6,47
7,Indian infrastructure,"Indian infrastructure.""\n\nBut he went on to e...",7,47
8,Indian infrastructure,"Indian infrastructure, it's easy to forget tha...",8,45
9,Indian infrastructure,Indian infrastructure is the most expensive an...,9,47


In [None]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Narrow the exploration to the "Indian economy" samples; `texts` is the
# column holding the generated strings.
df_eco = dfs['Indian economy']
texts = df_eco['Generated Text']

In [None]:
# Display the generated-text column selected above.
texts

0    Indian economy," as many of them are of Pakist...
1    Indian economy is the only country where growt...
2    Indian economy will survive unless you reduce ...
3    Indian economy.\n\nThe economy of Kerala is gr...
4    Indian economy to recover."\n\nThe report, whi...
5    Indian economy will suffer, especially in the ...
6    Indian economy is based on exports and investm...
7    Indian economy is becoming smaller and smaller...
8    Indian economy has not been the most stable of...
9    Indian economy," the report said.\n\nMr. Pang ...
Name: Generated Text, dtype: object

In [None]:
# Download the NLTK resources this notebook relies on: the punkt tokenizer
# models, the stop-word corpus and the averaged-perceptron POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every token and searched it linearly.
english_stopwords = set(stopwords.words('english'))

# Preview the preprocessing on a single sample: the loop starts at the second
# text and the trailing `break` stops after that first iteration.
for text in texts[1:]:
  print('text:', text)
  sentences = sent_tokenize(text)
  for sentence in sentences:
    print('------')
    print('sentence:', sentence)
    words = word_tokenize(sentence)
    print('pos tags:', nltk.pos_tag(words))
    # Compare the lower-cased token against the stop-word set. Testing the
    # raw token let capitalised stop words such as "The" slip through and
    # show up as 'the' in the processed output.
    words = [
      word.lower()
      for word in words
      if word.isalpha() and word.lower() not in english_stopwords
    ]
    print('processed:', words)
  break

text: Indian economy is the only country where growth is below 7%. The average growth rate is 1.2%.

This is not because of weak demand, but a lack of investment. According to the report, India is ranked as the second worst place for India's economic growth
------
sentence: Indian economy is the only country where growth is below 7%.
pos tags: [('Indian', 'JJ'), ('economy', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('only', 'JJ'), ('country', 'NN'), ('where', 'WRB'), ('growth', 'NN'), ('is', 'VBZ'), ('below', 'IN'), ('7', 'CD'), ('%', 'NN'), ('.', '.')]
processed: ['indian', 'economy', 'country', 'growth']
------
sentence: The average growth rate is 1.2%.
pos tags: [('The', 'DT'), ('average', 'JJ'), ('growth', 'NN'), ('rate', 'NN'), ('is', 'VBZ'), ('1.2', 'CD'), ('%', 'NN'), ('.', '.')]
processed: ['the', 'average', 'growth', 'rate']
------
sentence: This is not because of weak demand, but a lack of investment.
pos tags: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('because', 'IN'), ('of

### Preprocessing