In [None]:
%%capture
# Install the libraries this notebook relies on; %%capture hides pip's output.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases.
%pip install -q accelerate peft bitsandbytes transformers trl tensorboard huggingface_hub[cli] xformers

In [None]:
%%capture
# Presumably required by LlamaTokenizer (used in get_transformer) — TODO confirm.
%pip install sentencepiece

In [None]:
# Interactive login to the Hugging Face Hub; prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root

In [None]:
import os
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig
from trl import SFTTrainer
import transformers

In [None]:
# Seed handed to torch.manual_seed before generation starts.
SEED = 42
# Number of samples generated per topic.
ITERS = 10
# Inclusive bounds for the per-sample max_new_tokens value drawn with random.randint.
MIN_TOKENS = 45
MAX_TOKENS = 60
# Root output folder on the mounted Google Drive; each topic gets its own
# sub-directory containing a data.csv.
DATASET_PATH = '/content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2'
# Hugging Face Hub id of the checkpoint used for both the tokenizer and the model.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# Prompts fed to the model verbatim; each one is also used as its output directory name.
TOPICS = [
  "indian climate",
  "indian defense",
  "indian economy",
  "indian infrastructure",
]

In [None]:
# Load the entire model on GPU 0 (passed as device_map when the model is loaded)
device_map = {"": 0}

# Load the base model in 4-bit precision (BitsAndBytesConfig.load_in_4bit)
use_4bit = True

# Compute dtype for 4-bit base models, given as the name of a torch dtype;
# get_transformer resolves it with getattr(torch, ...)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [None]:
def get_transformer() -> tuple:
  """Load the Llama tokenizer and the quantized causal language model.

  Reads the module-level settings MODEL_NAME, device_map, use_4bit,
  bnb_4bit_quant_type, bnb_4bit_compute_dtype and use_nested_quant.

  Returns:
    A (tokenizer, model) pair.
  """
  llama_tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

  quantization = BitsAndBytesConfig(
      load_in_4bit=use_4bit,
      bnb_4bit_quant_type=bnb_4bit_quant_type,
      # The compute dtype is configured by name; resolve it to the torch dtype object.
      bnb_4bit_compute_dtype=getattr(torch, bnb_4bit_compute_dtype),
      bnb_4bit_use_double_quant=use_nested_quant,
  )

  # Base model, quantized on load and placed according to device_map.
  llama = LlamaForCausalLM.from_pretrained(
      MODEL_NAME,
      device_map=device_map,
      quantization_config=quantization,
  )
  llama.config.use_cache = False
  llama.config.pretraining_tp = 1

  return llama_tokenizer, llama

In [None]:
def get_encoding(tokenizer, prompt: str) -> dict:
  """Tokenize `prompt` and return the tokenizer's output as PyTorch tensors.

  Args:
    tokenizer: Callable tokenizer accepting a string and `return_tensors`.
    prompt: Text to encode.

  Returns:
    Whatever the tokenizer returns for `return_tensors='pt'`.
  """
  encoded = tokenizer(prompt, return_tensors='pt')
  return encoded

def get_text(tokenizer, embedding: dict) -> str:
  """Decode a batch of generated sequences and return the first one as text.

  Special tokens are skipped and tokenization spaces are cleaned up.

  Args:
    tokenizer: Object exposing `batch_decode`.
    embedding: Batch of sequences as accepted by `tokenizer.batch_decode`.

  Returns:
    The decoded text of the first sequence in the batch.
  """
  decode_options = {
    'skip_special_tokens': True,
    'clean_up_tokenization_spaces': True,
  }
  decoded = tokenizer.batch_decode(embedding, **decode_options)
  return decoded[0]

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; DATASET_PATH points inside this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def generate_text(tokenizer, model, prompt) -> None:
  """Sample ITERS continuations of `prompt` and log each one to the topic's CSV.

  Creates `<DATASET_PATH>/<prompt>/data.csv` (overwriting any previous file),
  then for every iteration draws a token budget in [MIN_TOKENS, MAX_TOKENS],
  samples a continuation and appends one row to the CSV.

  Args:
    tokenizer: Tokenizer used to encode the prompt and decode the output.
    model: Causal language model exposing `generate` and `device`.
    prompt: Prompt text; also used as the output sub-directory name.
  """
  create_directory(prompt)
  create_csv_file(prompt)

  input_embedding = get_encoding(tokenizer, prompt)

  # The tokenizer returns CPU tensors; move them next to the model so that
  # generate() does not run with inputs and weights on different devices.
  input_ids = input_embedding.input_ids.to(model.device)
  # Pass the attention mask explicitly instead of letting generate() guess it.
  attention_mask = input_embedding.get("attention_mask")
  if attention_mask is not None:
    attention_mask = attention_mask.to(model.device)

  for i in range(ITERS):
    max_len = random.randint(MIN_TOKENS, MAX_TOKENS)
    print(f'Generating text for "{prompt}"... ({i + 1}/{ITERS})')

    output_embedding = model.generate(
      input_ids,
      attention_mask=attention_mask,
      max_new_tokens=max_len,
      no_repeat_ngram_size=2,
      do_sample=True,
      top_k=40,
      top_p=0.95,
      temperature=1.2, # randomly chosen
      eos_token_id=tokenizer.eos_token_id
    )

    generated_text = get_text(tokenizer, output_embedding)
    append_to_csv_file(prompt, generated_text, i, max_len)

def create_directory(prompt: str) -> None:
  """Ensure the output directory `<DATASET_PATH>/<prompt>` exists.

  Args:
    prompt: Topic name used as the sub-directory name.
  """
  # A single call that tolerates an existing directory, so there is no window
  # between checking for the directory and creating it.
  os.makedirs(f'{DATASET_PATH}/{prompt}', exist_ok=True)

  print(f'Created directory for "{prompt}" at {DATASET_PATH}/{prompt}...')

def create_csv_file(prompt: str) -> None:
  """Create (or truncate) the topic's data.csv and write the header row.

  Args:
    prompt: Topic name; the file is `<DATASET_PATH>/<prompt>/data.csv`.
  """
  fields = ['Prompt', 'Generated Text', 'i', 'MAX_LEN']

  # newline='' lets the csv module control line endings itself, and UTF-8 is
  # fixed explicitly because the generated text is not limited to ASCII.
  with open(f'{DATASET_PATH}/{prompt}/data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fields, lineterminator='\n')
    writer.writeheader()

  print(f'Created CSV file for "{prompt}" at {DATASET_PATH}/{prompt}/data.csv...')

def append_to_csv_file(prompt: str, generated_text: str, i: int, MAX_LEN: int) -> None:
  """Append one generated sample to the topic's data.csv.

  Args:
    prompt: Topic name; the file is `<DATASET_PATH>/<prompt>/data.csv`.
    generated_text: Decoded model output (may contain newlines and non-ASCII text).
    i: Zero-based iteration index of the sample.
    MAX_LEN: Token budget that was used for this sample.
  """
  # newline='' keeps embedded newlines inside quoted fields intact, UTF-8 is
  # explicit for the non-ASCII output, and '\n' matches the terminator used
  # for the header row.
  with open(f'{DATASET_PATH}/{prompt}/data.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow([prompt, generated_text, i, MAX_LEN])

def main() -> None:
  """Load the model once, then run generate_text for every topic on its own thread.

  Blocks until all topic threads have finished.
  """
  tokenizer, model = get_transformer()

  # One worker per topic; all workers share the same tokenizer and model.
  workers = [
    Thread(target=generate_text, args=(tokenizer, model, topic))
    for topic in TOPICS
  ]

  for worker in workers:
    worker.start()

  for worker in workers:
    worker.join()

In [None]:
from threading import Thread
from time import perf_counter

import os
import csv
import random
import torch

# Seed both RNGs the generation loop draws from: torch for sampling, and
# `random`, which picks each sample's token budget in generate_text.
torch.manual_seed(SEED)
random.seed(SEED)

# NOTE(review): main() generates from several threads at once, so the order in
# which threads consume random numbers may still vary between runs.
start_time = perf_counter()
print("Starting...")
main()
print("Done!")
end_time = perf_counter()

print(f"Time Taken: {end_time - start_time:0.4f} second(s) elapsed")

Starting...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Created directory for "indian climate" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2/indian climate...
Created CSV file for "indian climate" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2/indian climate/data.csv...
Generating text for "indian climate"... (1/10)
Created directory for "indian defense" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2/indian defense...
Created directory for "indian economy" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2/indian economy...
Created directory for "indian infrastructure" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2/indian infrastructure...
Created CSV file for "indian defense" at /content/drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/data-llama2/indian defense/data.csv...
Generating text for "indian defense"... (1/10)
Created CSV file for "indian economy" at /content/driv



Generating text for "indian defense"... (2/10)
Generating text for "indian infrastructure"... (2/10)
Generating text for "indian economy"... (2/10)
Generating text for "indian climate"... (2/10)
Generating text for "indian economy"... (3/10)
Generating text for "indian defense"... (3/10)
Generating text for "indian infrastructure"... (3/10)
Generating text for "indian climate"... (3/10)
Generating text for "indian defense"... (4/10)
Generating text for "indian economy"... (4/10)
Generating text for "indian infrastructure"... (4/10)
Generating text for "indian climate"... (4/10)
Generating text for "indian climate"... (5/10)
Generating text for "indian defense"... (5/10)
Generating text for "indian economy"... (5/10)
Generating text for "indian infrastructure"... (5/10)
Generating text for "indian climate"... (6/10)
Generating text for "indian defense"... (6/10)
Generating text for "indian climate"... (7/10)
Generating text for "indian infrastructure"... (6/10)
Generating text for "indi

In [None]:
import pandas as pd

In [None]:
# Read each topic's generated samples into a DataFrame, keyed by topic name.
dfs = dict()

for topic in TOPICS:
  csv_path = f"{DATASET_PATH}/{topic}/data.csv"
  df = pd.read_csv(csv_path)
  dfs[topic] = df

In [None]:
# Inspect the samples generated for the "indian economy" prompt.
dfs['indian economy']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,indian economy,"indian economy\n Begriffe der Makroökonomie, i...",0,48
1,indian economy,indian economy at crossroads\n hopefully there...,1,47
2,indian economy,indian economy after demonetisation | demoniti...,2,53
3,indian economy,"indian economy after gst council in 1597, june...",3,49
4,indian economy,indian economy in hindi essays for class 6\n о...,4,59
5,indian economy,"indian economy, inflation, Inflation Rate\n ev...",5,54
6,indian economy,indian economy india economy\n everybody has t...,6,46
7,indian economy,"indian economy after demonetisation\n Einzeln,...",7,55
8,indian economy,indian economy: Govt working on policy to addr...,8,48
9,indian economy,"indian economy is a great place for investors,...",9,45


In [None]:
# Inspect the samples generated for the "indian climate" prompt.
dfs['indian climate']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,indian climate,indian climate and wildlife essay\n sierpien 5...,0,56
1,indian climate,indian climate class 7\n Begriffe für:\n1. Kl...,1,55
2,indian climate,indian climate in 21st century pdf\n Einzeln d...,2,60
3,indian climate,indian climate map of india,3,58
4,indian climate,indian climate for class 1\n everybody else i...,4,46
5,indian climate,"indian climate, rain & season wise, climate & ...",5,56
6,indian climate,indian climate\n Unterscheidung herrscht!«\n\n...,6,46
7,indian climate,indian climate pdf\n prüfungsfragen pdf zusamm...,7,47
8,indian climate,indian climate map\n сайт заключение\nКружок з...,8,54
9,indian climate,indian climate and its characteristics pdf\n E...,9,58


In [None]:
# Inspect the samples generated for the "indian defense" prompt.
dfs['indian defense']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,indian defense,indian defense ministry\n савезная\nзападная р...,0,55
1,indian defense,indian defense\n Einzeln und mehrteilig 16\n# ...,1,54
2,indian defense,indian defense: how will defense minister’s fi...,2,56
3,indian defense,indian defense news| defense updates| defence ...,3,55
4,indian defense,indian defense chief\n everybody can't get a h...,4,55
5,indian defense,indian defense\n живелонс ыныма\nTRAVELING MEN...,5,52
6,indian defense,indian defense blog forum\n geprüft на\nCruz i...,6,51
7,indian defense,indian defense minister narendra modi visit to...,7,47
8,indian defense,indian defense minister\nnahmoda\nAHMEDABAD: D...,8,45
9,indian defense,indian defense minister\n Hinweis: Um die Über...,9,54


In [None]:
# Inspect the samples generated for the "indian infrastructure" prompt.
dfs['indian infrastructure']

Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,indian infrastructure,indian infrastructure\n Hinweis\nWie ausführli...,0,48
1,indian infrastructure,"indian infrastructure\n sierp 23, 10:02\nIndia...",1,56
2,indian infrastructure,indian infrastructure\n everybody knows that t...,2,45
3,indian infrastructure,indian infrastructure sector analysis 2021\n o...,3,48
4,indian infrastructure,indian infrastructure | The World Economic...\...,4,46
5,indian infrastructure,indian infrastructure investment bank (i3b) ha...,5,45
6,indian infrastructure,indian infrastructure development\n styczwialn...,6,52
7,indian infrastructure,indian infrastructure: ADB to help upgrade 188...,7,49
8,indian infrastructure,indian infrastructure\nего имя — Олег\nHello. ...,8,52
9,indian infrastructure,indian infrastructure\n everybody is busy prep...,9,52


In [None]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Work with the "indian economy" samples; `texts` is the column of generated strings.
df_eco = dfs['indian economy']
texts = df_eco['Generated Text']

In [None]:
# Preview the generated texts selected above.
texts

0    indian economy\n Begriffe der Makroökonomie, i...
1    indian economy at crossroads\n hopefully there...
2    indian economy after demonetisation | demoniti...
3    indian economy after gst council in 1597, june...
4    indian economy in hindi essays for class 6\n о...
5    indian economy, inflation, Inflation Rate\n ev...
6    indian economy india economy\n everybody has t...
7    indian economy after demonetisation\n Einzeln,...
8    indian economy: Govt working on policy to addr...
9    indian economy is a great place for investors,...
Name: Generated Text, dtype: object

In [None]:
# Download the NLTK data packages used by this notebook: the punkt tokenizer
# models, the stopwords corpus and the averaged-perceptron POS tagger.
# The return value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Preview of the text-processing pipeline on a single sample (row 8): split it
# into sentences, POS-tag each sentence, then keep lowercased alphabetic
# tokens that are not English stopwords.

# Build the stopword set once; the lookup is then O(1) per token instead of
# re-reading the stopword list for every token.
english_stopwords = set(stopwords.words('english'))

for text in texts[8:]:
  print('text:', text)
  sentences = sent_tokenize(text)
  for sentence in sentences:
    print('------')
    print('sentence:', sentence)
    words = word_tokenize(sentence)
    print('pos tags:', nltk.pos_tag(words))
    # Compare the lowercased token: the stopword list is lowercase, so a
    # capitalised stopword such as "The" would otherwise slip through.
    words = [
      word.lower()
      for word in words
      if word.isalpha() and word.lower() not in english_stopwords
    ]
    print('processed:', words)
  # Only the first selected sample is previewed.
  break

text: indian economy: Govt working on policy to address issue of ‘import surge’ by Chinese companies to sidestep import duty hike: Minister
 живело. अनुपस्थितकर. क�
------
sentence: indian economy: Govt working on policy to address issue of ‘import surge’ by Chinese companies to sidestep import duty hike: Minister
 живело.
pos tags: [('indian', 'JJ'), ('economy', 'NN'), (':', ':'), ('Govt', 'NNP'), ('working', 'VBG'), ('on', 'IN'), ('policy', 'NN'), ('to', 'TO'), ('address', 'VB'), ('issue', 'NN'), ('of', 'IN'), ('‘', 'NNP'), ('import', 'NN'), ('surge', 'NN'), ('’', 'NN'), ('by', 'IN'), ('Chinese', 'JJ'), ('companies', 'NNS'), ('to', 'TO'), ('sidestep', 'VB'), ('import', 'NN'), ('duty', 'NN'), ('hike', 'NN'), (':', ':'), ('Minister', 'NNP'), ('живело', 'NNP'), ('.', '.')]
processed: ['indian', 'economy', 'govt', 'working', 'policy', 'address', 'issue', 'import', 'surge', 'chinese', 'companies', 'sidestep', 'import', 'duty', 'hike', 'minister', 'живело']
------
sentence: अनुपस्थितकर.
po