In [1]:
!pip install transformers




In [2]:
import os
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments




In [3]:
from transformers import LineByLineTextDataset

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [6]:
# Define paths
json_dir = "data/json_file"
text_dir = "data/text_files"
preprocessed_output_dir = "data/preprocessed_output"

def preprocess_text(text):
    # Tokenize the text
    tokenized_text = tokenizer.tokenize(text)

    # Get the tokenizer vocabulary
    vocab = tokenizer.get_vocab()

    # Replace unknown words with <unk>
    tokenized_text = [token if token in vocab else tokenizer.unk_token for token in tokenized_text]

    # Remove special characters and symbols
    cleaned_text = [token for token in tokenized_text if token.isalnum() or token in ["!", ".", "?"]]

    # Convert tokens to lowercase
    cleaned_text_lower = [token.lower() for token in cleaned_text]

    # Join tokens into a single string with spaces
    preprocessed_text = " ".join(cleaned_text_lower)
    
    return preprocessed_text
    
# Function to preprocess JSON files
def preprocess_json_file(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
        questions_answers = data.get("questions_answers", [])
        preprocessed_texts = []
        for qa in questions_answers:
            question = qa.get("question", "")
            answer = qa.get("answer", "")
            preprocessed_question = preprocess_text(question)
            preprocessed_answer = preprocess_text(answer)
            preprocessed_text = f"{preprocessed_question} {preprocessed_answer}"
            preprocessed_texts.append(preprocessed_text)
        return preprocessed_texts

# Function to preprocess text files
def preprocess_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
        preprocessed_text = preprocess_text(text)
        return preprocessed_text

# Preprocess JSON files
preprocessed_json_files = []
for file_name in os.listdir(json_dir):
    if file_name.endswith(".json"):
        file_path = os.path.join(json_dir, file_name)
        preprocessed_texts = preprocess_json_file(file_path)
        preprocessed_json_files.extend(preprocessed_texts)

# Preprocess text files
preprocessed_text_files = []
for file_name in os.listdir(text_dir):
    if file_name.endswith(".txt"):  # Assuming all files are text files
        file_path = os.path.join(text_dir, file_name)
        preprocessed_text = preprocess_text_file(file_path)
        preprocessed_text_files.append(preprocessed_text)


In [8]:
preprocessed_json_files


['what ġled ġthe ġspeaker ġto ġengage ġin ġnegative ġbehaviors ġlike ġgambling ġand ġavoidance ? the ġspeaker ġinitially ġinterpreted ġthe ġphrase everything ġis ġokay ġas ġjustification ġfor ġnegative ġbehaviors .',
 'how ġdid ġthe ġspeaker ġinterpretation ġof everything ġis ġokay ġchange ġover ġtime ? the ġspeaker ġeventually ġrealized ġthat everything ġis ġokay ġshould ġbe ġinterpreted ġas ġbeing ġaware ġof ġone ġactions ġand ġtheir ġconsequences ġrather ġthan ġjustifying ġnegative ġbehaviors .',
 'who ġis ġcredited ġwith ġawakening ġindividuals ġto ġthe ġdual ities ġof ġlife ġin ġthe ġtext ? s ri ġr ishi ġpr ab h aka ġis ġcredited ġwith ġawakening ġindividuals ġto ġthe ġdual ities ġof ġlife ġand ġhelping ġthem ġexperience ġa ġdeeper ġunderstanding ġof ġexistence .',
 'what ġrole ġdoes v ive ka ġplay ġin ġthe ġspeaker ġjourney ġdescribed ġin ġthe ġtext ? v ive ka ġplays ġa ġcrucial ġrole ġin ġthe ġspeaker ġtransformation ġby ġenabling ġthem ġto ġdiscern ġbetween ġright ġand ġwrong ġ

In [9]:
preprocessed_text_files

['this ġ ġwhat ġ ġwas ġalways ġfinding . ġthat ġreally ġ ġdon ġhave ġany ġobligation ġthey ġdon ġhave ġany ġobligation . ġand ġ ġknew ġthat ġright . ġ ġyou ġjust ġleave ġwith ġ ġvery ġclosely ġyou ġjust ġleave ġwith ġ ġvery ġvery ġclosely . ġyou ġhave ġ ġ ġvery ġclosely ġwith ġthen ġyou ġwill ġnotice . ġ ġleave ġthis ġstate . ġwhat ġone ġhas ġ ġlearn ġ ġhow ġ ġ ġletting ġ ġfree . ġ ġyou ġtry ġ ġcontrol ġthis ġwill ġ ġvery ġfantastic . ġguru ji ġ ġnot ġcontrolling ġ ġ ġwill ġcontrol ġguru ji . ġthat ġwill ġnot ġhappen . ġhow ġ ġdominate ġ ġcannot ġhappen . ġbecause ġguru ji ġcannot ġ ġcaught ġlike ġthat . ġjust ġlike ġg hand i ġsaid ġyou ġknow ġyou ġcan ġhave ġ ġbody ġ ġnot ġ ġbody . ġ ġ ġlike ġthat . ġwhen ġthe ġenglish ġwere ġparticipating ġthey ġwere ġbeating ġ ġsaid ġ ġ ġprepared ġ ġdie . ġyou ġwill ġhave ġ ġdead ġbody ġbut ġnot ġ ġbody . ġthis ġrequires ġfantastic ġstrength . ġ ġlike ġthat ġguru ji ġcannot ġ ġcontrolled . ġbut ġeverybody ġtries ġ ġcontrol . ġbecause ġthey ġare ġuse

In [10]:
def fix_spaces(text):
    return text.replace('ġ', ' ')

# Apply the fix to each item in your preprocessed list
fixed_preprocessed_texts = [fix_spaces(text) for text in preprocessed_json_files]

In [11]:
for text in fixed_preprocessed_texts:
    print(text)

what  led  the  speaker  to  engage  in  negative  behaviors  like  gambling  and  avoidance ? the  speaker  initially  interpreted  the  phrase everything  is  okay  as  justification  for  negative  behaviors .
how  did  the  speaker  interpretation  of everything  is  okay  change  over  time ? the  speaker  eventually  realized  that everything  is  okay  should  be  interpreted  as  being  aware  of  one  actions  and  their  consequences  rather  than  justifying  negative  behaviors .
who  is  credited  with  awakening  individuals  to  the  dual ities  of  life  in  the  text ? s ri  r ishi  pr ab h aka  is  credited  with  awakening  individuals  to  the  dual ities  of  life  and  helping  them  experience  a  deeper  understanding  of  existence .
what  role  does v ive ka  play  in  the  speaker  journey  described  in  the  text ? v ive ka  plays  a  crucial  role  in  the  speaker  transformation  by  enabling  them  to  discern  between  right  and  wrong  actions  and  

In [12]:
# Apply the fix to each item in your preprocessed list
fixed_preprocessed_texts += [fix_spaces(text) for text in preprocessed_text_files]

In [13]:
fixed_preprocessed_texts

['what  led  the  speaker  to  engage  in  negative  behaviors  like  gambling  and  avoidance ? the  speaker  initially  interpreted  the  phrase everything  is  okay  as  justification  for  negative  behaviors .',
 'how  did  the  speaker  interpretation  of everything  is  okay  change  over  time ? the  speaker  eventually  realized  that everything  is  okay  should  be  interpreted  as  being  aware  of  one  actions  and  their  consequences  rather  than  justifying  negative  behaviors .',
 'who  is  credited  with  awakening  individuals  to  the  dual ities  of  life  in  the  text ? s ri  r ishi  pr ab h aka  is  credited  with  awakening  individuals  to  the  dual ities  of  life  and  helping  them  experience  a  deeper  understanding  of  existence .',
 'what  role  does v ive ka  play  in  the  speaker  journey  described  in  the  text ? v ive ka  plays  a  crucial  role  in  the  speaker  transformation  by  enabling  them  to  discern  between  right  and  wrong  

In [14]:
# Define the path to the corpus.txt file
corpus_file_path = 'corpus.txt'

# Write the preprocessed texts to the corpus.txt file
with open(corpus_file_path, 'w', encoding='utf-8') as file:
    for text in fixed_preprocessed_texts:
        file.write(text + '\n')

print(f"Corpus saved to {corpus_file_path}")

Corpus saved to corpus.txt


In [15]:
with open(corpus_file_path, 'r', encoding='utf-8') as file:
    corpus_content = file.read()

print(corpus_content)

what  led  the  speaker  to  engage  in  negative  behaviors  like  gambling  and  avoidance ? the  speaker  initially  interpreted  the  phrase everything  is  okay  as  justification  for  negative  behaviors .
how  did  the  speaker  interpretation  of everything  is  okay  change  over  time ? the  speaker  eventually  realized  that everything  is  okay  should  be  interpreted  as  being  aware  of  one  actions  and  their  consequences  rather  than  justifying  negative  behaviors .
who  is  credited  with  awakening  individuals  to  the  dual ities  of  life  in  the  text ? s ri  r ishi  pr ab h aka  is  credited  with  awakening  individuals  to  the  dual ities  of  life  and  helping  them  experience  a  deeper  understanding  of  existence .
what  role  does v ive ka  play  in  the  speaker  journey  described  in  the  text ? v ive ka  plays  a  crucial  role  in  the  speaker  transformation  by  enabling  them  to  discern  between  right  and  wrong  actions  and  

In [17]:
pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
     ---------------------------------------- 0.0/137.6 kB ? eta -:--:--
     -- ------------------------------------- 10.2/137.6 kB ? eta -:--:--
     ----------------------- --------------- 81.9/137.6 kB 1.2 MB/s eta 0:00:01
     -------------------------------------- 137.6/137.6 kB 1.4 MB/s eta 0:00:00
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
   ---------------------------------------- 0.0/9.0 MB ? eta -:--:--
   - -------------------------------------- 0.4/9.0 MB 7.6 MB/s eta 0:00:02
   ---- ----------------------------------- 1.0/9.0 MB 12.9 MB/s eta 0:00:01
   ------ --------------------------------- 1.5/9.0 MB 13.2 MB/s eta 0:00:01
   -------- ------------------------------- 1.9/9.0 MB 11.1 MB/s eta 0:00:01
   -------- -------------------------

  You can safely remove it manually.


In [20]:
pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
   ---------------------------------------- 0.0/542.0 kB ? eta -:--:--
   ---- ----------------------------------- 61.4/542.0 kB 1.6 MB/s eta 0:00:01
   ------------------ --------------------- 256.0/542.0 kB 3.9 MB/s eta 0:00:01
   -------------------------------- ------- 440.3/542.0 kB 3.9 MB/s eta 0:00:01
   ---------------------------------------- 542.0/542.0 kB 4.3 MB/s eta 0:00:00
Downloading multiprocess-0.70.16-py311-

In [31]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load dataset
dataset = load_dataset('text', data_files={'train': 'corpus.txt'}, split='train')

# Define padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        padding='max_length', 
        truncation=True, 
        max_length=512,
        return_tensors='pt'  # Ensure output is PyTorch tensors
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False,
)
# Check available splits in the tokenized dataset
print(tokenized_datasets.info.splits)


{'train': SplitInfo(name='train', num_bytes=657622, num_examples=133, shard_lengths=None, dataset_name='text')}


In [37]:
from transformers.trainer_utils import AcceleratorConfig, Accelerator
# Use the correct split name
train_dataset = tokenized_datasets["text"]

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-training",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Create AcceleratorConfig
accelerator_config = AcceleratorConfig(use_seedable_sampler=False)

# Initialize Accelerator
accelerator = Accelerator(config=accelerator_config)

# Prepare the trainer for distributed training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Use the accelerator to prepare the trainer
trainer = accelerator.prepare(trainer)

# Train the model
trainer.train()

# Save model and tokenizer
model.save_pretrained("./gpt2-trained")
tokenizer.save_pretrained("./gpt2-trained")

ImportError: cannot import name 'AcceleratorConfig' from 'transformers.trainer_utils' (C:\Users\Admin\anaconda3\Lib\site-packages\transformers\trainer_utils.py)

In [30]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
   ---------------------------------------- 0.0/297.6 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/297.6 kB 660.6 kB/s eta 0:00:01
   --------------------------- ------------ 204.8/297.6 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 297.6/297.6 kB 3.1 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.29.3


In [32]:
pip install accelerate==0.21.0

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl.metadata (17 kB)
Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
   ---------------------------------------- 0.0/244.2 kB ? eta -:--:--
   ---------------------------------------- 244.2/244.2 kB 7.3 MB/s eta 0:00:00
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.29.3
    Uninstalling accelerate-0.29.3:
      Successfully uninstalled accelerate-0.29.3
Successfully installed accelerate-0.21.0
Note: you may need to restart the kernel to use updated packages.


In [34]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.
