In [None]:
! pip install -U accelerate
! pip install transformers[torch]

In [2]:
import accelerate
accelerate.__version__

'0.25.0'

In [3]:
from transformers import AutoTokenizer, AutoModel

## Data

In [None]:
# Copy the ILDC_multi dataset directory from the Drive folder into the
# current working directory so it can be read with a short relative path.
!cp -r drive/MyDrive/shared_content/ILDC_Data/ILDC_multi ./

In [None]:
import numpy as np
import pandas as pd
import torch

In [None]:
# with pd.read_csv(filename, chunksize=chunksize) as reader:
#     for chunk in reader:
#         process(chunk)

In [None]:
# Chunk size intended for chunked CSV reading (see the commented-out
# `pd.read_csv(..., chunksize=chunksize)` sketch).
# NOTE(review): no visible cell actually uses this value.
chunksize = 10**6

In [None]:
# Load the whole ILDC_multi CSV into a single DataFrame (no chunking).
data = pd.read_csv('./ILDC_multi/ILDC_multi.csv')

In [None]:
# Print pandas' summary of the loaded DataFrame.
data.info()

In [None]:
# Sanity check: print the 'text' value of the row labelled 0.
print(data['text'][0])

 Uday Umesh Lalit, J. These appeals arise out of the Judgment and Order dated 09.12.2015 passed by the Division Bench of the High Court 1 dismissing S.T. No.192 of 2014 and other companynected matters and thereby affirming the decision of the Single Judge of the High Court passed on 12.03.2014 in Writ Petition No.20650  W  of 2013 which in turn had found Sections 8, 10, 11 and 12 of the West Bengal Madrasah Service Commission Act, 2008  the Commission Act, for short  to be ultra vires. 1 The High Court of Judicature at Calcutta Civil Appeal No.5808 of 2017 SK. MD. Rafique vs. Managing Committee, companytai Rahamania High Madrasah and Others The aforementioned Writ Petition No.20650 W  of 2013 was filed by the Managing Committee of Contai Rahmania High Madrasah challenging validity of Sections 8, 10, 11 and 12 of the Commission Act submitting, inter alia, that by virtue of the provisions of the Commission Act, the process of appointment of teachers in an aided Madrasah, which was recogn

In [None]:
## Write Data to file

In [None]:
# Flatten the corpus into newline-separated text: one string holding every
# document and one holding only the first 2000 documents.
_all_texts = data['text'].tolist()
datatxtfull = "\n".join(_all_texts)
datatxtsample = "\n".join(_all_texts[:2000])
del _all_texts  # temporary list; keep the notebook namespace unchanged

In [None]:
# Character counts of the full and the sample text strings.
len(datatxtfull), len(datatxtsample)

(666469234, 51595056)

In [None]:
# Write the full corpus to disk. Mode 'w' truncates any existing file, so the
# cell can be re-run without a separate shell `rm`, and the encoding is set
# explicitly instead of depending on the platform default.
with open('text_data_full.txt', 'w', encoding='utf-8') as f:
  f.write(datatxtfull)

In [None]:
# Write the 2000-document sample to disk. Mode 'w' truncates any existing
# file, so the cell can be re-run without a separate shell `rm`, and the
# encoding is set explicitly instead of depending on the platform default.
with open('text_data_sample.txt', 'w', encoding='utf-8') as f:
  f.write(datatxtsample)

In [None]:
# Read each written file back and print its length in characters so it can be
# compared with the in-memory string lengths.
for written_path in ('text_data_full.txt', 'text_data_sample.txt'):
  with open(written_path, 'r') as f:
    print(len(f.read()))

666469234
51595056


In [None]:
## Back up both generated text files to the top level of the Drive folder.

!cp text_data_full.txt drive/MyDrive/
!cp text_data_sample.txt drive/MyDrive/

In [None]:
## Copy both previously saved text files from the Drive folder into the
## current working directory.

!cp drive/MyDrive/text_data_sample.txt ./
!cp drive/MyDrive/text_data_full.txt ./

## Model

In [None]:
import torch
import torch.nn as nn

In [None]:
# Delete any local `full_text` and `sample_text_model_ipc` directories.
# `-f` keeps the command quiet when a directory does not exist.
!rm -rf full_text
!rm -rf sample_text_model_ipc

In [None]:
## Copy the previously saved `full_text` model directory from the Drive
## folder into the current working directory.
!cp -r drive/MyDrive/full_text ./

### GPT2 pretrained

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a `TextDataset` for language-model training.

    Args:
        file_path: Path of the plain-text training file.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Block size forwarded to `TextDataset` (default 128).

    Returns:
        The constructed `TextDataset` instance.
    """
    return TextDataset(
        file_path = file_path,
        tokenizer = tokenizer,
        block_size = block_size,
    )

In [None]:
def load_data_collator(tokenizer, mlm = False):
    """Create the batch collator used for language-model training.

    Args:
        tokenizer: Tokenizer handed to `DataCollatorForLanguageModeling`.
        mlm: Value forwarded as the collator's `mlm` flag (default False).

    Returns:
        The constructed `DataCollatorForLanguageModeling` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [None]:
def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  """Fine-tune a pretrained GPT-2 language model on a plain-text file.

  Args:
      train_file_path: Path of the text file used to build the training set.
      model_name: Name or path given to `from_pretrained` for both the
          tokenizer and the model.
      output_dir: Directory that receives the tokenizer, the initial model
          weights and the final trained model.
      overwrite_output_dir: Forwarded to `TrainingArguments`.
      per_device_train_batch_size: Forwarded to `TrainingArguments`.
      num_train_epochs: Forwarded to `TrainingArguments`.
      save_steps: Forwarded to `TrainingArguments`. Earlier versions of this
          function accepted the value but never passed it on, so the
          library default was used regardless of what the caller asked for.

  Returns:
      None. The trained model is written to `output_dir`.
  """
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  # Save the tokenizer files next to the model so `output_dir` can later be
  # loaded as a self-contained directory.
  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  # Snapshot the pretrained weights before any training step runs.
  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_steps=save_steps,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  # To continue an interrupted run instead of starting over, use:
  # trainer.train(resume_from_checkpoint = True)
  trainer.train()
  trainer.save_model()

In [None]:
# Settings for the fine-tuning run; every value below is passed to train().
# NOTE(review): no visible cell creates or copies "ipcformat.txt" — confirm
# where this file comes from before running on a fresh kernel.
train_file_path = "ipcformat.txt"
model_name = 'gpt2'  # pretrained checkpoint to start from
output_dir = './sample_text_model_ipc'  # where tokenizer and model are saved
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 70.0
save_steps = 50000


In [None]:
# Run the fine-tuning with the settings defined in the configuration cell.
# The table recorded below the cell is the training loss logged per step.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,1.8119
1000,1.068
1500,0.6255
2000,0.3739
2500,0.2405
3000,0.1701
3500,0.1309
4000,0.1069
4500,0.0919
5000,0.0826


In [None]:
# Copy each file of the local `full_text` model directory (config, tokenizer
# files, weights, training args) into `full_text/` on Google Drive.
# NOTE(review): presumably the destination directory must already exist on
# Drive for these copies to succeed — confirm before running.
!cp -r ./full_text/config.json ./drive/MyDrive/full_text/
!cp -r ./full_text/generation_config.json ./drive/MyDrive/full_text/
!cp -r ./full_text/merges.txt ./drive/MyDrive/full_text/
!cp -r ./full_text/model.safetensors ./drive/MyDrive/full_text/
!cp -r ./full_text/special_tokens_map.json ./drive/MyDrive/full_text/
!cp -r ./full_text/tokenizer_config.json ./drive/MyDrive/full_text/
!cp -r ./full_text/training_args.bin ./drive/MyDrive/full_text/
!cp -r ./full_text/vocab.json ./drive/MyDrive/full_text/

In [None]:
# Copy each file of the local `sample_text_model_ipc` model directory into
# `sample_text_model_ipc/` on the Drive folder.
# NOTE(review): presumably the destination directory must already exist on
# Drive for these copies to succeed — confirm before running.
!cp -r ./sample_text_model_ipc/config.json ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/generation_config.json ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/merges.txt ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/model.safetensors ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/special_tokens_map.json ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/tokenizer_config.json ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/training_args.bin ./drive/MyDrive/sample_text_model_ipc/
!cp -r ./sample_text_model_ipc/vocab.json ./drive/MyDrive/sample_text_model_ipc/

### Predict

In [4]:
# Copy both saved model directories from the Drive folder into the current
# working directory so they can be loaded by local path.
!cp -r drive/MyDrive/full_text ./
!cp -r drive/MyDrive/sample_text_model_ipc ./

In [5]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [6]:
def load_model(model_path):
    """Return the `GPT2LMHeadModel` loaded from `model_path`."""
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    """Return the `GPT2Tokenizer` loaded from `tokenizer_path`."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Sample continuations of a prompt from a saved GPT-2 model.

    The prompt is extended with the "<|endoftext|>" marker, encoded, and
    passed to `model.generate` with sampling enabled (top-k 50, top-p 0.95),
    requesting 3 sequences.

    Args:
        model_path: Directory from which both the model and the tokenizer
            are loaded.
        sequence: Prompt text.
        max_length: Forwarded to `generate` as `max_length`.

    Returns:
        A list with one decoded string per generated sequence, each with its
        first `len(sequence) + 1` characters removed.
    """
    # Characters to drop from every decoded sample: the prompt plus one more.
    prefix_len = len(sequence) + 1
    prompt = sequence + "<|endoftext|>"

    lm = load_model(model_path)
    tok = load_tokenizer(model_path)
    input_ids = tok.encode(prompt, return_tensors='pt')

    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=lm.config.eos_token_id,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3,
    )
    generated = lm.generate(input_ids, **sampling_options)

    completions = []
    for token_ids in generated:
        decoded = tok.decode(token_ids, skip_special_tokens=True)
        completions.append(decoded[prefix_len:])
    return completions


In [7]:
# Sample completions for one question from the model stored in
# ./sample_text_model_ipc, limited to max_len.
model1_path = "./sample_text_model_ipc"
sequence1 = "what is the punishment for murder"
max_len = 50
out = generate_text(model1_path, sequence1, max_len)
# Put the question first so index 0 is the query and the rest are candidates.
sentences = [sequence1] + out
sentences

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


['what is the punishment for murder',
 'Section 109:- Punishment of murder\nWhoever commits murder shall be punished with death, or imprisonment for life, and shall also be liable to fine.\n',
 'Section 109:- Punishment of murder\nIn every case, of murder, the offender shall be punished with death.\n',
 'Section 109:- Punishment of murder\nIn every case in which culpable homicide is murder, the offender shall be punished with death.\nIn every case in which culpable homicide is murder, the offender shall']

In [None]:
! pip install -U sentence-transformers

In [None]:
# Load the tokenizer published as law-ai/InLegalBERT.
# NOTE(review): no later visible cell references `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")

tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Load the law-ai/InLegalBERT model.
# NOTE(review): the next code cell rebinds `model` to a SentenceTransformer,
# so this object is not used by any visible cell.
model = AutoModel.from_pretrained("law-ai/InLegalBERT")

config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

In [None]:
from sentence_transformers import SentenceTransformer
# Wrap law-ai/InLegalBERT as a sentence encoder (rebinds `model`).
model = SentenceTransformer('law-ai/InLegalBERT')
# Embed the question and its generated answers, one row per sentence.
sen_embeddings = model.encode(sentences)
# The recorded run shows shape (4, 768).
sen_embeddings.shape

.gitattributes:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]



(4, 768)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the question embedding (row 0) and the embedding
# of every generated answer (rows 1 onward).
similarity = cosine_similarity(
    [sen_embeddings[0]],
    sen_embeddings[1:]
)

In [None]:
# Display the similarity scores, one column per generated answer.
similarity

array([[0.6506703 , 0.6506703 , 0.64345187]], dtype=float32)

In [None]:
# Print the generated answer with the highest similarity to the question.
# The +1 skips the question itself, which sits at index 0 of `sentences`.
print(sentences[np.argmax(similarity) + 1])

Section 109:- Punishment of murder
Whoever commits murder shall be punished with death, or imprisonment for life, and shall also be liable to fine.

