In [40]:
!pip install transformers scikit-learn torch




In [41]:
!pip install datasets



KeyboardInterrupt: 

In [None]:
!pip install -U transformers


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
import torch
import transformers
import openai
from tqdm import tqdm

In [None]:
print(transformers.__version__)

In [None]:

dataset = pd.read_csv("/content/allQuestions.csv")
print(dataset.head())
print(dataset['act'].value_counts())

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["question"], dataset["act"], test_size=0.2, stratify=dataset["act"], random_state=42
)

## USING LOGISTIC REGRESSION

In [None]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

clf = LogisticRegression(max_iter=200)
clf.fit(X_train, train_labels)

preds = clf.predict(X_test)
print(classification_report(test_labels, preds))

## TWEAKING THE MODEL TO HANDLE IMBALANCE

In [None]:
X = dataset["question"]
y = dataset["act"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
clf = LogisticRegression(
    max_iter=300,
    class_weight="balanced",   # <--- key tweak
    solver="liblinear",        # good for small/medium datasets
    multi_class="ovr"  )
clf.fit(X_train_tfidf, y_train)

In [None]:
y_pred = clf.predict(X_test_tfidf)

# Report
print(classification_report(y_test, y_pred))

### CONFUSION MATRIX

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)

# Plot heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=clf.classes_,
    yticklabels=clf.classes_
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Legal Query Classifier")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## USING RoBERTa

In [None]:
Set, test_df = train_test_split(dataset, test_size=0.2, stratify=dataset["act"], random_state=42)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(Set)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
labels = dataset["act"].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def preprocess(batch):
    return tokenizer(batch["question"], truncation=True, padding="max_length", max_length=128)

In [None]:
train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

train_dataset = train_dataset.rename_column("act", "labels")
train_dataset = train_dataset.map(lambda x: {"labels": label2id[x["labels"]]})
test_dataset = test_dataset.rename_column("act", "labels")
test_dataset = test_dataset.map(lambda x: {"labels": label2id[x["labels"]]})

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)


In [None]:
def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted")
    }


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
results = trainer.evaluate()
print(results)


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids

print(classification_report(y_true, y_pred, target_names=list(label2id.keys())))
print(confusion_matrix(y_true, y_pred))


In [None]:

output_dir = "./roberta_classifier"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./roberta_classifier /content/drive/MyDrive/Uhaki_Models/


In [None]:
from transformers import pipeline

clf = pipeline("text-classification", model="./roberta_classifier", tokenizer="./roberta_classifier")
print(clf("Can my landlord evict me without notice?"))


In [None]:
text = "What is the meaning of superior landlord?"
prediction = clf(text)

print(prediction)


### USING GPT FOR DATA AUGMENTATION

In [None]:
#Set = pd.read_csv("/content/allQuestions.csv")

In [None]:
#Set.head()

In [None]:
#low_support_threshold = 15
#class_counts = Set['act'].value_counts()
#low_support_classes = class_counts[class_counts <= low_support_threshold].index.tolist()
#print("Low-support classes:", low_support_classes)


In [None]:
"""def gpt_paraphrase(query, n=3, model="gpt-3.5-turbo"):
    """
   # Generate n paraphrases for a legal query using the new OpenAI Python API.
    """
    #messages = [
        {"role": "system", "content": "You are a legal language assistant."},
        {"role": "user", "content": f"Paraphrase the following legal question {n} times, keeping the meaning exactly the same and preserving all legal terminology. Return each paraphrase on a separate line.\n\nQuestion: \"{query}\""}
    ]

    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.7,
        max_tokens=200
    )

    text = response.choices[0].message.content
    paraphrases = [line.strip() for line in text.split("\n") if line.strip()]
    return paraphrases[:n]"""


In [None]:
"""augmented_rows = []

for act in tqdm(low_support_classes, desc="Augmenting low-support classes"):
    subset = Set[Set['act'] == act]
    for _, row in subset.iterrows():
        original_query = row['question']
        paraphrases = gpt_paraphrase(original_query, n=5)
        for p in paraphrases:
            augmented_rows.append({'question': p, 'act': act})"""


In [None]:
"""augmented_df = pd.DataFrame(augmented_rows)
Set_augmented = pd.concat([Set, augmented_df], ignore_index=True)
Set_augmented = Set_augmented.sample(frac=1).reset_index(drop=True)

print(f"Original samples: {len(Set)}, After augmentation: {len(Set_augmented)}")"""


In [None]:
#Set_augmented.to_csv("allQuestions_augmented.csv", index=False)


In [None]:
#class_counts = Set_augmented['act'].value_counts()
#print(class_counts)

###TRAINING WITH AUGMENTED DATASET

In [42]:
dataset=pd.read_csv("/content/allQuestions_augmented.csv")

In [43]:
train_df, test_df = train_test_split(
    dataset,
    test_size=0.2,
    stratify=dataset["act"],
    random_state=42
)

In [44]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

labels = dataset["act"].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}


In [45]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def preprocess(batch):
    return tokenizer(batch["question"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

Map:   0%|          | 0/724 [00:00<?, ? examples/s]

In [46]:
train_dataset = train_dataset.rename_column("act", "labels")
train_dataset = train_dataset.map(lambda x: {"labels": label2id[x["labels"]]})
test_dataset = test_dataset.rename_column("act", "labels")
test_dataset = test_dataset.map(lambda x: {"labels": label2id[x["labels"]]})

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

Map:   0%|          | 0/724 [00:00<?, ? examples/s]

In [47]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",             # save model every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
    # remove load_best_model_at_end to avoid transformers 4.5 issue
)




In [49]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted")
    }


In [52]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics

)

trainer.train()


  trainer = Trainer(


Step,Training Loss
50,1.5354
100,1.1592
150,1.2308
200,1.0779
250,0.897
300,0.848
350,0.8482
400,0.6504
450,0.6561
500,0.6196


TrainOutput(global_step=1810, training_loss=0.45343121072864007, metrics={'train_runtime': 1213.5433, 'train_samples_per_second': 23.848, 'train_steps_per_second': 1.492, 'total_flos': 1903881953249280.0, 'train_loss': 0.45343121072864007, 'epoch': 10.0})

In [53]:
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids



print(classification_report(y_true, y_pred, target_names=list(label2id.keys())))
print(confusion_matrix(y_true, y_pred))

                               precision    recall  f1-score   support

     Work Injury Benefits Act       0.83      0.71      0.77        28
               Income Tax Act       0.77      0.79      0.78       104
                 Evidence Act       0.79      0.80      0.80        51
           Tax Procedures Act       0.66      0.76      0.71        55
          Law of Contract Act       1.00      1.00      1.00        11
                 Marriage Act       0.76      0.67      0.71        24
         Labour Relations Act       0.63      0.73      0.68        30
        Law of Succession Act       0.89      0.72      0.80        47
       Small Claims Court Act       0.76      0.87      0.81        15
Persons with Disabilities Act       0.70      0.70      0.70        33
        Constitution of Kenya       0.83      0.83      0.83        98
               Employment Act       0.62      0.68      0.65        37
                     Land Act       0.80      0.82      0.81        78
     

In [54]:

save_path = "./roberta_classifier_augmented"

trainer.save_model(save_path)

tokenizer.save_pretrained(save_path)


('./roberta_classifier_augmented/tokenizer_config.json',
 './roberta_classifier_augmented/special_tokens_map.json',
 './roberta_classifier_augmented/vocab.json',
 './roberta_classifier_augmented/merges.txt',
 './roberta_classifier_augmented/added_tokens.json')

In [55]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./roberta_classifier_augmented /content/drive/MyDrive/Uhaki_Models/roberta_classifier_augmented

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
from transformers import pipeline

# Load your trained classifier into a pipeline
clf = pipeline(
    "text-classification",
    model="./roberta_classifier_augmented",
    tokenizer="./roberta_classifier_augmented"
)

# Run a sample query
text = "What is the meaning of superior landlord?"
prediction = clf(text)

print(prediction)


Device set to use cuda:0


[{'label': 'Distress for Rent Act', 'score': 0.7742564678192139}]


In [57]:
from transformers import pipeline

# Load your trained classifier into a pipeline
clf = pipeline(
    "text-classification",
    model="./roberta_classifier_augmented",
    tokenizer="./roberta_classifier_augmented"
)

# Run a sample query
text =  "How is a marriage legally dissolved?"
prediction = clf(text)

print(prediction)

Device set to use cuda:0


[{'label': 'Marriage Act', 'score': 0.9908789992332458}]


## USING LEGAL BERT

In [58]:
from transformers import BertTokenizer, BertForSequenceClassification

model_name = "nlpaueb/legal-bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
df = pd.read_csv("/content/allQuestions_augmented.csv")

In [60]:
labels = df['act'].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['act'], random_state=42)


In [61]:

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [62]:
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def preprocess(batch):
    return tokenizer(batch["question"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)


Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Map:   0%|          | 0/724 [00:00<?, ? examples/s]

In [63]:
train_dataset = train_dataset.rename_column("act", "labels")
train_dataset = train_dataset.map(lambda x: {"labels": label2id[x["labels"]]})
test_dataset = test_dataset.rename_column("act", "labels")
test_dataset = test_dataset.map(lambda x: {"labels": label2id[x["labels"]]})

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

Map:   0%|          | 0/724 [00:00<?, ? examples/s]

In [64]:
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)


In [66]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted")
    }


In [67]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
50,2.8091
100,2.6678
150,2.2708
200,1.7772
250,1.5114
300,1.3141
350,1.2366
400,1.0058
450,0.8831
500,0.8695


Step,Training Loss
50,2.8091
100,2.6678
150,2.2708
200,1.7772
250,1.5114
300,1.3141
350,1.2366
400,1.0058
450,0.8831
500,0.8695


TrainOutput(global_step=1448, training_loss=0.8330206653689811, metrics={'train_runtime': 838.312, 'train_samples_per_second': 27.617, 'train_steps_per_second': 1.727, 'total_flos': 1523105562599424.0, 'train_loss': 0.8330206653689811, 'epoch': 8.0})

In [68]:
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids

print("Classification Report:\n")
print(classification_report(y_true, y_pred, target_names=labels))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_true, y_pred))



Classification Report:

                               precision    recall  f1-score   support

     Work Injury Benefits Act       0.79      0.68      0.73        28
               Income Tax Act       0.85      0.79      0.82       104
                 Evidence Act       0.84      0.82      0.83        51
           Tax Procedures Act       0.62      0.78      0.69        55
          Law of Contract Act       1.00      1.00      1.00        11
                 Marriage Act       0.89      0.71      0.79        24
         Labour Relations Act       0.76      0.73      0.75        30
        Law of Succession Act       0.86      0.68      0.76        47
       Small Claims Court Act       0.67      0.53      0.59        15
Persons with Disabilities Act       0.72      0.70      0.71        33
        Constitution of Kenya       0.83      0.87      0.85        98
               Employment Act       0.59      0.70      0.64        37
                     Land Act       0.83      0.90  

In [69]:
save_path = "./legal_bert_classifier"

trainer.save_model(save_path)

tokenizer.save_pretrained(save_path)


('./legal_bert_classifier/tokenizer_config.json',
 './legal_bert_classifier/special_tokens_map.json',
 './legal_bert_classifier/vocab.txt',
 './legal_bert_classifier/added_tokens.json')

In [70]:
from transformers import pipeline

# Load your trained classifier into a pipeline
clf = pipeline(
    "text-classification",
    model="./legal_bert_classifier",
    tokenizer="./legal_bert_classifier"
)

# Run a sample query
text =  "How is a marriage legally dissolved?"
prediction = clf(text)

print(prediction)

Device set to use cuda:0


[{'label': 'Marriage Act', 'score': 0.9485592842102051}]


In [72]:
from transformers import pipeline

# Load your trained classifier into a pipeline
clf = pipeline(
    "text-classification",
    model="./legal_bert_classifier",
    tokenizer="./legal_bert_classifier"
)

# Run a sample query
text =  "Can the government pass a law that limits freedom of speech without parliamentary approval?"
prediction = clf(text)

print(prediction)

Device set to use cuda:0


[{'label': 'Constitution of Kenya', 'score': 0.9883834719657898}]


In [73]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./legal_bert_classifier /content/drive/MyDrive/Uhaki_Models/legal_bert_classifier

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
