In [2]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset('ninadn/indian-legal')
train_df = pd.DataFrame(dataset['train'])

# Clean dataset
def clean_dataframe(df):
    df = df.dropna(subset=['Text', 'Summary'])
    df = df[(df['Text'].str.strip() != '') & (df['Summary'].str.strip() != '')]
    return df

train_df = clean_dataframe(train_df)

# Split into 90% train and 10% validation
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

# Optional: Remove stopwords
def remove_stopwords(text):
    stop_words = set(['the', 'is', 'in', 'and', 'to', 'with', 'a', 'of', 'for', 'on', 'that', 'as', 'at', 'by', 'it', 'this', 'an', 'are', 'or'])
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

train_df['Text'] = train_df['Text'].apply(remove_stopwords)
val_df['Text'] = val_df['Text'].apply(remove_stopwords)


In [3]:
class LegalDataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['Text']
        summary = self.data.iloc[idx]['Summary']

        text_encodings = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_len
        )

        summary_encodings = self.tokenizer(
            summary,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_len
        )

        return text_encodings.input_ids.squeeze(), text_encodings.attention_mask.squeeze(), summary_encodings.input_ids.squeeze()


In [4]:
class PointerGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PointerGenerator, self).__init__()
        self.encoder = BertModel.from_pretrained('bert-base-uncased')
        self.decoder = nn.LSTM(embedding_dim, hidden_dim)
        self.pointer_layer = nn.Linear(hidden_dim, vocab_size)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, input_ids, attention_mask):
        encoder_outputs = self.encoder(input_ids, attention_mask=attention_mask)[0]
        decoder_outputs, _ = self.decoder(encoder_outputs)
        final_logits = self.pointer_layer(decoder_outputs)
        return final_logits


In [5]:
# Initialize tokenizer and datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = LegalDataset(train_df, tokenizer)
val_dataset = LegalDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Model, optimizer, and loss
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = PointerGenerator(vocab_size=tokenizer.vocab_size, embedding_dim=768, hidden_dim=768).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train function
def train_epoch(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in data_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.view(-1, tokenizer.vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(data_loader)




In [6]:
epochs = 5
for epoch in range(epochs):
    loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss}")

KeyboardInterrupt: 

In [7]:
def save_model(model, tokenizer, model_path, tokenizer_path):
    torch.save(model.state_dict(), model_path)
    tokenizer.save_pretrained(tokenizer_path)

def load_model(model, tokenizer, model_path, tokenizer_path):
    model.load_state_dict(torch.load(model_path))
    tokenizer.from_pretrained(tokenizer_path)
    return model, tokenizer

# Paths for saving and loading
model_path = "/kaggle/working/pointer_generator_model.pth"
tokenizer_path = "/kaggle/working/tokenizer"

# Save model and tokenizer
save_model(model, tokenizer, model_path, tokenizer_path)

# Load saved model and tokenizer
model, tokenizer = load_model(model, tokenizer, model_path, tokenizer_path)


  model.load_state_dict(torch.load(model_path))


In [8]:
model, tokenizer = load_model(model, tokenizer, model_path, tokenizer_path)

  model.load_state_dict(torch.load(model_path))


In [9]:
def generate_ngrams(text, n):
    tokens = nltk.word_tokenize(text)
    if len(tokens) < n:
        return Counter()  # Return an empty counter if not enough tokens
    ngrams_list = list(ngrams(tokens, n))
    ngram_counts = Counter(ngrams_list)
    total_ngrams = sum(ngram_counts.values())
    ngram_scores = {ngram: total_ngrams / freq for ngram, freq in ngram_counts.items()}
    return ngram_scores


def score_sentence_with_ngrams(sentence, ngram_scores, n=3):
    tokens = nltk.word_tokenize(sentence)
    if len(tokens) < n:
        # If not enough tokens for n-grams, assign a default score of 0
        return 0

    sentence_ngrams = list(ngrams(tokens, n))
    score = sum(ngram_scores.get(ngram, 0) for ngram in sentence_ngrams)
    return score


def generate_summary_with_model(text, model, tokenizer, ngram_scores, max_tokens=None, n=3):
    sentences = nltk.sent_tokenize(text)
    sentences= sentences[:40]
    inputs = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        sentence_embeddings = outputs[:, 0, :].cpu()

    torch.cuda.empty_cache()
    sentence_scores = []
    for idx, sentence in enumerate(sentences):
        ngram_score = score_sentence_with_ngrams(sentence, ngram_scores, n)
        sentence_score = ngram_score * torch.norm(sentence_embeddings[idx])
        sentence_scores.append((sentence, sentence_score))

    sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)

    # If max_tokens is None, include all sentences
    if max_tokens is None:
        return " ".join([sent for sent, _ in sentence_scores])

    summary_tokens, selected_sentences = 0, []
    for sent, _ in sentence_scores:
        tokens = len(tokenizer.tokenize(sent))
        if summary_tokens + tokens > max_tokens:
            break
        selected_sentences.append(sent)
        summary_tokens += tokens

    return " ".join(selected_sentences)


In [10]:
import torch

# Ensure you are using GPU 1
device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
print(f"Using device: {device}")

# Move model to GPU 1
model = model.to(device)

def evaluate_model_on_validation(val_df, model, tokenizer, max_tokens_list):
    total_rouge_scores = {tokens: {'rouge1': 0, 'rouge2': 0, 'rougeL': 0} for tokens in max_tokens_list}

    for idx in range(len(val_df)):
        text = val_df.iloc[idx]['Text']
        reference_summary = val_df.iloc[idx]['Summary']
        ngram_scores = generate_ngrams(text, 3)

        # Generate summaries for each token limit
        for max_tokens in max_tokens_list:
            generated_summary = generate_summary_with_model(text, model, tokenizer, ngram_scores, max_tokens)

            scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
            rouge_scores = scorer.score(reference_summary, generated_summary)

            total_rouge_scores[max_tokens]['rouge1'] += rouge_scores['rouge1'].fmeasure
            total_rouge_scores[max_tokens]['rouge2'] += rouge_scores['rouge2'].fmeasure
            total_rouge_scores[max_tokens]['rougeL'] += rouge_scores['rougeL'].fmeasure

    # Normalize scores
    for max_tokens in total_rouge_scores:
        for metric in total_rouge_scores[max_tokens]:
            total_rouge_scores[max_tokens][metric] /= len(val_df)

    return total_rouge_scores



max_tokens_list = [300, 600]

# Evaluate and print results
rouge_scores = evaluate_model_on_validation(val_df, model, tokenizer, max_tokens_list)

for max_tokens, scores in rouge_scores.items():
    print(f"\nResults for summaries up to {max_tokens} tokens:")
    print(f"ROUGE-1: {scores['rouge1']}")
    print(f"ROUGE-2: {scores['rouge2']}")
    print(f"ROUGE-L: {scores['rougeL']}")


Using device: cuda:1

Results for summaries up to 300 tokens:
ROUGE-1: 0.2543399457127066
ROUGE-2: 0.07534923595466274
ROUGE-L: 0.12287232867720368

Results for summaries up to 600 tokens:
ROUGE-1: 0.33299165679905457
ROUGE-2: 0.10284371873330596
ROUGE-L: 0.1412258131158518
