In [2]:
pip install torch

^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch

# Report the visible CUDA devices and pick the device used for tensors below.
cuda_available = torch.cuda.is_available()

print("Number of GPU: ", torch.cuda.device_count())
if cuda_available:
    # get_device_name() raises when no CUDA device is usable, so it is only
    # queried once availability has been confirmed.
    print("GPU Name: ", torch.cuda.get_device_name())
else:
    print("GPU Name: ", "none (CUDA not available)")


device = torch.device('cuda' if cuda_available else 'cpu')
print('Using device:', device)

ModuleNotFoundError: No module named 'torch'

In [None]:
import tensorflow as tf
import numpy as np
import os
import re
from transformers import AutoTokenizer, TFBertModel

# Legal-BERT checkpoint shared by the tokenizer and the encoder below.
legal_bert_model_name = 'nlpaueb/legal-bert-base-uncased'

# Load the Legal-BERT tokenizer and the matching TensorFlow encoder.
tokenizer = AutoTokenizer.from_pretrained(legal_bert_model_name)
model = TFBertModel.from_pretrained(legal_bert_model_name)

# Function to clean legal text
def clean_text(text):
    """Normalise raw legal-judgment text before tokenisation.

    The steps run in this order so that each one still sees the characters it
    depends on:

    1. ``AIR <4 digits> SC <3-4 digits>`` case citations become ``[CITATION]``.
    2. Numeric dates such as ``12/05/1999`` or ``1-2-99`` become ``[DATE]``.
    3. Remaining stand-alone four-digit numbers become ``[YEAR]``.
    4. Every character that is not an ASCII letter or whitespace is removed,
       except inside the three placeholders above.
    5. Runs of whitespace are collapsed to a single space.
    6. A small set of legal terms is replaced by simpler synonyms.
    7. A small set of boilerplate phrases is removed.
    8. Whitespace is collapsed again and the result is stripped.

    Term and phrase matching is case-sensitive and substring-based.

    Args:
        text: Raw judgment text.

    Returns:
        The cleaned text as a single-spaced string.
    """
    # Citations must be handled before dates/years and before digits are
    # stripped; otherwise the pattern can never match.
    text = re.sub(r'\bAIR\s\d{4}\sSC\s\d{3,4}\b', '[CITATION]', text)
    text = re.sub(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', '[DATE]', text)  # Replace dates
    text = re.sub(r'\b\d{4}\b', '[YEAR]', text)  # Replace years

    # Strip non-alphabetical characters everywhere except inside placeholders.
    # re.split with a capturing group puts the placeholders at odd indices.
    pieces = re.split(r'(\[(?:CITATION|DATE|YEAR)\])', text)
    text = ''.join(
        piece if index % 2 else re.sub(r'[^a-zA-Z\s]', '', piece)
        for index, piece in enumerate(pieces)
    )

    # Collapse whitespace first so multi-word phrases below match even when the
    # source text wrapped them across lines.
    text = re.sub(r'\s+', ' ', text).strip()

    # Replace legal terms with simpler terms
    legal_dict = {
        'hereinabove': 'above',
        'hereinafter': 'below',
        'plaintiff': 'claimant',
        'defendant': 'respondent'
    }
    for term, replacement in legal_dict.items():
        text = text.replace(term, replacement)

    # Remove boilerplate legal phrases
    boilerplate_phrases = [
        'the learned counsel submitted that',
        'in light of the above discussion',
        'the facts of the case are as follows'
    ]
    for phrase in boilerplate_phrases:
        text = text.replace(phrase, '')

    # Removing phrases can leave doubled or leading spaces behind.
    return re.sub(r'\s+', ' ', text).strip()

# Function to read file content
def read(filepath):
    """Return the entire contents of ``filepath`` decoded as UTF-8 text."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    """Read one judgment file, clean it, tokenise it and embed it.

    Args:
        file_path: Indexable whose element 0 is an eager string tensor holding
            the judgment path (``.numpy().decode('utf-8')`` is called on it).
            Any further elements (e.g. a summary path) are not read here.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, embeddings)`` where the first
        two come from the tokenizer output and ``embeddings`` is element 0 of
        the model output.

    Note:
        ``tokenizer`` and ``model`` are module-level names resolved each time
        this function is called, so whatever they are bound to at call time is
        what gets used.
    """
    source_path = file_path[0].numpy().decode('utf-8')

    # Load the raw judgment and normalise it.
    cleaned = clean_text(read(source_path))

    # Fixed-length encoding: truncate or pad to exactly 512 tokens.
    encoded = tokenizer(
        cleaned,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=512,
    )

    # First model output; the original comment gives its shape as
    # (batch_size, sequence_length, hidden_size).
    hidden_states = model(encoded)[0]

    return encoded['input_ids'], encoded['attention_mask'], hidden_states

In [None]:
# Dataset location. Set the IN_ABS_DATASET_DIR environment variable to point at
# a different copy; the default keeps the original local path.
dataset_dir = os.environ.get(
    "IN_ABS_DATASET_DIR",
    "C:/Users/prasa/Downloads/7152317/dataset/dataset/IN-Abs",
)
train_judgement_dir = os.path.join(dataset_dir, 'train-data', 'judgement')
train_summary_dir = os.path.join(dataset_dir, 'train-data', 'summary')

# (judgment path, summary path) pairs. os.listdir order is arbitrary, so the
# names are sorted to keep the file list reproducible.
train_files = [
    (os.path.join(train_judgement_dir, file), os.path.join(train_summary_dir, file))
    for file in sorted(os.listdir(train_judgement_dir))
]


def _load_example(paths):
    """Run load_and_preprocess_data eagerly and return per-example tensors.

    The loader returns int32 token ids, an int32 attention mask and float
    embeddings, each with a leading size-1 batch axis from the single-text
    tokenizer call. That axis is dropped so the dataset's own batching adds
    the only batch dimension.
    """
    input_ids, attention_mask, embeddings = tf.py_function(
        load_and_preprocess_data, [paths], [tf.int32, tf.int32, tf.float32]
    )
    # py_function outputs carry no static shape; ensure_shape restores the rank
    # that padded_batch needs.
    return (
        tf.ensure_shape(tf.squeeze(input_ids, axis=0), [None]),
        tf.ensure_shape(tf.squeeze(attention_mask, axis=0), [None]),
        tf.ensure_shape(tf.squeeze(embeddings, axis=0), [None, None]),
    )


# Create a TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices(train_files)
# Shuffle the cheap path strings, not the mapped embeddings, so the shuffle
# buffer does not have to hold every encoded document in memory.
train_dataset = train_dataset.shuffle(buffer_size=len(train_files))
train_dataset = train_dataset.map(_load_example)
train_dataset = train_dataset.padded_batch(16, padded_shapes=([None], [None], [None, None]))
train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import numpy as np
# Read every training judgment. Names are sorted because os.listdir order is
# arbitrary and the document order determines the row order of term_matrix.
docs = [
    read(os.path.join(train_judgement_dir, file_name))
    for file_name in sorted(os.listdir(train_judgement_dir))
]

# Keep the imported `stopwords` corpus reader intact by binding the word list
# to its own name instead of overwriting it.
english_stopwords = stopwords.words('english')

# Bag-of-words counts: drop terms in more than 90% of documents or in fewer
# than two documents, plus English stop words.
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words=english_stopwords)
term_matrix = vectorizer.fit_transform(docs)

# Fit a 5-topic LDA model with a fixed seed.
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(term_matrix)

# Print the ten highest-weighted terms per topic (argsort is ascending, so the
# strongest term is printed last).
terms = np.array(vectorizer.get_feature_names_out())
for idx, topic in enumerate(lda_model.components_):
    print(f"Topic {idx}:")
    print(" ".join(terms[i] for i in topic.argsort()[-10:]))

In [None]:
# TfidfTransformer was used without being imported anywhere in the notebook.
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts into TF-IDF scores.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(term_matrix)

In [None]:
import networkx as nx
def graph_build(docs, term_matrix, tfidf_matrix, lda_model, terms, batch_size=16, top_n=10):
    """Build a document/topic graph from embeddings, LDA topics and TF-IDF.

    Nodes:
        * ``sentence_<i>`` for each document ``docs[i]``, with attributes
          ``embedding`` (mean of the encoder's last hidden state over the
          sequence axis, padding positions included), ``text`` and
          ``type='sentence'``.
        * ``topic_<k>`` for each LDA topic, with attributes ``words`` (its
          ``top_n`` highest-weighted terms) and ``type='topic'``.

    Edges:
        ``sentence_<i>`` -- ``topic_<k>`` whenever a term with a non-zero
        TF-IDF score in document ``i`` is among topic ``k``'s top terms. When
        several terms match, the weight is the score of the last one visited,
        because adding an existing edge overwrites its attributes.

    Args:
        docs: List of document strings.
        term_matrix: Unused; kept for call compatibility.
        tfidf_matrix: Sparse matrix with one row per document, iterated row by
            row.
        lda_model: Fitted model exposing ``components_``.
        terms: Array mapping feature index to term.
        batch_size: Number of documents encoded per model call.
        top_n: Number of top terms kept per topic.

    Returns:
        The populated ``networkx.Graph``.
    """
    G = nx.Graph()

    # Encode documents in small batches; a single call over the whole corpus
    # would materialise one (len(docs), 512, hidden) tensor.
    for start in range(0, len(docs), batch_size):
        batch_docs = list(docs[start:start + batch_size])
        inputs = tokenizer(batch_docs, return_tensors='tf', truncation=True, padding='max_length', max_length=512)
        outputs = model(inputs)
        # tf.Tensor has no .mean() method; tf.reduce_mean is the equivalent.
        batch_embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy()
        for offset, doc in enumerate(batch_docs):
            # Node ids use the same "sentence_<i>" scheme as the edges below so
            # edges attach to these nodes instead of creating new ones.
            G.add_node(
                f"sentence_{start + offset}",
                embedding=batch_embeddings[offset],
                type='sentence',
                text=doc,
            )

    # Top terms per topic, computed once and reused for nodes and edge lookup.
    topic_words = [[terms[i] for i in topic.argsort()[-top_n:]] for topic in lda_model.components_]
    for topic_idx, words in enumerate(topic_words):
        G.add_node(f"topic_{topic_idx}", type='topic', words=words)
    topic_word_sets = [set(words) for words in topic_words]

    for tfidf_idx, doc_tfidf in enumerate(tfidf_matrix):
        feature_index = doc_tfidf.nonzero()[1]
        for word_idx in feature_index:
            word = terms[word_idx]
            score = doc_tfidf[0, word_idx]
            # Connect the document to every topic whose top terms contain it.
            for topic_idx, words in enumerate(topic_word_sets):
                if word in words:
                    G.add_edge(f"sentence_{tfidf_idx}", f"topic_{topic_idx}", weight=score)
    return G
graph = graph_build(docs, term_matrix, tfidf_matrix, lda_model, terms)
# nx.info() was removed in NetworkX 3.0; print the size summary directly.
print(f"Graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")


In [None]:
class GAT(tf.keras.layers.Layer):
    """Single-head graph attention layer.

    For connected nodes ``i`` and ``j`` the unnormalised score is
    ``leaky_relu(a^T [W h_i || W h_j])``. Scores are softmax-normalised over
    each node's neighbours and used to average the transformed neighbour
    features. Nodes without neighbours get an all-zero output row.

    Args:
        output_dim: Size of the transformed feature vector per node.
        num_heads: Stored on the layer but not used; one head is computed.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, output_dim, num_heads=1, **kwargs):
        super().__init__(**kwargs)
        self.output_dim = output_dim
        self.num_heads = num_heads

    def build(self, input_shape):
        """Create the projection ``W`` and the attention vector ``a``."""
        self.W = self.add_weight(shape=(input_shape[-1], self.output_dim), initializer='random_normal', trainable=True)
        self.a = self.add_weight(shape=(2 * self.output_dim, 1), initializer='random_normal', trainable=True)

    def call(self, node_features, adj_matrix):
        """Apply attention-weighted neighbourhood aggregation.

        Args:
            node_features: ``(N, F)`` node feature tensor.
            adj_matrix: ``(N, N)`` array-like; entries ``> 0`` mark edges.

        Returns:
            ``(N, output_dim)`` tensor of updated node features.
        """
        transformed = tf.matmul(node_features, self.W)  # (N, output_dim)
        adjacency = tf.cast(tf.convert_to_tensor(adj_matrix), transformed.dtype)
        connected = adjacency > 0

        # a^T [h_i || h_j] splits into a_src^T h_i + a_dst^T h_j, so all N*N
        # pair scores come from two matrix-vector products and a broadcast.
        src_score = tf.matmul(transformed, self.a[:self.output_dim])  # (N, 1)
        dst_score = tf.matmul(transformed, self.a[self.output_dim:])  # (N, 1)
        scores = tf.nn.leaky_relu(src_score + tf.transpose(dst_score))  # (N, N)

        # Push non-edges to the most negative finite value so softmax gives
        # them zero weight, then zero them explicitly so rows without any
        # neighbour aggregate to zeros rather than a uniform average.
        masked_scores = tf.where(connected, scores, tf.zeros_like(scores) + scores.dtype.min)
        attention = tf.nn.softmax(masked_scores, axis=-1)
        attention = tf.where(connected, attention, tf.zeros_like(attention))

        return tf.matmul(attention, transformed)
def graph_to_adj_matrix(graph):
    """Return the adjacency matrix of ``graph`` as a 2-D NumPy array.

    Rows and columns follow the order of ``graph.nodes``. ``to_numpy_array`` is
    used because ``to_numpy_matrix`` was removed in NetworkX 3.0.
    """
    return nx.to_numpy_array(graph)
        

In [None]:
adj_matrix = graph_to_adj_matrix(graph)
gat_layer = GAT(output_dim=128, num_heads=1)

# Placeholder node features: one random 768-dimensional vector per graph node.
# The generator is seeded so the cell yields the same features on every run.
rng = np.random.default_rng(42)
node_features = rng.random((len(graph.nodes), 768))
node_features = tf.convert_to_tensor(node_features, dtype=tf.float32)

updated_node_features = gat_layer(node_features, adj_matrix)

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

def attention_pooling(embeddings):
    """Pool ``embeddings`` along axis 1 using softmax weights of themselves.

    The weights are the softmax over axis 1 of the embeddings (passed through
    an identity ``Lambda`` layer), cast to the embeddings' dtype. The result is
    the weighted sum over axis 1.
    """
    passthrough = tf.keras.layers.Lambda(lambda x: x)
    weights = tf.cast(
        tf.nn.softmax(passthrough(embeddings), axis=1),
        dtype=embeddings.dtype,
    )
    return tf.reduce_sum(embeddings * weights, axis=1)

# Custom Loss Function (same as before)
def custom_loss(labels, y_pred, original_embeddings, summary_embeddings):
    """Combine token cross-entropy with a cosine-distance term.

    Args:
        labels: Integer class ids for sparse categorical cross-entropy.
        y_pred: Logits (the loss is built with ``from_logits=True``).
        original_embeddings: Embeddings pooled to represent the source.
        summary_embeddings: Embeddings pooled to represent the summary.

    Returns:
        ``cross_entropy + mean(1 - cosine_similarity)`` where the similarity is
        taken between the L2-normalised, attention-pooled embeddings.
    """
    def _unit_pooled(embeddings):
        # Pool along axis 1, then scale to unit length along the last axis.
        return tf.nn.l2_normalize(attention_pooling(embeddings), axis=-1)

    token_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(labels, y_pred)

    unit_original = _unit_pooled(original_embeddings)
    unit_summary = _unit_pooled(summary_embeddings)

    # The dot product of unit vectors is their cosine similarity.
    similarity = tf.reduce_sum(unit_original * unit_summary, axis=-1)
    distance_loss = tf.reduce_mean(1 - similarity)

    return token_loss + distance_loss

# Load the pre-trained BART summarisation checkpoint and its tokenizer.
# NOTE(review): these assignments rebind the module-level names `tokenizer` and
# `model`, which an earlier cell bound to Legal-BERT. Any code that looks the
# names up after this cell has run (load_and_preprocess_data and graph_build
# both read them at call time) will get the BART objects — confirm that this
# is intended.
bart_model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(bart_model_name)

# Training Step using BART Decoder with GAT Embeddings
@tf.function
def train_step_gat(input_embeddings, decoder_input_ids, labels, optimizer):
    """Run one optimisation step of the module-level ``model``.

    Args:
        input_embeddings: Passed to the decoder as ``encoder_hidden_states``
            and to ``custom_loss`` as ``original_embeddings``.
        decoder_input_ids: Token ids fed to the decoder.
        labels: Targets passed to ``custom_loss``.
        optimizer: Must provide ``get_scaled_loss``, ``get_unscaled_gradients``
            and ``apply_gradients`` (the notebook passes a LossScaleOptimizer).

    Side effects:
        Updates ``model.trainable_variables`` and adds the unscaled loss to the
        module-level ``train_loss`` metric. Returns nothing.
    """
    with tf.GradientTape() as tape:
        # Forward pass: GAT embeddings (bypassing BART encoder)
        # NOTE(review): `.logits` is read from the decoder sub-module's return
        # value; confirm that output exposes logits (an LM head may be needed)
        # — TODO verify against the installed transformers version.
        # NOTE(review): presumably input_embeddings must match the decoder's
        # expected encoder hidden size and rank — confirm against the caller.
        outputs = model.model.decoder(input_ids=decoder_input_ids, encoder_hidden_states=input_embeddings, training=True).logits

        # Custom Loss: Cross-Entropy + Cosine Similarity Loss
        # NOTE(review): `outputs` is passed both as y_pred and as
        # summary_embeddings — confirm this is intended.
        loss = custom_loss(labels, outputs, input_embeddings, outputs)

        # Mixed precision loss scaling
        scaled_loss = optimizer.get_scaled_loss(loss)

    # Compute gradients
    scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)

    # Apply gradients
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # Accumulate the unscaled loss into the running mean metric.
    train_loss(loss)

# Initialize optimizer
base_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(base_optimizer)
train_loss = tf.keras.metrics.Mean(name='train_loss')

# Example training loop with correct dataset usage
epochs = 10

# Assuming `updated_node_features` is the input embeddings from the GAT layer
for epoch in range(epochs):
    # Reset the running mean so the value printed below covers this epoch only
    # rather than every step since the metric was created.
    train_loss.reset_state()

    for input_ids, attention_mask, decoder_input_ids in train_dataset:
        # NOTE(review): the third dataset element is treated as decoder token
        # ids here — confirm that matches what the dataset pipeline yields.
        labels = decoder_input_ids[:, 1:]  # Shifted for the decoder
        decoder_input_ids = decoder_input_ids[:, :-1]

        # Use the GAT embeddings (assume precomputed) as input.
        # NOTE(review): this is the same module-level tensor for every batch;
        # it is not selected per batch.
        input_embeddings = updated_node_features

        # Train with GAT embeddings and decoder inputs (summary)
        train_step_gat(input_embeddings, decoder_input_ids, labels, optimizer)

    print(f'Epoch {epoch + 1}, Loss: {train_loss.result()}')