In [1]:
# Cell 1: environment check and installations (if any)
# This cell is checking for GPU availability and printing device info.
import tensorflow as tf
import os
print("TensorFlow version is", tf.__version__)
# Query the GPU list exactly once and inside the guard. Previously the first
# lookup ran unprotected, so the try/except around the second one could never
# catch anything the first call had not already raised.
try:
    gpuDevices = tf.config.list_physical_devices('GPU')
except Exception as e:
    # Best-effort check: report the problem and carry on as "no GPU".
    gpuDevices = []
    print("GPU check is raising:", e)
print("GPU is", "available" if gpuDevices else "NOT available")
print("Physical GPUs:", gpuDevices)


TensorFlow version is 2.19.0
GPU is available
Physical GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Cell 2: imports and global settings
# This cell is importing required libraries and setting global hyperparameters.
import zipfile
import glob
import pandas as pd
import numpy as np
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Seed every random source used in this notebook: Python's `random`
# (pair shuffling/sampling), NumPy, and TensorFlow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): tensorflow is already imported in the environment-check cell;
# this repeat import is redundant but harmless.
import tensorflow as tf
tf.random.set_seed(SEED)

# Core hyperparameters; every later cell reads these module-level names.
maxVocabSize = 30000      # Tokenizer(num_words=...) and upper bound for the embedding vocabulary
maxSeqLen = 256           # pad_sequences(maxlen=...) and ModelBuilder(maxLen=...)
embeddingDim = 128        # ModelBuilder(embeddingDim=...)
batchSize = 128           # default batch_size of fitModel
numEpochs = 12            # default/explicit epochs passed to fitModel
validationSplit = 0.1     # validation_split handed to model.fit inside fitModel


In [4]:
# Cell 3: data loader and preprocessing classes
# This cell is defining classes for loading, preprocessing, and generating pairs.

class ZipDatasetLoader:
    """Extract a zip archive and load every CSV inside it as clause records.

    Args:
        zipPath: Path of the zip archive to extract.
        extractTo: Directory the archive is extracted into. It is created
            when the loader is constructed if it does not exist yet.
    """

    # Lower-cased column names tried in order of preference.
    _TEXT_COLUMNS = ('clause_text', 'text', 'clause')
    _TYPE_COLUMNS = ('clause_type', 'type', 'label')

    def __init__(self, zipPath="archive.zip", extractTo="/content/archive"):
        self.zipPath = zipPath
        self.extractTo = extractTo
        os.makedirs(self.extractTo, exist_ok=True)

    def extractZip(self):
        """Extract the whole archive at ``zipPath`` into ``extractTo``."""
        with zipfile.ZipFile(self.zipPath, 'r') as z:
            z.extractall(self.extractTo)
        print("Archive is extracted to", self.extractTo)

    @staticmethod
    def _findColumn(df, candidates):
        """Return the first column whose lower-cased name is in ``candidates``, else None."""
        lowered = [str(c).lower() for c in df.columns]
        for name in candidates:
            if name in lowered:
                return df.columns[lowered.index(name)]
        return None

    @staticmethod
    def _asCleanStrings(series):
        """Convert a column to strings, mapping missing values to ``""``.

        ``astype(str)`` alone turns NaN into the literal text "nan", after
        which ``fillna`` has nothing left to fill; masking on the original
        null positions keeps missing entries empty.
        """
        return series.astype(str).where(series.notna(), "")

    def loadCsvFiles(self):
        """Read every CSV under ``extractTo`` (recursively) into one dataframe.

        Returns:
            A dataframe with the columns ``clauseText``, ``clauseType`` and
            ``sourceFile`` (base name of the CSV each row came from).

        Raises:
            FileNotFoundError: If no CSV file exists under ``extractTo``.
        """
        # Sorted so the row order (and every seeded sample drawn from it later)
        # does not depend on the filesystem's directory listing order.
        csvPaths = sorted(glob.glob(os.path.join(self.extractTo, "**", "*.csv"), recursive=True))
        dfs = []
        for p in csvPaths:
            df = pd.read_csv(p)

            # Text column: known names first, otherwise the first column.
            textCol = self._findColumn(df, self._TEXT_COLUMNS)
            if textCol is None:
                textCol = df.columns[0]

            # Type column: known names first, otherwise the second column if any.
            typeCol = self._findColumn(df, self._TYPE_COLUMNS)
            if typeCol is None and df.shape[1] > 1:
                typeCol = df.columns[1]

            standardized = pd.DataFrame()
            standardized['clauseText'] = self._asCleanStrings(df[textCol])
            standardized['clauseType'] = self._asCleanStrings(df[typeCol]) if typeCol is not None else ""
            standardized['sourceFile'] = os.path.basename(p)
            dfs.append(standardized)
        if not dfs:
            raise FileNotFoundError("No CSV files are found in the archive extract path.")
        allDf = pd.concat(dfs, ignore_index=True)
        print("Total clauses are", len(allDf))
        return allDf

class PairGenerator:
    """Build labelled clause pairs (1 = same clause type, 0 = different type).

    Args:
        clauseDf: Dataframe with string columns ``clauseText`` and
            ``clauseType``. It is copied; the caller's frame is not modified.
    """
    def __init__(self, clauseDf):
        # groupby(...).indices yields POSITIONAL offsets. Resetting to a
        # RangeIndex makes positions and labels coincide, so frames that were
        # filtered or re-indexed upstream are handled correctly.
        self.df = clauseDf.copy().reset_index(drop=True)
        # astype(str) would turn NaN into the text "nan"; keep missing types as "".
        rawTypes = self.df['clauseType']
        self.df['clauseType'] = rawTypes.astype(str).where(rawTypes.notna(), "")
        self.typeGroups = self.df.groupby('clauseType').indices

    def simpleTextClean(self, text):
        """Lower-case ``text`` and collapse every whitespace run to one space."""
        text = text.lower()
        # Only whitespace is normalised here; other characters are kept as-is.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def generatePairs(self, maxPairsPerType=5000, negativeToPositiveRatio=1.0):
        """Create similarity pairs from the clause dataframe.

        Two clauses of the same ``clauseType`` form a positive pair (label 1);
        two clauses of different types form a negative pair (label 0).

        Args:
            maxPairsPerType: Cap on positive pairs taken from each clause type.
            negativeToPositiveRatio: Number of negatives drawn per positive.

        Returns:
            ``(lefts, rights, labels)``: two lists of cleaned texts and a NumPy
            array of 0/1 labels, shuffled together.

        Raises:
            ValueError: If negatives are requested but fewer than two distinct
                clause types exist (no negative pair can ever be drawn).
        """
        from itertools import combinations

        self.df['cleanText'] = self.df['clauseText'].apply(self.simpleTextClean)
        # Plain lists give O(1) lookups instead of a pandas .loc call per pair.
        cleanTexts = self.df['cleanText'].tolist()
        clauseTypes = self.df['clauseType'].tolist()

        posPairs = []
        for idxs in self.typeGroups.values():
            idxList = list(idxs)
            if len(idxList) < 2:
                continue
            random.shuffle(idxList)
            # NOTE(review): pairs are enumerated in (i, j>i) order, so for a
            # large type the first maxPairsPerType pairs mostly share the same
            # left clause. Kept as-is to preserve the published results.
            for count, (i, j) in enumerate(combinations(idxList, 2), start=1):
                posPairs.append((cleanTexts[i], cleanTexts[j], 1))
                if count >= maxPairsPerType:
                    break

        negPairs = []
        numNegNeeded = int(len(posPairs) * negativeToPositiveRatio)
        if numNegNeeded > 0 and len(set(clauseTypes)) < 2:
            # Without this guard the rejection-sampling loop below never ends.
            raise ValueError(
                "Cannot sample negative pairs: fewer than two distinct clause types are present."
            )
        allIdxs = list(range(len(self.df)))
        while len(negPairs) < numNegNeeded:
            a, b = random.sample(allIdxs, 2)
            if clauseTypes[a] != clauseTypes[b]:
                negPairs.append((cleanTexts[a], cleanTexts[b], 0))

        pairs = posPairs + negPairs
        random.shuffle(pairs)
        print("Pairs are generated: positives =", len(posPairs), "negatives =", len(negPairs))
        lefts = [p[0] for p in pairs]
        rights = [p[1] for p in pairs]
        labels = [p[2] for p in pairs]
        return lefts, rights, np.array(labels)


In [5]:
# Cell 4: tokenization and dataset preparation
# This cell is extracting the zip, loading data, generating pairs, tokenizing and preparing train/test splits.

# Unpack the archive and merge every CSV it contains into one clause dataframe.
loader = ZipDatasetLoader(zipPath="/content/archive.zip", extractTo="/content/archive")
loader.extractZip()
dfClauses = loader.loadCsvFiles()

# Turn clauses into labelled (left, right) text pairs.
pairGen = PairGenerator(dfClauses)
leftTexts, rightTexts, labels = pairGen.generatePairs(
    maxPairsPerType=300,
    negativeToPositiveRatio=1.0,
)

# Fit a fresh Keras Tokenizer on both sides of every pair.
# NOTE(review): the tokenizer is fitted before the train/test split below, so
# its vocabulary also covers text that ends up in the test set.
allTexts = leftTexts + rightTexts
tokenizer = Tokenizer(num_words=maxVocabSize, oov_token="<OOV>")
tokenizer.fit_on_texts(allTexts)
print("Tokenizer is fitted. Vocab size (approx) =", min(maxVocabSize, len(tokenizer.word_index)+1))

# Map texts to integer ids, then pad/truncate at the end to maxSeqLen.
leftSeq = tokenizer.texts_to_sequences(leftTexts)
rightSeq = tokenizer.texts_to_sequences(rightTexts)
padOptions = dict(maxlen=maxSeqLen, padding='post', truncating='post')
leftPad = pad_sequences(leftSeq, **padOptions)
rightPad = pad_sequences(rightSeq, **padOptions)

# Stack both sides so one array carries a whole pair: shape = (N, 2, seqLen).
X = np.stack([leftPad, rightPad], axis=1)
y = labels
# NOTE(review): the split is made over pairs, not over clauses, so a clause can
# appear in training pairs and in test pairs at the same time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=SEED, stratify=y
)
# The validation subset is not carved out here; fitModel passes validationSplit to Keras.
print("Data shapes are:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)


Archive is extracted to /content/archive
Total clauses are 150881
Pairs are generated: positives = 118305 negatives = 118305
Tokenizer is fitted. Vocab size (approx) = 30000
Data shapes are: (201118, 2, 256) (35492, 2, 256) (201118,) (35492,)


In [6]:
# Cell 5: model builder (Siamese BiLSTM and Attention Encoder)
# This cell is defining a modular ModelBuilder class that is creating two models.

class ModelBuilder:
    """Factory for the two clause-similarity networks compared in this notebook.

    Args:
        vocabSize: Vocabulary size, used as ``input_dim`` of the embedding.
        embeddingDim: Width of each token embedding vector.
        maxLen: Fixed token length of every input sequence.

    NOTE(review): the embedding is built with ``mask_zero=False``, so padded
    positions pass through the BiLSTM and attention layers like real tokens.
    """
    def __init__(self, vocabSize, embeddingDim=128, maxLen=256):
        self.vocabSize = vocabSize
        self.embeddingDim = embeddingDim
        self.maxLen = maxLen

    def buildEmbeddingLayer(self):
        """Return a new Embedding layer; each builder shares one across both inputs."""
        # `input_length` is deliberately not passed: the sequence length already
        # comes from the Input layers, and Keras 3 deprecates the argument.
        return layers.Embedding(input_dim=self.vocabSize, output_dim=self.embeddingDim,
                                mask_zero=False, name="sharedEmbedding")

    def buildSiameseBiLstm(self, lstmUnits=64, dropout=0.2):
        """Build and compile a siamese BiLSTM classifier.

        One encoder (embedding -> BiLSTM -> dropout -> dense) is applied to both
        inputs; the two vectors are combined through |a - b| and a * b and fed
        to a dense head with a sigmoid output.

        Args:
            lstmUnits: Units per LSTM direction and width of the encoded vector.
            dropout: Dropout rate used in the encoder and in the head.

        Returns:
            A compiled Keras model taking ``[leftInput, rightInput]``.
        """
        embed = self.buildEmbeddingLayer()
        # Encoder shared by both branches (same weights for left and right).
        seqInput = Input(shape=(self.maxLen,), dtype='int32', name="seqInput")
        x = embed(seqInput)
        x = layers.Bidirectional(layers.LSTM(lstmUnits, return_sequences=False), name="bilstm")(x)
        x = layers.Dropout(dropout)(x)
        encoded = layers.Dense(lstmUnits, activation='relu', name="encodedDense")(x)
        encoder = Model(seqInput, encoded, name="siameseEncoder")

        # Token ids are integers; declare them as such, like the encoder input.
        leftInput = Input(shape=(self.maxLen,), dtype='int32', name="leftInput")
        rightInput = Input(shape=(self.maxLen,), dtype='int32', name="rightInput")
        leftVec = encoder(leftInput)
        rightVec = encoder(rightInput)
        # Symmetric interaction features: absolute difference and element-wise product.
        absDiff = layers.Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))([leftVec, rightVec])
        mult = layers.Multiply()([leftVec, rightVec])
        merged = layers.Concatenate()([absDiff, mult])
        merged = layers.Dense(128, activation='relu')(merged)
        # Honour the `dropout` argument here too (it used to be fixed at 0.2,
        # which equals the default, so default behaviour is unchanged).
        merged = layers.Dropout(dropout)(merged)
        out = layers.Dense(1, activation='sigmoid')(merged)
        model = Model([leftInput, rightInput], out, name="siameseBiLstm")
        model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def buildAttentionEncoder(self, lstmUnits=64, dropout=0.2):
        """Build and compile a dual encoder with BiLSTM and token-level attention.

        Only the embedding is shared: each side gets its own BiLSTM, attention
        scorer and projection. The head consumes ``[left, right, |left - right|]``.

        Args:
            lstmUnits: Units per LSTM direction and width of each context vector.
            dropout: Dropout rate applied in the classification head.

        Returns:
            A compiled Keras model taking ``[left_input, right_input]``.
        """
        embed = self.buildEmbeddingLayer()
        def encoderBlock(namePrefix):
            seqInput = Input(shape=(self.maxLen,), dtype='int32', name=f"{namePrefix}_input")
            x = embed(seqInput)
            x = layers.Bidirectional(layers.LSTM(lstmUnits, return_sequences=True), name=f"{namePrefix}_bilstm")(x)
            # One learnable score per token, normalised over the sequence axis.
            # No mask is applied, so padded positions receive weight as well.
            attnScores = layers.Dense(1, activation='tanh')(x)   # (batch, seqLen, 1)
            attnWeights = layers.Softmax(axis=1)(attnScores)     # normalize across seqLen
            context = layers.Multiply()([x, attnWeights])        # weighted tokens
            contextVec = layers.Lambda(lambda z: tf.reduce_sum(z, axis=1))(context)  # sum pooling
            contextVec = layers.Dense(lstmUnits, activation='relu')(contextVec)
            return seqInput, contextVec

        leftInput, leftVec = encoderBlock("left")
        rightInput, rightVec = encoderBlock("right")

        # Both vectors plus their absolute difference feed the classifier head.
        absDiff = layers.Lambda(lambda t: tf.abs(t[0]-t[1]))([leftVec, rightVec])
        combined = layers.Concatenate()([leftVec, rightVec, absDiff])
        combined = layers.Dense(256, activation='relu')(combined)
        combined = layers.Dropout(dropout)(combined)
        combined = layers.Dense(64, activation='relu')(combined)
        out = layers.Dense(1, activation='sigmoid')(combined)

        model = Model([leftInput, rightInput], out, name="attentionEncoder")
        model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
        return model


In [9]:
# Cell 6: helper training & evaluation functions
# This cell is defining train and evaluate utilities (OOP friendly).

def fitModel(model, X_train, y_train, modelName="model", epochs=numEpochs, batch_size=batchSize):
    """Train ``model`` on paired inputs with early stopping and checkpointing.

    Args:
        model: Compiled Keras model taking ``[left, right]`` inputs.
        X_train: Array indexed as ``X_train[:, 0]`` (left) and ``X_train[:, 1]`` (right).
        y_train: Labels aligned with ``X_train``.
        modelName: Stem used for the checkpoint file name.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(history, bestPath)``: the object returned by ``model.fit`` and the
        path the best checkpoint is written to.
    """
    bestPath = f"/content/{modelName}_best.h5"

    # Both callbacks watch val_loss: one stops after 3 stale epochs and restores
    # the best weights, the other keeps only the best checkpoint on disk.
    stopWhenStale = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
    keepBestOnDisk = ModelCheckpoint(bestPath, monitor='val_loss', save_best_only=True, verbose=1)

    pairInputs = [X_train[:,0], X_train[:,1]]
    history = model.fit(
        pairInputs,
        y_train,
        validation_split=validationSplit,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[stopWhenStale, keepBestOnDisk],
        verbose=2,
    )
    return history, bestPath

def evaluateModel(model, X_eval, y_eval):
    """Score ``model`` on a labelled evaluation set.

    Args:
        model: Trained Keras model taking ``[left, right]`` inputs.
        X_eval: Array indexed as ``X_eval[:, 0]`` (left) and ``X_eval[:, 1]`` (right).
        y_eval: Ground-truth 0/1 labels aligned with ``X_eval``.

    Returns:
        ``(metrics, probs, preds)`` where ``metrics`` is a dict with the keys
        ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``,
        ``probs`` are the flattened model outputs and ``preds`` are the labels
        obtained by thresholding ``probs`` at 0.5.
    """
    leftEval = X_eval[:,0]
    rightEval = X_eval[:,1]
    probs = model.predict([leftEval, rightEval], batch_size=256, verbose=0).ravel()
    preds = (probs >= 0.5).astype(int)
    acc = accuracy_score(y_eval, preds)
    prec, recall, f1, _ = precision_recall_fscore_support(y_eval, preds, average='binary', zero_division=0)
    # ROC-AUC is undefined when y_eval holds a single class; scikit-learn
    # reports that as ValueError. Only that error type is mapped to NaN so that
    # unrelated failures are no longer silently hidden.
    try:
        roc = roc_auc_score(y_eval, probs)
    except ValueError:
        roc = float("nan")
    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1, "roc_auc": roc}, probs, preds

def showQualitativeExamples(leftTexts, rightTexts, y_true, y_pred, probs, tokenizerObj, maxLen=256, numExamples=10):
    """Print decoded examples of wrong and right predictions.

    Args:
        leftTexts: Integer id sequences for the left side of each pair.
        rightTexts: Integer id sequences for the right side of each pair.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        probs: Predicted probabilities, printed with three decimals.
        tokenizerObj: Object exposing a ``word_index`` mapping of word -> id.
        maxLen: Accepted for interface compatibility; not used by this function.
        numExamples: Maximum number of examples printed per group.
    """
    # Invert the tokenizer vocabulary so ids can be turned back into words.
    idToWord = {}
    for word, wordId in tokenizerObj.word_index.items():
        idToWord[wordId] = word

    def decodeSeq(seq):
        # Skip id 0, map unknown ids to "<OOV>", and stop once more than
        # 30 words have been collected.
        words = []
        for tokenId in seq:
            if tokenId == 0:
                continue
            words.append(idToWord.get(tokenId, "<OOV>"))
            if len(words) > 30:
                break
        return " ".join(words)

    def printExamples(header, indices):
        # Shared printer for both groups; only the first numExamples are shown.
        print(header)
        for idx in indices[:numExamples]:
            print(f"\nLabel={y_true[idx]} Pred={y_pred[idx]} Prob={probs[idx]:.3f}")
            print("Left:", decodeSeq(leftTexts[idx]))
            print("Right:", decodeSeq(rightTexts[idx]))

    # Split example positions by whether the prediction matched the label.
    correctIdx, incorrectIdx = [], []
    for pos, (truth, guess) in enumerate(zip(y_true, y_pred)):
        if truth == guess:
            correctIdx.append(pos)
        else:
            incorrectIdx.append(pos)

    print("Total correct are", len(correctIdx), "Total incorrect are", len(incorrectIdx))
    printExamples("\n--- Examples of incorrect predictions ---", incorrectIdx)
    printExamples("\n--- Examples of correct predictions ---", correctIdx)


In [10]:
# Cell 7: train and evaluate Siamese BiLSTM
# This cell is building and training the Siamese BiLSTM model.

# Embedding vocabulary size: the number of known words plus one, capped at
# maxVocabSize. NOTE(review): the +1 presumably reserves id 0, which
# showQualitativeExamples skips as padding -- confirm against the tokenizer.
vocabSizeActual = min(maxVocabSize, len(tokenizer.word_index) + 1)
# `builder` is reused by the attention-encoder cell further down.
builder = ModelBuilder(vocabSize=vocabSizeActual, embeddingDim=embeddingDim, maxLen=maxSeqLen)
siameseModel = builder.buildSiameseBiLstm(lstmUnits=64, dropout=0.2)
siameseModel.summary()

# Train on the training pairs; fitModel holds out validationSplit of them and
# checkpoints the best epoch under /content/siameseBiLstm_best.h5.
historySiamese, bestPathSiamese = fitModel(siameseModel, X_train, y_train, modelName="siameseBiLstm", epochs=numEpochs)

# Evaluate on the held-out test pairs; probabilities and predictions are kept
# for the comparison cell.
metricsSiamese, probsSiamese, predsSiamese = evaluateModel(siameseModel, X_test, y_test)
print("Siamese BiLSTM metrics are:", metricsSiamese)




Epoch 1/12

Epoch 1: val_loss improved from inf to 0.01566, saving model to /content/siameseBiLstm_best.h5




1415/1415 - 73s - 52ms/step - accuracy: 0.9691 - loss: 0.0754 - val_accuracy: 0.9956 - val_loss: 0.0157
Epoch 2/12

Epoch 2: val_loss improved from 0.01566 to 0.01277, saving model to /content/siameseBiLstm_best.h5




1415/1415 - 69s - 48ms/step - accuracy: 0.9956 - loss: 0.0172 - val_accuracy: 0.9971 - val_loss: 0.0128
Epoch 3/12

Epoch 3: val_loss did not improve from 0.01277
1415/1415 - 67s - 47ms/step - accuracy: 0.9968 - loss: 0.0127 - val_accuracy: 0.9973 - val_loss: 0.0132
Epoch 4/12

Epoch 4: val_loss did not improve from 0.01277
1415/1415 - 69s - 49ms/step - accuracy: 0.9973 - loss: 0.0101 - val_accuracy: 0.9964 - val_loss: 0.0137
Epoch 5/12

Epoch 5: val_loss improved from 0.01277 to 0.00822, saving model to /content/siameseBiLstm_best.h5




1415/1415 - 67s - 48ms/step - accuracy: 0.9979 - loss: 0.0078 - val_accuracy: 0.9978 - val_loss: 0.0082
Epoch 6/12

Epoch 6: val_loss improved from 0.00822 to 0.00661, saving model to /content/siameseBiLstm_best.h5




1415/1415 - 67s - 47ms/step - accuracy: 0.9984 - loss: 0.0057 - val_accuracy: 0.9983 - val_loss: 0.0066
Epoch 7/12

Epoch 7: val_loss did not improve from 0.00661
1415/1415 - 67s - 47ms/step - accuracy: 0.9986 - loss: 0.0050 - val_accuracy: 0.9986 - val_loss: 0.0083
Epoch 8/12

Epoch 8: val_loss did not improve from 0.00661
1415/1415 - 66s - 47ms/step - accuracy: 0.9988 - loss: 0.0044 - val_accuracy: 0.9984 - val_loss: 0.0087
Epoch 9/12

Epoch 9: val_loss did not improve from 0.00661
1415/1415 - 82s - 58ms/step - accuracy: 0.9988 - loss: 0.0040 - val_accuracy: 0.9986 - val_loss: 0.0077
Epoch 9: early stopping
Restoring model weights from the end of the best epoch: 6.
Siamese BiLSTM metrics are: {'accuracy': 0.9982531274653443, 'precision': 0.9966857656443097, 'recall': 0.9998309478192269, 'f1': 0.9982558793743671, 'roc_auc': np.float64(0.9998996207030221)}


In [11]:
# Cell 8: train and evaluate Attention-based Encoder
# This cell is building and training the attention encoder model.

# Build the attention encoder with the `builder` created in the siamese cell,
# so both models use the same vocabulary size, embedding width and sequence length.
attentionModel = builder.buildAttentionEncoder(lstmUnits=64, dropout=0.2)
attentionModel.summary()

# Same training and evaluation procedure as for the siamese model, on the same
# train/test arrays, so the resulting metrics are directly comparable.
historyAttn, bestPathAttn = fitModel(attentionModel, X_train, y_train, modelName="attentionEncoder", epochs=numEpochs)
metricsAttn, probsAttn, predsAttn = evaluateModel(attentionModel, X_test, y_test)
print("Attention Encoder metrics are:", metricsAttn)




Epoch 1/12

Epoch 1: val_loss improved from inf to 0.02501, saving model to /content/attentionEncoder_best.h5




1415/1415 - 82s - 58ms/step - accuracy: 0.9395 - loss: 0.1300 - val_accuracy: 0.9953 - val_loss: 0.0250
Epoch 2/12

Epoch 2: val_loss improved from 0.02501 to 0.01834, saving model to /content/attentionEncoder_best.h5




1415/1415 - 75s - 53ms/step - accuracy: 0.9947 - loss: 0.0234 - val_accuracy: 0.9964 - val_loss: 0.0183
Epoch 3/12

Epoch 3: val_loss improved from 0.01834 to 0.01822, saving model to /content/attentionEncoder_best.h5




1415/1415 - 75s - 53ms/step - accuracy: 0.9961 - loss: 0.0183 - val_accuracy: 0.9960 - val_loss: 0.0182
Epoch 4/12

Epoch 4: val_loss improved from 0.01822 to 0.01496, saving model to /content/attentionEncoder_best.h5




1415/1415 - 82s - 58ms/step - accuracy: 0.9973 - loss: 0.0139 - val_accuracy: 0.9973 - val_loss: 0.0150
Epoch 5/12

Epoch 5: val_loss did not improve from 0.01496
1415/1415 - 75s - 53ms/step - accuracy: 0.9975 - loss: 0.0125 - val_accuracy: 0.9951 - val_loss: 0.0184
Epoch 6/12

Epoch 6: val_loss did not improve from 0.01496
1415/1415 - 74s - 53ms/step - accuracy: 0.9981 - loss: 0.0098 - val_accuracy: 0.9969 - val_loss: 0.0176
Epoch 7/12

Epoch 7: val_loss did not improve from 0.01496
1415/1415 - 74s - 53ms/step - accuracy: 0.9980 - loss: 0.0096 - val_accuracy: 0.9959 - val_loss: 0.0225
Epoch 7: early stopping
Restoring model weights from the end of the best epoch: 4.
Attention Encoder metrics are: {'accuracy': 0.9970979375633946, 'precision': 0.9950614512598911, 'recall': 0.9991547390961344, 'f1': 0.9971038942780824, 'roc_auc': np.float64(0.9981596848393327)}


In [12]:
# Cell 9: comparative results and qualitative examples
# This cell is printing comparative metrics side-by-side and showing qualitative examples.

print("=== Comparative Metrics ===")
print("Model\t\tAccuracy\tPrecision\tRecall\t\tF1\t\tROC-AUC")
# One table row per model, printed from the metrics dicts computed earlier.
metricRows = [
    ("SiameseBiLSTM\t{acc:.4f}\t{prec:.4f}\t\t{rec:.4f}\t\t{f1:.4f}\t{roc:.4f}", metricsSiamese),
    ("AttentionEnc\t{acc:.4f}\t{prec:.4f}\t\t{rec:.4f}\t\t{f1:.4f}\t{roc:.4f}", metricsAttn),
]
for rowTemplate, rowMetrics in metricRows:
    print(rowTemplate.format(
        acc=rowMetrics['accuracy'],
        prec=rowMetrics['precision'],
        rec=rowMetrics['recall'],
        f1=rowMetrics['f1'],
        roc=rowMetrics['roc_auc'],
    ))

# The model with the higher F1 is shown qualitatively; ties go to the siamese model.
betterModelName = "siamese" if metricsSiamese['f1'] >= metricsAttn['f1'] else "attention"
print("\nBetter model by F1 is", betterModelName)

# NOTE(review): leftPadAll is not read anywhere in the visible notebook; it is
# kept only in case a later cell depends on it.
leftPadAll = np.vstack([X_test[:,0]])
# The padded id sequences of the test pairs are what gets decoded back to words.
leftTestSeqs = X_test[:,0]
rightTestSeqs = X_test[:,1]

if betterModelName == "siamese":
    shownPreds, shownProbs = predsSiamese, probsSiamese
else:
    shownPreds, shownProbs = predsAttn, probsAttn
showQualitativeExamples(leftTestSeqs, rightTestSeqs, y_test, shownPreds, shownProbs, tokenizer, maxLen=maxSeqLen, numExamples=6)


=== Comparative Metrics ===
Model		Accuracy	Precision	Recall		F1		ROC-AUC
SiameseBiLSTM	0.9983	0.9967		0.9998		0.9983	0.9999
AttentionEnc	0.9971	0.9951		0.9992		0.9971	0.9982

Better model by F1 is siamese
Total correct are 35430 Total incorrect are 62

--- Examples of incorrect predictions ---

Label=0 Pred=1 Prob=0.767
Left: documents servicing system data tape
Right: support during the first sixty 60 days following the effective date the “initial period” oanda will provide licensee with up to two free hours of telephone support during <OOV> regular customer

Label=0 Pred=1 Prob=0.826
Left: investment company act none of the borrower or any subsidiary is required to register as an “investment company ” as defined in the investment company act
Right: investment company the company is not required to be registered as and is not an affiliate of and immediately following the closing will not be required to register as an “investment

Label=0 Pred=1 Prob=0.979
Left: reimbursement if the i