# The goal of this project is to build a CNN for textual data and use it to classify text documents. The documents are mostly from the financial domain, e.g. various loan contracts and derivative contracts such as currency swaps.

In [None]:
import tensorflow as tf
import numpy as np


In [None]:
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


# Preprocess files

In [None]:
# Default input/output locations: one interest-rate-swap contract and the
# cleaned copy written next to it.
_DEFAULT_CONTRACT_PATH = '/drive/MyDrive/Colab Notebooks/contracts/train/interest-rate-swap/interest-rate-swap-04.txt'
_DEFAULT_CLEANED_PATH = '/drive/MyDrive/Colab Notebooks/contracts/train/interest-rate-swap/interest-rate-swap-tmp-04.txt'

# Upper-cased line prefixes that mark structural boilerplate. "EXHIBIT" and
# "SCHEDULE" also cover their plural forms.
_BOILERPLATE_PREFIXES = (
    "TABLE OF CONTENTS",
    "PAGE",
    "ARTICLE",
    "SECTION",
    "EXHIBIT",
    "SCHEDULE",
)


def _is_boilerplate(line):
    """Return True if an upper-cased, stripped line should be dropped."""
    return (
        len(line) <= 3                            # blank or near-empty line
        or "_" in line                            # any line containing an underscore
        or line.startswith(_BOILERPLATE_PREFIXES)
    )


def process_temp_file(train_dir=_DEFAULT_CONTRACT_PATH, tmp_dir=_DEFAULT_CLEANED_PATH):
    """Write a cleaned, upper-cased copy of a contract text file.

    Each input line is upper-cased and stripped. Lines are dropped when they
    start with a structural heading (table of contents, page, article,
    section, exhibit, schedule), contain an underscore, or have three
    characters or fewer. Every surviving line is written followed by a
    newline.

    Args:
        train_dir: Path of the UTF-8 text file to read. Defaults to the
            original hard-coded interest-rate-swap contract.
        tmp_dir: Path of the UTF-8 text file to (over)write. Defaults to the
            original hard-coded temporary file.

    Raises:
        FileNotFoundError: If ``train_dir`` does not exist. The input is
            opened first, so a missing input does not create or truncate
            ``tmp_dir``.
    """
    with open(train_dir, 'r', encoding="utf-8") as fread, \
            open(tmp_dir, 'w', encoding="utf-8") as fwrite:
        for raw_line in fread:
            line = raw_line.upper().strip()
            if not _is_boilerplate(line):
                fwrite.write(line)
                fwrite.write('\n')


In [None]:
# Run the contract clean-up pass defined above.
process_temp_file()

FileNotFoundError: ignored

In [None]:
# Mini-batch size shared by all three datasets.
batch_size = 32
train_dir = '/drive/MyDrive/Colab Notebooks/contracts/train'
test_dir = '/drive/MyDrive/Colab Notebooks/contracts/test'

# The training and validation subsets are both read from `train_dir`, using
# the same batch size, split fraction and seed; only `subset` differs.
_split_options = dict(batch_size=batch_size, validation_split=0.3, seed=1337)

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset="training", **_split_options
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset="validation", **_split_options
)
# The test documents live in their own directory and are not split.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size,
)

# Report how many batches each dataset yields.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Found 14 files belonging to 3 classes.
Using 10 files for training.
Found 14 files belonging to 3 classes.
Using 4 files for validation.
Found 8 files belonging to 3 classes.
Number of batches in raw_train_ds: 1
Number of batches in raw_val_ds: 1
Number of batches in raw_test_ds: 1


In [None]:
from tensorflow.keras.layers import TextVectorization
import string
import re

def custom_standardization(input_data):
    """Normalize raw document text before tokenization.

    The text is lower-cased, a fixed sequence of regex substitutions is
    applied in order, and finally every ASCII punctuation character listed in
    ``string.punctuation`` is removed.

    NOTE(review): the third pattern looks intended to drop the UTF-8 bytes of
    a left double quotation mark; confirm it matches as intended under the
    regex engine's encoding mode.
    """
    text = tf.strings.lower(input_data)

    # (pattern, replacement) pairs, applied one after another.
    substitutions = (
        ("<br />", " "),
        ("\\r\\n", " "),
        ("\\xe2\\x80\\x9c", ""),
    )
    for pattern, replacement in substitutions:
        text = tf.strings.regex_replace(text, pattern, replacement)

    punctuation_class = f"[{re.escape(string.punctuation)}]"
    return tf.strings.regex_replace(text, punctuation_class, "")

# Vocabulary size cap, embedding width, and fixed number of tokens kept per
# document.
max_features = 20000
embedding_dim = 512
sequence_length = 2500

# Maps raw strings to fixed-length sequences of integer token ids, using the
# custom standardizer defined in this cell.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Build the vocabulary from the training texts only (labels are dropped).
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)




In [None]:
def vectorize_text(text, label):
    """Return ``(token_ids, label)`` for a text tensor and its label.

    A trailing axis is appended to ``text`` before it is passed through
    ``vectorize_layer``; ``label`` is returned unchanged.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize each raw dataset, cache the vectorized result, and prefetch up to
# 10 elements asynchronously so input preparation can overlap with training.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)


In [None]:
# Retrieve one batch (at most `batch_size` documents and their labels) from
# the raw test dataset and inspect its first document.
text_batch, label_batch = next(iter(raw_test_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
# Resolve the label name on the dataset the sample was drawn from, so the name
# stays correct even if the test directory's class list differs from the
# training directory's.
print("Label", raw_test_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'FIRSTSECOND AMENDMENT TO CREDIT AGREEMENT\nCREDIT AGREEMENT\nAMONG\nRYAN SPECIALTY GROUP, LLC,\nAS BORROWER,\nTHE GUARANTORS FROM TIME TO TIME PARTY HERETO,\nTHE SEVERAL LENDERS FROM TIME TO TIME PARTY HERETO,\nJPMORGAN CHASE BANK, N.A.,\nAS ADMINISTRATIVE AGENT\nDATED AS OF SEPTEMBER 1, 2020,\nAS AMENDED BY THE FIRST AMENDMENT, DATED AS OF MARCH 30, 2021,\nAND AS FURTHER AMENDED BY THE SECOND AMENDMENT, DATED AS OF JULY 26, 2021\nJPMORGAN CHASE BANK, N.A.,\nBARCLAYS BANK PLC,\nBMO CAPITAL MARKETS CORP.\nWELLS FARGO SECURITIES, LLC,\nGOLDMAN SACHS BANK USA,\nBANK OF MONTREAL,\nAS JOINT LEAD ARRANGERS AND JOINT BOOKRUNNERS\nROYAL BANK OF CANADA\nCAPITAL ONE, N.A.\nUBS SECURITIES LLC,\nAS CO-SYNDICATION AGENTS\n1.1\t \tDEFINED TERMS\t  \t \t1\n1.2\t \tOTHER INTERPRETIVE PROVISIONS\t  \t \t865\n1.3\t \tACCOUNTING\t  \t \t876\n1.4\t \tLIMITED CONDITION TRANSACTIONS\t  \t \t887\n1.5\t \tFINANCIAL RATIO CALCULATIONS\t  \t \t898\n1.6\t \tCURRENCY EQUIVALENTS GENERALLY\t  \t

# Build the model

In [None]:
from tensorflow.keras import layers

# Number of target classes, taken from the training dataset's class list so
# the output layer always matches the labels.
num_classes = len(raw_train_ds.class_names)

# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto one output unit per class and normalize with a softmax, which
# yields a probability distribution over the classes. (A softmax over a single
# unit would always output 1.0, regardless of the input.)
predictions = layers.Dense(num_classes, activation="softmax", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# The datasets provide integer class indices (the default label mode of
# text_dataset_from_directory), so use the sparse categorical crossentropy
# loss, together with an adam optimizer.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 512)         10240000  
                                                                 
 dropout (Dropout)           (None, None, 512)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 128)         458880    
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         114816    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                             

# Train the model

In [None]:
# Number of passes over the training dataset.
epochs = 5

# Fit the model on the training dataset, using the validation dataset
# (not the test dataset) for validation.
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0f9d759840>

# Evaluate the model

In [None]:
# Compute the loss and the compiled accuracy metric on the test dataset.
model.evaluate(test_ds)



[0.0, 0.75]

# Predict with the model

In [None]:
# Run inference over the test dataset and print the raw model outputs.
prediction = model.predict(test_ds)
print(prediction)


[[0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]]
