In this notebook we expand contractions in our datasets and normalize the text before sending it to an LLM for summarization and NER tasks.

#### **Preprocessing**:

In [1]:
import os
import re
import nltk
import spacy

from nltk.tokenize import sent_tokenize
from typing import List, Dict, Any, Set

##### **Handle contractions**

In [2]:
# average token length of bankrupt company files is 3345 & healthy company files is 5975

# can remove contraction with the help of dictionary mapping
# do this before removing punctuations 
# Maps each contraction to its expanded form.
# NOTE(review): matching is case-sensitive ("Can't" is not expanded), and the "'s" entry
# also rewrites possessives ("company's" -> "company is") - confirm this is acceptable
# for the downstream summarization / NER tasks.
contractions_dict = {
    "ain't": "are not", "'s":" is", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "that'd": "that would", "that'd've": "that would have", "there'd": "there would", "there'd've": "there would have", "they'd": "they would", "they'd've": "they would have","they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not","what'll": "what will", "what'll've": "what will have", "what're": "what are", "what've": "what have", "when've": "when have", "where'd": "where did", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who've": "who have", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
    "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
 }

# Groups the contractions into one regular expression like "(can't've|can't|won't)".
# Regex alternation is ordered: the first alternative that matches wins. Keys are
# therefore sorted longest-first; in plain dict order "can't" would always match before
# "can't've" and leave a dangling "'ve" in the text. re.escape keeps every key literal.
contractions_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)
    )
)

def expand_contractions(text: str, contractions_dict: Dict[str, str] = contractions_dict) -> str:
    """
    Expand contractions in the text.

    Every occurrence of a key of ``contractions_dict`` in ``text`` is replaced by
    the corresponding value. Matching is case-sensitive and purely textual (no
    word-boundary check).

    Args:
        text: the text to process.
        contractions_dict: mapping from contraction to expansion; defaults to the
            module-level table.

    Returns:
        The text with all known contractions expanded. An empty mapping returns
        the text unchanged.
    """
    if not contractions_dict:
        # An empty alternation "()" would match the empty string at every position.
        return text

    # Build the pattern from the mapping that was actually passed in, so a custom
    # mapping is honoured instead of being matched against the default keys.
    # Longest keys first: alternation picks the first alternative that matches, and
    # "can't" must not pre-empt "can't've".
    ordered_keys = sorted(contractions_dict, key=len, reverse=True)
    pattern = re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

##### **Normalize the data**

In [3]:
# Download the NLTK resources for this notebook; quiet=True is passed to keep the cell output short.
# 'punkt' is presumably the sentence-tokenizer model behind sent_tokenize (used in process_data) - TODO confirm.
# NOTE(review): 'stopwords' and 'wordnet' are not referenced by the cells shown here - confirm they are still needed.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' for sent_tokenize - verify against the installed version.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

Handle contractions first, then tokenize the text into sentences.

In [9]:
# can create chunks of text for each line in the file and save it

# bankrupt_files_path = r'Dataset\Final Dataset\Bankrupt'
# healthy_files_path = r'Dataset\Final Dataset\Healthy'

# write a function which can take token_size as input and create chunks of text for the document
def process_data(file_path: str, token_size: int = 2000, generate_chunks: bool = False) -> int:
    """
    Normalize a text file in place and optionally split it into chunks.

    The file is read, contractions are expanded, the text is sentence-tokenized and
    all whitespace (including newlines) is collapsed, giving a flat list of
    whitespace-separated tokens. The result is written back to the same path: as a
    single line, or - when ``generate_chunks`` is true - as one chunk of at most
    ``token_size`` tokens per line.

    Args:
        file_path: path of the UTF-8 text file to rewrite.
        token_size: maximum number of whitespace-separated tokens per chunk; only
            used when ``generate_chunks`` is true.
        generate_chunks: write one chunk per line instead of a single line.

    Returns:
        0 if the file was rewritten, 1 if it holds 50 tokens or fewer. In the
        latter case the file is left untouched; deleting it is up to the caller.

    Raises:
        ValueError: if chunking is requested with ``token_size`` smaller than 1.
    """
    # Validate before touching the file: a negative step makes range() empty, which
    # would overwrite the file with an empty string.
    if generate_chunks and token_size < 1:
        raise ValueError("token_size must be a positive integer")

    # Read everything up front so the read handle is closed before the same path is
    # reopened for writing.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # not going to remove stopwords as they can change the meaning!
    text = expand_contractions(text)
    flattened = " ".join(sentence.replace('\n', ' ') for sentence in sent_tokenize(text))
    # whitespace-separated words are used as the tokens
    tokens = flattened.split()

    if len(tokens) <= 50:
        print("removed invalid file")
        return 1

    if generate_chunks:
        # one chunk per output line
        output = "\n".join(
            " ".join(tokens[start:start + token_size])
            for start in range(0, len(tokens), token_size)
        )
    else:
        output = " ".join(tokens)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(output)
    return 0

###### Run this only **once** — it rewrites the dataset files in place and deletes invalid healthy files

In [10]:
# Built with os.path.join so the paths also resolve on non-Windows systems; on Windows
# the resulting strings are identical to r'Dataset\Phase-II\Bankrupt' / r'Dataset\Phase-II\Healthy'.
bankrupt_files_path = os.path.join('Dataset', 'Phase-II', 'Bankrupt')
healthy_files_path = os.path.join('Dataset', 'Phase-II', 'Healthy')

# Normalize and chunk every bankrupt-company file in place.
# NOTE(review): the status returned by process_data is ignored here, so bankrupt files
# flagged as invalid stay on disk - confirm that this asymmetry is intended.
for file_name in os.listdir(bankrupt_files_path):
    process_data(os.path.join(bankrupt_files_path, file_name), generate_chunks=True)

# Same for the healthy-company files, but delete the ones flagged as invalid (status 1).
for file_name in os.listdir(healthy_files_path):
    healthy_file = os.path.join(healthy_files_path, file_name)
    flag = process_data(healthy_file, generate_chunks=True)
    if flag == 1:
        os.remove(healthy_file)

In [None]:
# Sanity check of the chunking logic on a single sample file (5000 tokens per chunk)
sample_file = r'AMTEKAUTO_2019_MDA.txt'
with open(sample_file, 'r', encoding='utf-8') as file:
    text = file.read()

# whitespace-separated words are the tokens
sentences = text.split()
print(len(sentences))

# slice the token list into consecutive windows and show one chunk per line
chunks = [
    " ".join(sentences[start:start + 5000])
    for start in range(0, len(sentences), 5000)
]
print('\n'.join(chunks))

# print(len(chunks))   