# **v.0.4**

# Text Cleaning Class

## setup nltk

In [8]:
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
#nltk.download('popular') # No need to download this.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Class Definition

In [9]:
from nltk.sem.logic import typecheck
import os, re, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint

class CleanText:
    """
    Text cleaning helper built on regular expressions and NLTK.

    With this class you can load raw text (from a string or a file), strip or
    thin out punctuation, blank out unwanted symbols, drop words that contain
    unwanted symbols, remove HTML tags and URLs, replace newlines, apply a
    custom regex, tokenize, filter stop words, or chain the main steps with
    ``run_pipeline``.

    Attributes:
        text:   raw input text, or ``None`` if nothing has been loaded.
        ctext:  most recent cleaned text; ``None`` until ``remove_punc`` (or
                ``run_pipeline``) has been called.
        tokens: most recent token list; ``None`` until ``tokenize`` has run.
    """
    # English stop words, built once when the class is defined
    # (requires the NLTK 'stopwords' corpus to be downloaded).
    STOPWORDS = set(stopwords.words('english'))

    # Hint printed by methods that need ``ctext`` to exist already.
    _NOT_CLEANED_MSG = "Before this method be sure remove_punc method is implemented on text"

    def __init__(self, text=None, file_path=None, encoding=None) -> None:
        """
        Args:
            text: raw text to clean. Takes precedence over ``file_path``.
            file_path: path of a text file to load when ``text`` is empty/None.
            encoding: encoding used to read ``file_path``; ``None`` keeps the
                platform default.
        """
        self.text = None
        if text:
            self.text = text
        elif file_path:
            self.load_data(file_path, encoding=encoding)
        self.tokens = None
        self.ctext = None

    # load data
    def load_data(self, filename=None, encoding=None):
        """Read ``filename`` into ``self.text`` and return the text.

        The file is opened in a ``with`` block so the handle is closed even if
        reading fails. ``encoding=None`` uses the platform default.
        """
        with open(filename, 'rt', encoding=encoding) as file:
            self.text = file.read()
        return self.text

    # split into words
    def tokenize(self, clean=False):
        """Tokenize with NLTK's ``word_tokenize``; store and return the tokens.

        Args:
            clean: when True tokenize the cleaned text (``ctext``), otherwise
                the raw text.

        Raises:
            Exception: if ``clean`` is True but no cleaned text exists yet.
        """
        if clean:
            if self.ctext is None:
                raise Exception("For clean tokens use 'remove_punc' method cleaning before tokenizing!")
            self.tokens = word_tokenize(self.ctext)
        else:
            self.tokens = word_tokenize(self.text)
        # Both branches return the list, so callers such as ``run_pipeline``
        # and ``remove_stopwords`` can rely on the return value.
        return self.tokens

    # remove punctuation from text
    def remove_punc(self, light=False):
        """Create ``ctext`` from the raw text and return it.

        Args:
            light: when True, every run of two or more punctuation/space
                characters is collapsed by ``my_replace`` (single punctuation
                marks are left alone). When False, every run of characters
                other than word characters, whitespace, ``-``, ``(`` and ``)``
                is replaced by one space.
        """
        if light:
            regex = r"[!\"#$%&|\'*+,\-.\/:;=?@\[\\\]^_`{|}~ ]{2,}"          # magic
            self.ctext = re.sub(regex, self.my_replace, self.text)
            return self.ctext

        self.ctext = re.sub(r'[^\w\d\s_\-()]+', ' ', self.text)
        return self.ctext

    def my_replace(self, match):
        """Replacement callback for the light mode of ``remove_punc``.

        Keeps the first character of the matched run and appends one space if
        the run contained a space anywhere.
        """
        run = match.group()
        return run[0] + (" " if " " in run else "")

    def remove_word(self, remove_punc_pattern):
        """Drop every whitespace-separated word of ``ctext`` that matches.

        Args:
            remove_punc_pattern: a regular expression (e.g. ``"[¤¥]"``); a word
                is dropped when the pattern is found anywhere in it. A
                non-string iterable is treated as literal substrings instead.

        Returns:
            The updated ``ctext``, or ``None`` (after printing a hint) when
            there is no cleaned text yet or the pattern is invalid.

        Note:
            The text is rebuilt with ``split()``/``join()``, so all whitespace
            runs, including newlines, collapse to single spaces.
        """
        if self.ctext is None:
            print(self._NOT_CLEANED_MSG)
            return None
        try:
            if isinstance(remove_punc_pattern, str):
                # Treat the argument as a regex, like keep_word does, so the
                # brackets of "[...]" are syntax rather than symbols to match.
                found = re.compile(remove_punc_pattern).search

                def unwanted(word):
                    return found(word) is not None
            else:
                phrases = list(remove_punc_pattern)

                def unwanted(word):
                    return any(phrase in word for phrase in phrases)

            self.ctext = ' '.join(word for word in self.ctext.split() if not unwanted(word))
        except (re.error, TypeError) as e:
            print(f"Invalid pattern for remove_word: {remove_punc_pattern!r}")
            print(e)
            return None
        return self.ctext

    def keep_word(self, keep_punc_pattern):
        """Replace every match of ``keep_punc_pattern`` in ``ctext`` by a space.

        Despite the name, the matched characters are removed; the words around
        them are what is kept (``"city|istanbul"`` becomes ``"city istanbul"``).

        Returns:
            The updated ``ctext``, or ``None`` (after printing a hint) when
            there is no cleaned text yet or the pattern is invalid.
        """
        if self.ctext is None:
            print(self._NOT_CLEANED_MSG)
            return None
        try:
            self.ctext = re.sub(keep_punc_pattern, " ", self.ctext)
        except (re.error, TypeError) as e:
            print(f"Invalid pattern for keep_word: {keep_punc_pattern!r}")
            print(e)
            return None
        return self.ctext

    # filter out stop words
    def remove_stopwords(self):
        """Return the tokens that are not English stop words.

        Tokenizes the raw text first if ``tokenize`` has not been called yet.
        ``self.tokens`` itself is left unfiltered.
        """
        if self.tokens is None:
            self.tokenize()
        # NLTK's English stop-word list is lower-case, so compare
        # case-insensitively; otherwise "I" or "It" would slip through.
        return [w for w in self.tokens if w.lower() not in self.STOPWORDS]

    #replace newline with space
    def replace_newline(self):
        """Return the cleaned text (or the raw text if none) with ``\\n`` -> space.

        The result is returned only; ``text`` and ``ctext`` are not modified.

        Raises:
            FileNotFoundError: if no text has been loaded.
        """
        if self.text:
            return re.sub("\n", " ", self.ctext if self.ctext else self.text)
        raise FileNotFoundError("load text before replacing newline chars")

    # Remove html and urls:
    def remove_html_urls(self):
        """Return the text with http(s) URLs and HTML tags removed.

        Works on ``ctext`` when it exists, otherwise on the raw text. The
        result is returned only; ``text`` and ``ctext`` are not modified.

        Note:
            ``remove_punc(light=True)`` collapses ``://`` to ``:``, so URLs are
            no longer recognizable afterwards; call this method on the raw
            text when URLs matter.
        """
        source = self.ctext if self.ctext is not None else self.text
        # Match a URL anywhere in the text and stop at the next whitespace, so
        # only the URL is removed and not the rest of its line.
        without_urls = re.sub(r'https?://\S+', '', source)
        return re.sub('<.*?>+', '', without_urls, flags=re.MULTILINE)

    # Apply custom function:
    def custom_clean(self, pattern, new_value=''):
        """Apply ``re.sub(pattern, new_value, ...)`` and return the result.

        The substitution is applied to (and stored in) ``ctext`` when cleaned
        text exists; otherwise it is applied to the raw ``text`` in place.
        """
        if self.ctext is None:
            self.text = re.sub(pattern, new_value, self.text)
            return self.text

        self.ctext = re.sub(pattern, new_value, self.ctext)
        return self.ctext

    # Pipeline for the all
    def run_pipeline(self, light=True, clean=True, keep="[|*]", remove="[¥‰¤]", lower=True, output=None):
        """Run ``remove_punc`` -> ``keep_word`` -> ``remove_word`` in order.

        Args:
            light: forwarded to ``remove_punc``.
            clean: forwarded to ``tokenize`` (only used for token output).
            keep: pattern forwarded to ``keep_word`` (matches become spaces).
            remove: pattern forwarded to ``remove_word`` (matching words dropped).
            lower: lower-case the returned tokens (only used for token output).
            output: ``"token"`` to return a token list; anything else returns
                the cleaned string.
        """
        self.ctext = self.remove_punc(light)
        self.ctext = self.keep_word(keep)
        self.ctext = self.remove_word(remove)
        if output == "token":
            self.tokens = self.tokenize(clean)
            if lower:
                return [w.lower() for w in self.tokens]
            return self.tokens

        #self.tokens = self.remove_stopwords()
        #text = self.replace_newline()
        return self.ctext

    def __str__(self):
        """Short summary: raw text length, token count and class name."""
        if self.text:
            # len() counts characters of the str, which the message labels "bytes".
            return f'Size of file:  {len(self.text)} bytes.\nNumber of tokens:  {len(self.tokens) if self.tokens else None}\nType:  {self.__class__.__name__}...'
        # __str__ must return a string; returning None raises TypeError in print().
        return f'{self.__class__.__name__}: no text loaded.'



# Operations

## **work on text**

In [39]:
# Sample input exercising the cleaner: repeated punctuation, the symbols ¤ and ¥,
# decimals, hyphenated identifiers, HTML tags and a URL. The backslash at the end
# of a line continues the string literal without inserting a newline.
text = "I like this city|istanbul very much!!! :\\ $$%%&& It is because it contains many good food!!!! dir¤¤¤ty wor¥¥d % Unfortunately,,,, I need to go back home tomorrow. . \
0.5666, 648mg, I had***** such.... high. never, lost hopes for_arc this! (good) <html> title </html> long number 0.45678908765431444 \
COVID-19 RO7496998 (AT-527) Non-Profit https://colab.research.google.com" 

In [40]:
# Wrap the sample string; ctext and tokens start as None until a cleaning or
# tokenizing method is called.
txt = CleanText(text)

In [41]:
# Compare the raw input with the result of the full (non-light) punctuation strip.
print(txt.text)                         # Original text
#print(txt.remove_punc(light=True))      # keep one punctuation and delete rest
print(txt.remove_punc(light=False))     # remove all punctuation except underscore, hyphen and parentheses

I like this city|istanbul very much!!! :\ $$%%&& It is because it contains many good food!!!! dir¤¤¤ty wor¥¥d % Unfortunately,,,, I need to go back home tomorrow. . 0.5666, 648mg, I had***** such.... high. never, lost hopes for_arc this! (good) <html> title </html> long number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https://colab.research.google.com
I like this city istanbul very much      It is because it contains many good food  dir ty wor d   Unfortunately  I need to go back home tomorrow    0 5666  648mg  I had  such  high  never  lost hopes for_arc this  (good)  html  title  html  long number 0 45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https colab research google com


In [42]:
# Light mode: each run of 2+ punctuation/space characters is reduced to its first
# character, plus one space if the run contained a space.
pprint(txt.remove_punc(light=True))  

('I like this city|istanbul very much! It is because it contains many good '
 'food! dir¤¤¤ty wor¥¥d  Unfortunately, I need to go back home tomorrow. '
 '0.5666, 648mg, I had* such. high. never, lost hopes for_arc this! (good) '
 '<html> title </html> long number 0.45678908765431444 COVID-19 RO7496998 '
 '(AT-527) Non-Profit https:colab.research.google.com')


In [43]:
# Drop every whitespace-separated word that contains ¤ or ¥ (operates on ctext).
pprint(txt.remove_word("[¤¥]"))

('I like this city|istanbul very much! It is because it contains many good '
 'food! Unfortunately, I need to go back home tomorrow. 0.5666, 648mg, I had* '
 'such. high. never, lost hopes for_arc this! (good) <html> title </html> long '
 'number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit '
 'https:colab.research.google.com')


In [44]:
# Despite its name, keep_word replaces each match of the pattern (here * and |)
# with a space, keeping the surrounding words.
pprint(txt.keep_word("[*|]"))

('I like this city istanbul very much! It is because it contains many good '
 'food! Unfortunately, I need to go back home tomorrow. 0.5666, 648mg, I had  '
 'such. high. never, lost hopes for_arc this! (good) <html> title </html> long '
 'number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit '
 'https:colab.research.google.com')


In [45]:
# Tokenize the cleaned text (ctext) with NLTK's word_tokenize and show the tokens.
pprint(txt.tokenize(clean=True))

['I',
 'like',
 'this',
 'city',
 'istanbul',
 'very',
 'much',
 '!',
 'It',
 'is',
 'because',
 'it',
 'contains',
 'many',
 'good',
 'food',
 '!',
 'Unfortunately',
 ',',
 'I',
 'need',
 'to',
 'go',
 'back',
 'home',
 'tomorrow',
 '.',
 '0.5666',
 ',',
 '648mg',
 ',',
 'I',
 'had',
 'such',
 '.',
 'high',
 '.',
 'never',
 ',',
 'lost',
 'hopes',
 'for_arc',
 'this',
 '!',
 '(',
 'good',
 ')',
 '<',
 'html',
 '>',
 'title',
 '<',
 '/html',
 '>',
 'long',
 'number',
 '0.45678908765431444',
 'COVID-19',
 'RO7496998',
 '(',
 'AT-527',
 ')',
 'Non-Profit',
 'https',
 ':',
 'colab.research.google.com']


In [38]:
# __str__ summarizes the raw text length, the current token count and the class name.
print(txt)

Size of file:  370 bytes.
Number of tokens:  66
Type:  CleanText...


#### using pipeline function
Using the pipeline method requires extra attention to the order of the steps, because each stage affects the stages that come after it; arrange the stages according to the needs of your model. Using the pipeline on very large files is not recommended, to avoid exhausting memory.

In [None]:
# Fresh instance for the pipeline demo, so earlier manual cleaning does not carry over.
clean_text = CleanText(text)

In [None]:
# Defaults: light punctuation collapse, then keep_word("[|*]") and
# remove_word("[¥‰¤]"); returns the cleaned string.
clean_text.run_pipeline()

'I like this city istanbul very much! It is because it contains many good food! Unfortunately, I need to go back home tomorrow. 0.5666, 648mg, I had such. high. never, lost hopes for_arc this! (good) <html> title </html> long number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https:colab.research.google.com'

In [None]:
# You may pass custom parameters to the pipeline: output="token" returns the
# token list instead of the cleaned string (lower-cased, since lower defaults to True).
clean_text.run_pipeline(output="token")

['i', 'like', 'this', 'city', 'istanbul', 'very', 'much', '!', 'it', 'is', 'because', 'it', 'contains', 'many', 'good', 'food', '!', 'unfortunately', ',', 'i', 'need', 'to', 'go', 'back', 'home', 'tomorrow', '.', '0.5666', ',', '648mg', ',', 'i', 'had', 'such', '.', 'high', '.', 'never', ',', 'lost', 'hopes', 'for_arc', 'this', '!', '(', 'good', ')', '<', 'html', '>', 'title', '<', '/html', '>', 'long', 'number', '0.45678908765431444', 'covid-19', 'ro7496998', '(', 'at-527', ')', 'non-profit', 'https', ':', 'colab.research.google.com']

In [None]:
# You may pass custom parameters to the pipeline: light=False strips punctuation
# outright and lower=False keeps the original case of the returned tokens.
clean_text.run_pipeline(light=False, lower=False, output="token")

['I', 'like', 'this', 'city', 'istanbul', 'very', 'much', 'It', 'is', 'because', 'it', 'contains', 'many', 'good', 'food', 'dir', 'ty', 'wor', 'd', 'Unfortunately', 'I', 'need', 'to', 'go', 'back', 'home', 'tomorrow', '0', '5666', '648mg', 'I', 'had', 'such', 'high', 'never', 'lost', 'hopes', 'for_arc', 'this', '(', 'good', ')', 'html', 'title', 'html', 'long', 'number', '0', '45678908765431444', 'COVID-19', 'RO7496998', '(', 'AT-527', ')', 'Non-Profit', 'https', 'colab', 'research', 'google', 'com']

In [None]:
# Full punctuation strip, returned as one cleaned string (output is not "token").
clean_text.run_pipeline(light=False)

'I like this city istanbul very much It is because it contains many good food dir ty wor d Unfortunately I need to go back home tomorrow 0 5666 648mg I had such high never lost hopes for_arc this (good) html title html long number 0 45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https colab research google com'

## work on file

In [26]:
# Load the raw text from disk (the constructor calls load_data when only file_path
# is given). NOTE(review): absolute path from the original session — adjust as needed.
file = CleanText(file_path="/content/clinical-trials-sample.txt")

In [27]:
# Inspect the raw, uncleaned file contents.
pprint(file.text)

('Study List:                     I like this city very much!!! :\\\\ $$%%&& '
 'It is because it contains many good food!!!! $ % Unfortunately,,,, I need to '
 'go back home tomorrow. .\n'
 '0.5666, 648mg, I had***** such.... high. never, lost hopes for_arc this! '
 '(good) <html> title </html> long number 0.45678908765431444 \\\n'
 'COVID-19 RO7496998 (AT-527) Non-Profit https://colab.research.google.com\n'
 '\n'
 'Study 1:\n'
 '  Title:                        Study to Evaluate the Effects of RO7496998 '
 '(AT-527) in Non-Hospitalized Adult and Adolescent Participants With Mild or '
 'Moderate COVID-19\n'
 '  Status:                       Terminated\n'
 '  Study Results:                No Results Available\n'
 '  Conditions:                   COVID-19\n'
 '  Interventions:                Drug: RO7496998|Drug: Placebo\n'
 '  Locations:                    Instituto Medico Rio Cuarto, Cordoba, '
 'Argentina|Instituto Ave Pulmo, Mar Del Plata, Argentina|Clínica '
 'Independencia, Munro, 

In [28]:
# Collapse punctuation runs first (this creates ctext), then blank out the
# remaining |, *, : and ! characters by replacing them with spaces.
file.remove_punc(light=True)
pprint(file.keep_word("[|*:!]"))

('Study List  I like this city very much  It is because it contains many good '
 'food  Unfortunately, I need to go back home tomorrow. \n'
 '0.5666, 648mg, I had  such. high. never, lost hopes for_arc this  (good) '
 '<html> title </html> long number 0.45678908765431444  \n'
 'COVID-19 RO7496998 (AT-527) Non-Profit https colab.research.google.com\n'
 '\n'
 'Study 1 \n'
 '  Title  Study to Evaluate the Effects of RO7496998 (AT-527) in '
 'Non-Hospitalized Adult and Adolescent Participants With Mild or Moderate '
 'COVID-19\n'
 '  Status  Terminated\n'
 '  Study Results  No Results Available\n'
 '  Conditions  COVID-19\n'
 '  Interventions  Drug  RO7496998 Drug  Placebo\n'
 '  Locations  Instituto Medico Rio Cuarto, Cordoba, Argentina Instituto Ave '
 'Pulmo, Mar Del Plata, Argentina Clínica Independencia, Munro, Argentina '
 'Instituto Medico de la Fundacion Estudios Clinicos, Rosario, Argentina '
 'Clinica Mayo de U.M.C.B. S.R.L, San Miguel de Tucumán, Argentina Sanatorio '
 'Medico d

In [29]:
# Tokenize the cleaned file text (ctext) produced by the previous cell.
pprint(file.tokenize(clean=True))

['Study',
 'List',
 'I',
 'like',
 'this',
 'city',
 'very',
 'much',
 'It',
 'is',
 'because',
 'it',
 'contains',
 'many',
 'good',
 'food',
 'Unfortunately',
 ',',
 'I',
 'need',
 'to',
 'go',
 'back',
 'home',
 'tomorrow',
 '.',
 '0.5666',
 ',',
 '648mg',
 ',',
 'I',
 'had',
 'such',
 '.',
 'high',
 '.',
 'never',
 ',',
 'lost',
 'hopes',
 'for_arc',
 'this',
 '(',
 'good',
 ')',
 '<',
 'html',
 '>',
 'title',
 '<',
 '/html',
 '>',
 'long',
 'number',
 '0.45678908765431444',
 'COVID-19',
 'RO7496998',
 '(',
 'AT-527',
 ')',
 'Non-Profit',
 'https',
 'colab.research.google.com',
 'Study',
 '1',
 'Title',
 'Study',
 'to',
 'Evaluate',
 'the',
 'Effects',
 'of',
 'RO7496998',
 '(',
 'AT-527',
 ')',
 'in',
 'Non-Hospitalized',
 'Adult',
 'and',
 'Adolescent',
 'Participants',
 'With',
 'Mild',
 'or',
 'Moderate',
 'COVID-19',
 'Status',
 'Terminated',
 'Study',
 'Results',
 'No',
 'Results',
 'Available',
 'Conditions',
 'COVID-19',
 'Interventions',
 'Drug',
 'RO7496998',
 'Drug',
 'P

## work on big text


 **File Size : 1.12GB**  
**Cleaning Time:5min.**  

In [None]:
%%time
with open("/content/drive/MyDrive/Copy of clinicalTrialTokenizedByNltkPart1.txt", "r", encoding="utf8") as r_file:
    with open("/content/drive/MyDrive/Cleaned_clinicalTrial.txt", "w+", encoding="utf8") as w_file:
        for line in r_file:
            cline = CleanText(text=line)
            cline.remove_punc(light=True)
            cline.keep_word("[|*:!]")
            cline.remove_word("[¤¥]")
            w_file.write(cline.ctext)
            

CPU times: user 4min 21s, sys: 5.18 s, total: 4min 26s
Wall time: 4min 48s


In [None]:
# Preview the beginning of the source file. The file is opened in a `with` block
# so the handle is closed once the preview is done.
with open("/content/drive/MyDrive/Copy of clinicalTrialTokenizedByNltkPart1.txt", "r", encoding="utf8") as r_file:
    for i, line in enumerate(r_file):
        print(line)
        # Stops after index 100, i.e. 101 lines are printed.
        if i == 100:
            break