# v.0.4

In [2]:
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
#nltk.download('popular')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.sem.logic import typecheck
import os, re, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint

class CleanText:
    """
    Text cleaning helper for a raw string or the contents of a text file.

    The instance keeps three pieces of state:

    * ``text``   - the raw input (never modified by the cleaning methods),
    * ``ctext``  - the cleaned text produced by ``remove_punc`` and then refined
                   by ``keep_word`` / ``remove_word``,
    * ``tokens`` - the result of the last ``tokenize`` call.

    Available steps: load raw text, collapse or remove punctuation, split words
    that are glued together by chosen punctuation, drop words that contain
    unwanted characters, tokenize, filter stop words, strip html tags and urls,
    replace newlines with spaces, apply a custom regex, or run the default
    pipeline.
    """
    STOPWORDS = set(stopwords.words('english'))

    # Runs of two or more characters from this punctuation/space set ("light" cleaning).
    _LIGHT_PUNC_RE = re.compile(r"[!\"#$%&|\'*+,\-.\/:;=?@\[\\\]^_`{|}~ ]{2,}")
    # Runs of characters that are not word characters, whitespace, hyphen or parentheses.
    _HEAVY_PUNC_RE = re.compile(r'[^\w\d\s_\-()]+')
    _HTML_TAG_RE = re.compile(r'<.*?>+')
    _URL_RE = re.compile(r'https?://\S+')

    _NEEDS_REMOVE_PUNC = "Before this method be sure remove_punc method is implemented on text"

    def __init__(self, text=None, file_path=None) -> None:
        """
        Args:
            text: raw text to clean. Takes precedence over ``file_path``.
            file_path: path of a text file to load when ``text`` is empty.
        """
        self.text = None
        self.tokens = None
        self.ctext = None
        if text:
            self.text = text
        elif file_path:
            self.load_data(file_path)

    # load data
    def load_data(self, filename=None, encoding="utf-8"):
        """
        Read ``filename`` into ``self.text`` and return it.

        Args:
            filename: path of the text file to read.
            encoding: text encoding used to decode the file.
        """
        # The context manager closes the file even if read() raises.
        with open(filename, 'rt', encoding=encoding) as handle:
            self.text = handle.read()
        return self.text

    # split into words
    def tokenize(self, clean=False):
        """
        Tokenize with ``nltk.word_tokenize``, store the result in ``self.tokens``
        and return it.

        Args:
            clean: tokenize the cleaned text (``ctext``) instead of the raw text.

        Raises:
            ValueError: ``clean`` is true but no cleaned text exists yet.
        """
        if clean:
            if self.ctext is None:
                raise ValueError("For clean tokens use 'remove_punc' method cleaning before tokenizing!")
            source = self.ctext
        else:
            source = self.text

        self.tokens = word_tokenize(source)
        # Always return the tokens: callers such as run_pipeline assign the result.
        return self.tokens

    # remove punctuation from text
    def remove_punc(self, light=False):
        """
        Build ``self.ctext`` from the raw text and return it.

        Args:
            light: when true, every run of two or more punctuation/space
                characters is collapsed to its first character (see
                ``my_replace``). When false, every run of characters other than
                word characters, whitespace, ``-``, ``(`` and ``)`` is replaced
                by one space.
        """
        if light:
            self.ctext = self._LIGHT_PUNC_RE.sub(self.my_replace, self.text)
        else:
            self.ctext = self._HEAVY_PUNC_RE.sub(' ', self.text)
        return self.ctext

    def my_replace(self, match):
        """Return the first character of the matched run, plus one space if the run contained a space."""
        run = match.group()
        return run[0] + (" " if " " in run else "")

    def remove_word(self, remove_punc_pattern):
        """
        Drop every whitespace-separated word of ``ctext`` that contains an
        unwanted character, re-join the rest with single spaces and return it.

        Args:
            remove_punc_pattern: a regular expression (for example ``"[¤¥]"``);
                a word is dropped when the expression matches anywhere in it.
                Any other iterable is treated as a collection of substrings.

        Returns:
            The updated ``ctext``, or ``None`` (after printing a hint) when
            ``remove_punc`` has not been run yet.

        Raises:
            re.error: the string pattern is not a valid regular expression.
        """
        if self.ctext is None:
            print(self._NEEDS_REMOVE_PUNC)
            return None

        if isinstance(remove_punc_pattern, str):
            # Treat the string as a regex; iterating over its characters would
            # turn the brackets of "[¤¥]" into deletion markers as well.
            is_unwanted = re.compile(remove_punc_pattern).search
        else:
            phrases = tuple(remove_punc_pattern)

            def is_unwanted(word):
                return any(phrase in word for phrase in phrases)

        self.ctext = ' '.join(word for word in self.ctext.split() if not is_unwanted(word))
        return self.ctext

    def keep_word(self, keep_punc_pattern):
        """
        Replace every match of ``keep_punc_pattern`` in ``ctext`` with a space,
        so words joined by those characters (``city|istanbul``) are kept as
        separate words.

        Returns:
            The updated ``ctext``, or ``None`` (after printing a hint) when
            ``remove_punc`` has not been run yet.

        Raises:
            re.error: the pattern is not a valid regular expression.
        """
        if self.ctext is None:
            print(self._NEEDS_REMOVE_PUNC)
            return None

        self.ctext = re.sub(keep_punc_pattern, " ", self.ctext)
        return self.ctext

    # filter out stop words
    def remove_stopwords(self):
        """
        Return the tokens that are not in ``STOPWORDS``.

        The raw text is tokenized first when no tokens exist yet. The comparison
        is case-sensitive and ``self.tokens`` itself is left unchanged.
        """
        if self.tokens is None:
            self.tokenize()
        return [w for w in self.tokens if w not in self.STOPWORDS]

    #replace newline with space
    def replace_newline(self):
        """
        Return the cleaned text (or the raw text when nothing was cleaned yet)
        with every newline replaced by a space.

        Raises:
            FileNotFoundError: no text has been loaded.
        """
        if not self.text:
            raise FileNotFoundError("load text before replacing newline chars")
        source = self.ctext if self.ctext else self.text
        return source.replace("\n", " ")

    # Remove html and urls:
    def remove_html_urls(self):
        """
        Return the raw text without html tags and without http(s) urls.

        Tags are stripped first so that a url inside a tag attribute cannot
        break the tag apart; urls are then removed wherever they occur in a
        line, not only at the start of it.
        """
        without_tags = self._HTML_TAG_RE.sub('', self.text)
        return self._URL_RE.sub('', without_tags)

    # Apply custom function:
    def custom_clean(self, pattern, new_value=''):
        """Return the raw text with every match of ``pattern`` replaced by ``new_value``."""
        return re.sub(pattern, new_value, self.text)

    # Pipeline for the all
    def run_pipeline(self, light=True, clean=True, keep="[|*]", remove="[¥‰¤]", lower=True, output=None):
        """
        Run ``remove_punc`` -> ``keep_word`` -> ``remove_word`` and return the result.

        Args:
            light: forwarded to ``remove_punc``.
            clean: forwarded to ``tokenize`` (only used for token output).
            keep: regex forwarded to ``keep_word``.
            remove: regex forwarded to ``remove_word``.
            lower: lowercase the returned tokens (only used for token output).
            output: ``"token"`` returns a list of tokens; anything else returns
                the cleaned text.
        """
        # Each step updates self.ctext in place, so the return values are not needed.
        self.remove_punc(light)
        self.keep_word(keep)
        self.remove_word(remove)

        if output == "token":
            tokens = self.tokenize(clean)
            return [w.lower() for w in tokens] if lower else tokens

        return self.ctext

    def __str__(self):
        """Return a short summary; always a string, as required for ``__str__``."""
        if not self.text:
            return f'{self.__class__.__name__}: no text loaded.'
        token_count = len(self.tokens) if self.tokens else None
        return f'Size of file:  {len(self.text)} bytes.\nNumber of tokens:  {token_count}\nType:  {self.__class__.__name__}...'



## work on text

In [69]:
text = "I like this city|istanbul very much!!! :\\ $$%%&& It is because it contains many good food!!!! dir¤¤¤ty wor¥¥d % Unfortunately,,,, I need to go back home tomorrow. . \
0.5666, 648mg, I had***** such.... high. never, lost hopes for_arc this! (good) <html> title </html> long number 0.45678908765431444 \
COVID-19 RO7496998 (AT-527) Non-Profit https://colab.research.google.com" 

In [20]:
txt = CleanText(text)

In [21]:
print(txt.text)                         # Original text
#print(txt.remove_punc(light=True))      # keep one punctuation and delete rest
print(txt.remove_punc(light=False))     # remove all punc except underscore and hypen

I like this city|istanbul very much!!! :\ $$%%&& It is because it contains many good food!!!! dir¤¤¤ty wor¥¥d % Unfortunately,,,, I need to go back home tomorrow. . 0.5666, 648mg, I had***** such.... high. never, lost hopes for_arc this! (good) <html> title </html> long number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https://colab.research.google.com
I like this cityistanbul very much   It is because it contains many good food dirty word  Unfortunately I need to go back home tomorrow  05666 648mg I had such high never lost hopes for_arc this (good) html title html long number 045678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit httpscolabresearchgooglecom


In [22]:
pprint(txt.remove_punc(light=True))  

('I like this city|istanbul very much! It is because it contains many good '
 'food! dir¤¤¤ty wor¥¥d  Unfortunately, I need to go back home tomorrow. '
 '0.5666, 648mg, I had* such. high. never, lost hopes for_arc this! (good) '
 '<html> title </html> long number 0.45678908765431444 COVID-19 RO7496998 '
 '(AT-527) Non-Profit https:colab.research.google.com')


In [23]:
pprint(txt.remove_word("[¤¥]"))

('I like this city|istanbul very much! It is because it contains many good '
 'food! Unfortunately, I need to go back home tomorrow. 0.5666, 648mg, I had* '
 'such. high. never, lost hopes for_arc this! (good) <html> title </html> long '
 'number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit '
 'https:colab.research.google.com')


In [24]:
pprint(txt.keep_word("[*|]"))

('I like this city istanbul very much! It is because it contains many good '
 'food! Unfortunately, I need to go back home tomorrow. 0.5666, 648mg, I had  '
 'such. high. never, lost hopes for_arc this! (good) <html> title </html> long '
 'number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit '
 'https:colab.research.google.com')


In [25]:
pprint(txt.tokenize(clean=True))

['I',
 'like',
 'this',
 'city',
 'istanbul',
 'very',
 'much',
 '!',
 'It',
 'is',
 'because',
 'it',
 'contains',
 'many',
 'good',
 'food',
 '!',
 'Unfortunately',
 ',',
 'I',
 'need',
 'to',
 'go',
 'back',
 'home',
 'tomorrow',
 '.',
 '0.5666',
 ',',
 '648mg',
 ',',
 'I',
 'had',
 'such',
 '.',
 'high',
 '.',
 'never',
 ',',
 'lost',
 'hopes',
 'for_arc',
 'this',
 '!',
 '(',
 'good',
 ')',
 '<',
 'html',
 '>',
 'title',
 '<',
 '/html',
 '>',
 'long',
 'number',
 '0.45678908765431444',
 'COVID-19',
 'RO7496998',
 '(',
 'AT-527',
 ')',
 'Non-Profit',
 'https',
 ':',
 'colab.research.google.com']


#### using pipeline function
Using the pipeline method requires attention to the order of its stages, because each stage affects the ones that come after it; for that reason the stages should be arranged according to what the model needs. It is also recommended not to use the pipeline on huge files, to avoid overloading the memory.

In [70]:
clean_text = CleanText(text)

In [64]:
clean_text.run_pipeline()

'I like this city istanbul very much! It is because it contains many good food! Unfortunately, I need to go back home tomorrow. 0.5666, 648mg, I had such. high. never, lost hopes for_arc this! (good) <html> title </html> long number 0.45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https:colab.research.google.com'

In [65]:
# you may give custom parameters to pipeline
clean_text.run_pipeline(output="token")

['i', 'like', 'this', 'city', 'istanbul', 'very', 'much', '!', 'it', 'is', 'because', 'it', 'contains', 'many', 'good', 'food', '!', 'unfortunately', ',', 'i', 'need', 'to', 'go', 'back', 'home', 'tomorrow', '.', '0.5666', ',', '648mg', ',', 'i', 'had', 'such', '.', 'high', '.', 'never', ',', 'lost', 'hopes', 'for_arc', 'this', '!', '(', 'good', ')', '<', 'html', '>', 'title', '<', '/html', '>', 'long', 'number', '0.45678908765431444', 'covid-19', 'ro7496998', '(', 'at-527', ')', 'non-profit', 'https', ':', 'colab.research.google.com']

In [71]:
# you may give custom parameters to pipeline
clean_text.run_pipeline(light=False, lower=False, output="token")

['I', 'like', 'this', 'city', 'istanbul', 'very', 'much', 'It', 'is', 'because', 'it', 'contains', 'many', 'good', 'food', 'dir', 'ty', 'wor', 'd', 'Unfortunately', 'I', 'need', 'to', 'go', 'back', 'home', 'tomorrow', '0', '5666', '648mg', 'I', 'had', 'such', 'high', 'never', 'lost', 'hopes', 'for_arc', 'this', '(', 'good', ')', 'html', 'title', 'html', 'long', 'number', '0', '45678908765431444', 'COVID-19', 'RO7496998', '(', 'AT-527', ')', 'Non-Profit', 'https', 'colab', 'research', 'google', 'com']

In [72]:
clean_text.run_pipeline(light=False)

'I like this city istanbul very much It is because it contains many good food dir ty wor d Unfortunately I need to go back home tomorrow 0 5666 648mg I had such high never lost hopes for_arc this (good) html title html long number 0 45678908765431444 COVID-19 RO7496998 (AT-527) Non-Profit https colab research google com'

## work on file

In [30]:
file = CleanText(file_path="/content/clinical-trials-sample.txt")

In [31]:
pprint(file.text)

('Study List:                     I like this city very much!!! :\\\\ $$%%&& '
 'It is because it contains many good food!!!! $ % Unfortunately,,,, I need to '
 'go back home tomorrow. .\n'
 '0.5666, 648mg, I had***** such.... high. never, lost hopes for_arc this! '
 '(good) <html> title </html> long number 0.45678908765431444 \\\n'
 'COVID-19 RO7496998 (AT-527) Non-Profit https://colab.research.google.com\n'
 '\n'
 'Study 1:\n'
 '  Title:                        Study to Evaluate the Effects of RO7496998 '
 '(AT-527) in Non-Hospitalized Adult and Adolescent Participants With Mild or '
 'Moderate COVID-19\n'
 '  Status:                       Terminated\n'
 '  Study Results:                No Results Available\n'
 '  Conditions:                   COVID-19\n'
 '  Interventions:                Drug: RO7496998|Drug: Placebo\n'
 '  Locations:                    Instituto Medico Rio Cuarto, Cordoba, '
 'Argentina|Instituto Ave Pulmo, Mar Del Plata, Argentina|Clínica '
 'Independencia, Munro, 

In [32]:
file.remove_punc(light=True)
pprint(file.keep_word("[|*:!]"))

('Study List  I like this city very much  It is because it contains many good '
 'food  Unfortunately, I need to go back home tomorrow. \n'
 '0.5666, 648mg, I had  such. high. never, lost hopes for_arc this  (good) '
 '<html> title </html> long number 0.45678908765431444  \n'
 'COVID-19 RO7496998 (AT-527) Non-Profit https colab.research.google.com\n'
 '\n'
 'Study 1 \n'
 '  Title  Study to Evaluate the Effects of RO7496998 (AT-527) in '
 'Non-Hospitalized Adult and Adolescent Participants With Mild or Moderate '
 'COVID-19\n'
 '  Status  Terminated\n'
 '  Study Results  No Results Available\n'
 '  Conditions  COVID-19\n'
 '  Interventions  Drug  RO7496998 Drug  Placebo\n'
 '  Locations  Instituto Medico Rio Cuarto, Cordoba, Argentina Instituto Ave '
 'Pulmo, Mar Del Plata, Argentina Clínica Independencia, Munro, Argentina '
 'Instituto Medico de la Fundacion Estudios Clinicos, Rosario, Argentina '
 'Clinica Mayo de U.M.C.B. S.R.L, San Miguel de Tucumán, Argentina Sanatorio '
 'Medico d

In [34]:
pprint(file.tokenize(clean=True))

['Study',
 'List',
 'I',
 'like',
 'this',
 'city',
 'very',
 'much',
 'It',
 'is',
 'because',
 'it',
 'contains',
 'many',
 'good',
 'food',
 'Unfortunately',
 ',',
 'I',
 'need',
 'to',
 'go',
 'back',
 'home',
 'tomorrow',
 '.',
 '0.5666',
 ',',
 '648mg',
 ',',
 'I',
 'had',
 'such',
 '.',
 'high',
 '.',
 'never',
 ',',
 'lost',
 'hopes',
 'for_arc',
 'this',
 '(',
 'good',
 ')',
 '<',
 'html',
 '>',
 'title',
 '<',
 '/html',
 '>',
 'long',
 'number',
 '0.45678908765431444',
 'COVID-19',
 'RO7496998',
 '(',
 'AT-527',
 ')',
 'Non-Profit',
 'https',
 'colab.research.google.com',
 'Study',
 '1',
 'Title',
 'Study',
 'to',
 'Evaluate',
 'the',
 'Effects',
 'of',
 'RO7496998',
 '(',
 'AT-527',
 ')',
 'in',
 'Non-Hospitalized',
 'Adult',
 'and',
 'Adolescent',
 'Participants',
 'With',
 'Mild',
 'or',
 'Moderate',
 'COVID-19',
 'Status',
 'Terminated',
 'Study',
 'Results',
 'No',
 'Results',
 'Available',
 'Conditions',
 'COVID-19',
 'Interventions',
 'Drug',
 'RO7496998',
 'Drug',
 'P

# Operations on big text


 **File Size : 1.12GB**  
**Cleaning Time:5min.**  

In [54]:
%%time
# Stream the input line by line so that only one line is cleaned at a time.
with open("/content/drive/MyDrive/Copy of clinicalTrialTokenizedByNltkPart1.txt", "r", encoding="utf8") as r_file:
    # Plain "w" is enough: the output file is only written, never read back.
    with open("/content/drive/MyDrive/Cleaned_clinicalTrial.txt", "w", encoding="utf8") as w_file:
        for line in r_file:
            cline = CleanText(text=line)
            cline.remove_punc(light=True)       # collapse repeated punctuation
            cline.keep_word("[|*:!]")           # split words glued together by these characters
            cline.remove_word("[¤¥]")           # drop words containing these characters
            # remove_word re-joins the words with split()/join(), which strips the
            # trailing newline; write it back so the lines do not run together.
            w_file.write(cline.ctext + "\n")
            

CPU times: user 4min 21s, sys: 5.18 s, total: 4min 26s
Wall time: 4min 48s


# work-outs

resources:  
https://stackoverflow.com/questions/63256077/how-to-remove-redundant-punctuations-keep-only-the-first-one-in-text  
https://stackoverflow.com/questions/43142710/remove-all-punctuation-from-string-except-if-its-between-digits  
https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string  
https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string  
https://betterprogramming.pub/5-different-ways-to-remove-specific-characters-from-a-string-in-python-b0e081839ab9#9fb3  




```
(?<=           : begin a positive lookbehind
  [^!.,>$%&-]  : match any character other than those shown in the
                 character class 
  [!.,>$%&-]   : match any character in the character class
)              : end positive lookbehind
[!.,>$%& -]+   : match one or more characters in the character class
(?<! )         : negative lookbehind: the match must not end with a space
```



In [None]:
import re

def my_replace(match):
    """Collapse a matched run to its first character.

    A single space is appended when the run contained at least one space, so
    that the words on either side of the run stay separated.
    """
    run = match.group()
    head = run[0]
    if " " in run:
        return head + " "
    return head

# Matches a run of two or more characters taken from this punctuation/space set.
regex = r"[!\"#$%&\'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~ ]{2,}"

test_str = "I like this city very much!!! :\\\\ $%& It is because it contains many good food..! $ % Unfortunately,,, I need to go back home tomorrow. ."

# Every run is rewritten by my_replace; count=0 means "replace all occurrences".
result = re.sub(regex, my_replace, test_str, 0)

# Print only when the substitution left a non-empty string.
if result:
    print (result)

I like this city very much! It is because it contains many good food. Unfortunately, I need to go back home tomorrow. 


In [None]:
# Matches . , ; or : only when it is neither preceded nor followed by a digit,
# so the decimal point in "2.5" is left alone.
regex = r"(?<!\d)[.,;:](?!\d)"

test_str = "This is a 1example of the text. But, it only is 2.5 percent of all data"

# Delete every match; count=0 means "replace all occurrences".
result = re.sub(regex, "", test_str, 0)
result

'This is a 1example of the text But it only is 2.5 percent of all data'

In [161]:
import re
s = "WE  ARE GOOD!!*10"
# Lowercase and trim first, then delete everything except a-z, 0-9 and spaces
# (inner runs of spaces are kept as they are).
s = re.sub(r"[^a-z0-9 ]","",s.lower().strip())
s

'we  are good10'

In [None]:
import re

sample_str = "Hel&&lo %% Wo$#rl@d"

# using isalnum(): keeps letters and digits only, so the spaces are dropped too
print("".join(k for k in sample_str if k.isalnum()))


# using regex: keeps ASCII letters only
op2 = re.sub("[^A-Za-z]", "", sample_str)
# Plain string literal: an f-string without placeholders adds nothing.
print("op2 = ", op2)


special_char_list = ["$", "@", "#", "&", "%"]

# using a generator expression: removes only the listed characters (spaces stay)
op1 = "".join(k for k in sample_str if k not in special_char_list)
print("op1 = ", op1)


# using filter() with a lambda function: same result as op1
op3 = "".join(filter(lambda x: x not in special_char_list, sample_str))
print("op3 = ", op3)

HelloWorld
op2 =  HelloWorld
op1 =  Hello  World
op3 =  Hello  World


In [None]:
import re
my_text = "!where??and!!or$$then:)"
# Surround every run of punctuation with spaces (\g<0> is the whole match),
# then trim the spaces added at both ends of the string.
print(re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', r' \g<0> ', my_text).strip())
# => ! where ?? and !! or $$ then :)

! where ?? and !! or $$ then :)


In [None]:
from string import punctuation

a="""'?hello !mango! and, ban,ana yum apple!',  '?!,' """
# str.strip only removes leading and trailing punctuation of each token, so the
# comma inside "ban,ana" survives and punctuation-only tokens become empty strings.
new=[i.strip(punctuation) for i in a.split()]
print(" ".join(new))

hello mango and ban,ana yum apple 


In [None]:
import string

def beautify_sentence(sentence, punctuation=string.punctuation):
    """Return *sentence* with every character listed in *punctuation* removed.

    Args:
        sentence: the text to clean.
        punctuation: string of characters to delete. Defaults to
            ``string.punctuation``.

    Returns:
        The cleaned sentence.
    """
    # Use the caller's characters; the argument used to be ignored in favour of
    # string.punctuation, so a narrower set could never be requested.
    return sentence.translate(str.maketrans('', '', punctuation))

print(beautify_sentence('?hello !mango! and, ban,ana yum apple!',  '?!,'))

hello mango and banana yum apple


In [146]:
import re
s = "See ,remove .these words from the original string or? !not"
# Each pattern also consumes the whitespace in front of the word, so the whole
# " word" chunk disappears.
s = re.sub(r'\s+\w+\b\S', '', s) # Matches words that end with punctuation.
s = re.sub(r'\s+\S\b\w+', '', s) # Matches words that start with punctuation.
print(s)


See words from the original string


In [165]:
# Characters that glue two wanted words together.
pattern_keep = "[|:]"
# Characters that mark a word as unwanted.
pattern_del = "[@*!~]"
text = "KEEP|THIS  SWE~~PP T@IS. CLEAN! D**EL TEXT:ONE"


In [166]:
def keep_or_remove(text:str, pattern_to_keep:str, pattern_to_del:str)->str:
    """Split words joined by "keep" characters and drop words with "delete" characters.

    Args:
        text: the text to clean.
        pattern_to_keep: regex; every match is replaced by a space so the words
            on both sides of it are kept as separate words.
        pattern_to_del: regex; a whitespace-separated word is dropped when the
            regex matches anywhere inside it.

    Returns:
        The remaining words joined by single spaces.
    """
    # Use the parameter, not the notebook-global `pattern_keep`.
    separated = re.sub(pattern_to_keep, " ", text)
    # Search with the regex itself: iterating over the pattern string would also
    # treat its "[" and "]" as characters to delete.
    unwanted = re.compile(pattern_to_del)
    return ' '.join(word for word in separated.split() if not unwanted.search(word))

keep_or_remove(text, "[|:]","[@*!~]")

'KEEP THIS TEXT ONE'