In [12]:
import re
import nltk
import unidecode
from nltk import sent_tokenize
from nltk import word_tokenize

In [13]:
with open('english-corpora/C00001.txt') as f:
    lines = f.readlines()

## Text Cleaning
1. Text is splitted by '\t' first.
2. Remove extra spaces
3. Tokenize the strings
4. Remove Punctuations from tokenize words
5. Remove Number
6. Remove Double quotations from tokens
7. Replace URL with url tag
8. Remove ascents from string using decode like A&deg;
9. Split camelCase word into 'camel' and 'Case'
10. Remove number from token

In [14]:
def isExtraSpace(text):
    return len(text)==0

In [15]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens

In [16]:
def isASCII(token):
    if token in ['.','+','*','?','[','/', '//','\\','^','%',']', '$','(',')','{','}','=', '!', '|',':','-']:
        return True
    return False

In [17]:
def ignore_special_token(token):
    if token in ['^^^^', '==']:
        return True
    return False

In [18]:
def no_change(token):
    if token in ['Objective-J', 'I/O']:
        return True
    return False

In [19]:
def is_number(token):
    num_regex = re.compile('^[+-]?[0-9]+\.?[0-9]*$')
    return bool(num_regex.match(token))

In [20]:
def replace_url(token):
    replaced_text = re.sub('(http[s]?://)?((www)\.)?([a-zA-Z0-9]+)\.{1}((com)(\.(cn))?|(org))', '<url>', token)
    return replaced_text

In [55]:
def split_camelCase(token):
    return re.findall(r'[a-zA-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', token)

In [116]:
def remove_numbers_from_token(token):
    return re.sub(r'[0-9]+', '', token)

In [119]:
def remove_punctuations(tokens):
    new_tokens=[]
    for token in tokens:
        token=token.replace('"', '')
        token = replace_url(token)
        token = unidecode.unidecode(token)
        if isASCII(token) or is_number(token) or len(token)==0: continue
        if no_change(token):
            new_tokens.append(token)
            continue
        tt = re.split(', |:|\+|%|\|/|\*|$|&|@|_|-|!|;|,', token)
        for t in tt:
            t=t.replace('"', '')
            t=unidecode.unidecode(t)
            #Removing ASCII character and extra space after splitting
            if isASCII(t) or len(t)<=1 or is_number(t): continue
            #Removing '.' from the word
            split_by_dot = t.split('.')
            final_token = sorted(split_by_dot, key=len, reverse=True)[0]
            #If still some non-ascii char left in the token 
            if not final_token.isalnum():
                final_token = re.sub('[^A-Za-z0-9]+', '', final_token)
            #Remove number from token
            final_token = remove_numbers_from_token(final_token)
            #CamelCase condition based on first char if it is lower or not
            if final_token and final_token[0].islower():
                camel_tokens = split_camelCase(final_token)
                for tt in camel_tokens: new_tokens.append(tt)
            else: new_tokens.append(final_token)
    return new_tokens

In [121]:
import codecs
file_path='english-corpora/C00001.txt'
with codecs.open(file_path, mode='r', encoding='utf-8') as input_file:
    next(input_file)
    for line in input_file:
        text = line.strip().split('\t')[0]
        if(isExtraSpace(text)): continue
        text=text.replace('"', '')
        tokens = tokenize(text)
        pun_free_token = remove_punctuations(tokens)
        if pun_free_token:
    #         print("Line: ", line)
    #         print("NLTK: ", tokens)
            print("Final: ", pun_free_token)

Final:  ['Title']
Final:  ['JavaScript']
Final:  ['Text']
Final:  ['From', 'Wikipedia', 'the', 'free', 'encyclopedia']
Final:  ['This', 'is', 'the', 'latest', 'accepted', 'revision', 'reviewed', 'on', 'February']
Final:  ['Jump', 'to', 'navigation']
Final:  ['Jump', 'to', 'search']
Final:  ['mw', 'parser', 'output', 'hatnote', 'font', 'style', 'italic', 'mw', 'parser', 'output', 'hatnote', 'padding', 'left', 'em', 'margin', 'bottom', 'em', 'mw', 'parser', 'output', 'hatnote', 'font', 'style', 'normal', 'mw', 'parser', 'output', 'hatnote', 'link', 'hatnote', 'margin', 'top', 'em', 'Not', 'to', 'be', 'confused', 'with', 'Java', 'programming', 'language', 'or', 'Javanese', 'script']
Final:  ['js', 'redirects', 'here', 'For', 'the', 'Microsoft', 'dialect', 'used', 'in', 'Internet', 'Explorer', 'see', 'JScript']
Final:  ['For', 'the', 'uses', 'of', 'JavaScript', 'on', 'Wikipedia', 'see', 'Wikipedia', 'JavaScript']
Final:  ['High', 'level', 'programming', 'language']
Final:  ['mw', 'parser',

Final:  ['JavaScript', 'typically', 'relies', 'on', 'run', 'time', 'environment', 'e', 'web', 'browser', 'to', 'provide', 'objects', 'and', 'methods', 'by', 'which', 'scripts', 'can', 'interact', 'with', 'the', 'environment', 'e', 'web', 'page', 'DOM', 'These', 'environments', 'are', 'single', 'threaded', 'JavaScript', 'also', 'relies', 'on', 'the', 'run', 'time', 'environment', 'to', 'provide', 'the', 'ability', 'to', 'includeimport', 'scripts', 'e', 'HTML', 'script', 'elements', 'This', 'is', 'not', 'language', 'feature', 'per', 'se', 'but', 'it', 'is', 'common', 'in', 'most', 'JavaScript', 'implementations', 'JavaScript', 'processes', 'messages', 'from', 'queue', 'one', 'at', 'time', 'JavaScript', 'calls', 'function', 'associated', 'with', 'each', 'new', 'message', 'creating', 'call', 'stack', 'frame', 'with', 'the', 'function', 's', 'arguments', 'and', 'local', 'variables', 'The', 'call', 'stack', 'shrinks', 'and', 'grows', 'based', 'on', 'the', 'function', 's', 'needs', 'When', 't

Final:  ['WebAssembly', 'edit']
Final:  ['Since', 'web', 'browsers', 'have', 'supported', 'WebAssembly', 'binary', 'format', 'that', 'enables', 'JavaScript', 'engine', 'to', 'execute', 'performance', 'critical', 'portions', 'of', 'web', 'page', 'scripts', 'close', 'to', 'native', 'speed', 'WebAssembly', 'code', 'runs', 'in', 'the', 'same', 'sandbox', 'as', 'regular', 'JavaScript', 'code']
Final:  ['asm', 'is', 'subset', 'of', 'JavaScript', 'that', 'served', 'as', 'the', 'forerunner', 'of', 'WebAssembly']
Final:  ['Transpilers', 'edit']
Final:  ['JavaScript', 'is', 'the', 'dominant', 'client', 'side', 'language', 'of', 'the', 'Web', 'and', 'many', 'websites', 'are', 'script', 'heavy', 'Thus', 'transpilers', 'have', 'been', 'created', 'to', 'convert', 'code', 'written', 'in', 'other', 'languages', 'which', 'can', 'aid', 'the', 'development', 'process']
Final:  ['References', 'edit']
Final:  ['mw', 'parser', 'output', 'reflist', 'font', 'size', 'margin', 'bottom', 'em', 'list', 'style', '

Final:  ['frequently', 'asked', 'questions', 'asm', 'Archived', 'from', 'the', 'original', 'on', 'June', 'Retrieved', 'April']
Final:  ['Further', 'reading', 'edit']
Final:  ['See', 'also', 'ECMAScript', 'Specification', 'Documents']
Final:  ['Flanagan', 'David', 'JavaScript', 'The', 'Definitive', 'Guide', 'th', 'edition', 'Sebastopol', 'California', 'OReilly']
Final:  ['Haverbeke', 'Marijn', 'Eloquent', 'JavaScript', 'rd', 'edition', 'No', 'Starch', 'Press', 'pages', 'ISBNA', 'download']
Final:  ['Zakas', 'Nicholas', 'Principles', 'of', 'Object', 'Oriented', 'JavaScript', 'st', 'edition', 'No', 'Starch', 'Press', 'pages', 'ISBNA']
Final:  ['External', 'links', 'edit']
Final:  ['mw', 'parser', 'output', 'sister', 'box', 'border', 'px', 'solid', 'aaa', 'padding', 'background', 'fff', 'mw', 'parser', 'output', 'sister', 'box', 'div', 'padding', 'em', 'text', 'align', 'center', 'mw', 'parser', 'output', 'sister', 'box', 'div', 'display', 'block', 'mw', 'parser', 'output', 'sister', 'box',

Final:  ['Vivaldi']
Final:  ['Whale']
Final:  ['Yandex']
Final:  ['Gecko', 'based']
Final:  ['Firefox']
Final:  ['Conkeror']
Final:  ['GNU', 'IceCat']
Final:  ['IceDragon']
Final:  ['Meleon']
Final:  ['PirateBrowser']
Final:  ['SeaMonkey']
Final:  ['Tor']
Final:  ['Waterfox']
Final:  ['WebKit', 'based']
Final:  ['Dolphin']
Final:  ['Dooble']
Final:  ['GNOME', 'Web']
Final:  ['i', 'Cab']
Final:  ['Konqueror']
Final:  ['Midori']
Final:  ['Roccat']
Final:  ['Safari']
Final:  ['surf']
Final:  ['Other']
Final:  ['Avant']
Final:  ['Basilisk']
Final:  ['Cake', 'Browser']
Final:  ['CM', 'Browser']
Final:  ['eww']
Final:  ['Flow']
Final:  ['Internet', 'Explorer']
Final:  ['Links']
Final:  ['Lunascape']
Final:  ['Lynx']
Final:  ['NetFront']
Final:  ['NetSurf']
Final:  ['Pale', 'Moon']
Final:  ['QQ', 'browser']
Final:  ['qutebrowser']
Final:  ['SlimBrowser']
Final:  ['wm']
Final:  ['DiscontinuedGecko', 'based']
Final:  ['Beonex', 'Communicator']
Final:  ['Camino']
Final:  ['Classilla']
Final:  ['

In [23]:
lines

['\n',
 'Title:\n',
 'JavaScript\n',
 'Text:\n',
 '\n',
 '\t\tFrom Wikipedia, the free encyclopedia\n',
 '\t\t\n',
 'This is the latest accepted revision, reviewed on 2 February 2022.\n',
 '\n',
 '\n',
 '\n',
 '\t\t\n',
 '\t\t\n',
 '\t\t\n',
 '\t\tJump to navigation\n',
 '\t\tJump to search\n',
 '\t\t.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}Not to be confused with Java (programming language) or Javanese script.\n',
 '".js" redirects here. For the Microsoft dialect used in Internet Explorer, see JScript.\n',
 'For the uses of JavaScript on Wikipedia, see Wikipedia:JavaScript.\n',
 '\n',
 '\n',
 'High-level programming language\n',
 '.mw-parser-output .infobox-subbox{padding:0;border:none;margin:-3px;width:auto;min-width:100%;font-size:100%;clear:none;float:none;background-color:transparent}.mw-parser-output .in

In [24]:
tokens_word = nltk.word_tokenize("In JavaScript, an object is an associative array, augmented with a prototype (see below); each key provides the name for an object property, and there are two syntactical ways to specify such a name: dot notation (obj.xÂ\xa0=Â\xa010) and bracket notation (obj['x']Â\xa0=Â\xa010). A property may be added, rebound, or deleted at run-time. Most properties of an object (and any property that belongs to an object's prototype inheritance chain) can be enumerated using a for...in loop.\n")
tokens_word

['In',
 'JavaScript',
 ',',
 'an',
 'object',
 'is',
 'an',
 'associative',
 'array',
 ',',
 'augmented',
 'with',
 'a',
 'prototype',
 '(',
 'see',
 'below',
 ')',
 ';',
 'each',
 'key',
 'provides',
 'the',
 'name',
 'for',
 'an',
 'object',
 'property',
 ',',
 'and',
 'there',
 'are',
 'two',
 'syntactical',
 'ways',
 'to',
 'specify',
 'such',
 'a',
 'name',
 ':',
 'dot',
 'notation',
 '(',
 'obj.xÂ',
 '=Â',
 '10',
 ')',
 'and',
 'bracket',
 'notation',
 '(',
 'obj',
 '[',
 "'",
 'x',
 "'",
 ']',
 'Â',
 '=Â',
 '10',
 ')',
 '.',
 'A',
 'property',
 'may',
 'be',
 'added',
 ',',
 'rebound',
 ',',
 'or',
 'deleted',
 'at',
 'run-time',
 '.',
 'Most',
 'properties',
 'of',
 'an',
 'object',
 '(',
 'and',
 'any',
 'property',
 'that',
 'belongs',
 'to',
 'an',
 'object',
 "'s",
 'prototype',
 'inheritance',
 'chain',
 ')',
 'can',
 'be',
 'enumerated',
 'using',
 'a',
 'for',
 '...',
 'in',
 'loop',
 '.']

In [25]:
'\t\tJump to navigation\n'.split('\n')[0].split('\t')

['', '', 'Jump to navigation']

In [26]:
s=".mw-parser-output .infobox-subbox{padding:0;border:none;margin:-3px;width:auto;min-width:100%;font-size:100%;clear:none;float:none;background-color:transparent}.mw-parser-output .infobox-3cols-child{margin:auto}.mw-parser-output .infobox .navbar{font-size:100%}body.skin-minerva .mw-parser-output .infobox-header,body.skin-minerva .mw-parser-output .infobox-subheader,body.skin-minerva .mw-parser-output .infobox-above,body.skin-minerva .mw-parser-output .infobox-title,body.skin-minerva .mw-parser-output .infobox-image,body.skin-minerva .mw-parser-output .infobox-full-data,body.skin-minerva .mw-parser-output .infobox-below{text-align:center}JavaScriptParadigmMulti-paradigm: event-driven, functional, imperative, object-oriented programmingDesignedÂ byBrendan Eich of Netscape initially; others have also contributed to the ECMAScript standardFirstÂ appearedDecemberÂ 4, 1995; 26 years agoÂ (1995-12-04)[1]Stable releaseECMAScript 2021[2]Â"

In [27]:
import re
# s='\t\tJump to navigation!!..\n'
# re.sub('\s+', '', s)
re.sub("[^0-9A-Za-z ]", "" , '.mw-parser-output')

'mwparseroutput'

In [28]:
'   / June 2021; 8 months agoÂ\xa0(June 2021)Preview releaseECMAScript 2022[3]Â\xa0\n'.strip('\n').strip('\t')

'   / June 2021; 8 months agoÂ\xa0(June 2021)Preview releaseECMAScript 2022[3]Â\xa0'

In [29]:
tokens_word = nltk.word_tokenize(s)
tokens_word

['.mw-parser-output',
 '.infobox-subbox',
 '{',
 'padding:0',
 ';',
 'border',
 ':',
 'none',
 ';',
 'margin',
 ':',
 '-3px',
 ';',
 'width',
 ':',
 'auto',
 ';',
 'min-width:100',
 '%',
 ';',
 'font-size:100',
 '%',
 ';',
 'clear',
 ':',
 'none',
 ';',
 'float',
 ':',
 'none',
 ';',
 'background-color',
 ':',
 'transparent',
 '}',
 '.mw-parser-output',
 '.infobox-3cols-child',
 '{',
 'margin',
 ':',
 'auto',
 '}',
 '.mw-parser-output',
 '.infobox',
 '.navbar',
 '{',
 'font-size:100',
 '%',
 '}',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-header',
 ',',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-subheader',
 ',',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-above',
 ',',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-title',
 ',',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-image',
 ',',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-full-data',
 ',',
 'body.skin-minerva',
 '.mw-parser-output',
 '.infobox-below',
 '{',
 'text

In [30]:
data = '   / June 2021; 8 months agoÂ\xa0(June 2021)Preview releaseECMAScript 2022[3]Â\xa0\n'
re.split(', |:|_|-|!|;', '.June')

['.June']

In [31]:
re.split(', |:|\+|\|/|\*|$|&|@|_|-|!|;|,', '/Ë\x88dÊ\x92É\x91Ë\x90vÉ\x99skrÉªpt/')

['/Ë\x88dÊ\x92É\x91Ë\x90vÉ\x99skrÉªpt/', '']

In [32]:
nltk.word_tokenize('   / .June 2021;')

['/', '.June', '2021', ';']

In [33]:
x = 'June'.split('.')

In [34]:
sorted(x, key=len, reverse=True)

['June']

In [35]:
bool(re.compile('^[+-]?[0-9]+\.?[0-9]*$').match('20a'))

False

In [36]:
a="1992"
a.replace('"', '')

'1992'

In [37]:

# import required module
import unidecode
 
# assign string
string = "AAple"
 
# display original string
print('\nOriginal String:', string)
 
# remove ascents
outputString = unidecode.unidecode(string)
outputString


Original String: AAple


'AAple'

In [38]:
!pip install --user unidecode



In [39]:
patt = re.compile(r'(\w)\1*')
patt.sub(r'\1', 'Pommou')

'Pomou'

In [40]:
'===='.isalnum()

False

In [41]:
re.sub('[^A-Za-z0-9]+', '', ' \malskd123====')

'malskd123'

In [108]:
re.findall(r'[a-zA-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', 'camel')

['camel']

In [118]:
re.sub(r'[0-9]+', '', '2018JaavsCHCncj')

'JaavsCHCncj'

In [102]:
'camelCase'[0]

'c'