## Characters and their meaning in RegEx. 


> corpus means collection of textual documents.

> The ` r ` prefix marks a raw string literal, so backslashes in the pattern reach the regex engine unchanged.

> ` $ ` (dollar) anchors the match at the end of the string, e.g. `ed$` matches words ending in "ed".

> ` ^ ` (caret) anchors the match at the start of the string.

> ` . ` (wildcard) matches exactly one character, which can be anything (except a newline, by default).
 
> ` ? ` makes the character (or group) just before it optional: the string matches with or without it.

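> ` | ` (pipe) means "or" between alternatives, and ` [ ] ` (a character class) means "any one of these characters". For example, ` (g|h|i) ` matches the same characters as ` [ghi] `.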
 
> ` + ` matches one or more occurrences of the preceding character (or group).

> ` * ` matches zero or more occurrences of the preceding character (or group).

> ` \ ` escapes a special character so that it is matched as a literal character of the string, e.g. `\.` and `\$` in the floating-point and dollar-sign examples below.

> ` { } ` specifies how many times the expression just before it must repeat, e.g. `{4}` for exactly four times or `{1,2}` for one to two times.

> ` ?: ` at the start of a group, as in `(?:...)`, makes the group non-capturing, so `re.findall` returns the whole match instead of only the text matched by the group.

> ` \d ` any digit lying between 0-9

> ` \D` any non digit character 

> `\s` any whitespace character (space, tab, newline, ...)

In [1]:
import re 
import nltk

In [2]:
# Keep only the entries of NLTK's English word list for which
# str.islower() is true.
wordlist = [word for word in nltk.corpus.words.words('en') if word.islower()]
        

In [3]:
# Number of entries that were kept in wordlist.
len(wordlist)

210687

In [4]:
# Words ending in "ed": the trailing $ anchors the match at the end.
ed_wordlist = [word for word in wordlist if re.search(r'ed$', word)]

In [5]:
# Preview the first ten matches.
ed_wordlist[:10]

['abaissed',
 'abandoned',
 'abased',
 'abashed',
 'abatised',
 'abed',
 'aborted',
 'abridged',
 'abscessed',
 'absconded']

In [6]:
# Eight-character words with 'q' in third place and 't' in sixth place;
# every '.' stands for exactly one arbitrary character.
eight_char_wordlist = [
    word for word in wordlist if re.search(r'^..q..t..$', word)
]

eight_char_wordlist

['coquetry',
 'coquette',
 'haqueton',
 'maquette',
 'moquette',
 'requital',
 'requiter',
 'roquette',
 'sequitur',
 'unquoted']

In [7]:
# Candidate spellings to test against the pattern.
tokens_list = ['email', 'e-mail', 'e_mail', 'e/mail']

# (_|-)? makes the separator optional: zero or one of '_' or '-'.
valid_email_list = [
    token for token in tokens_list if re.search(r'e(_|-)?mail$', token)
]

valid_email_list

['email', 'e-mail', 'e_mail']

In [8]:
# Four-letter words with one character class per position.
# The same pattern written with alternation instead of classes:
#   r'^(p|f|r|s)(u|y|e|k)(q|w|i|t)(c|v|m|g)$'
four_letter_wordlist = [
    word
    for word in wordlist
    if re.search(r'^[pfrs][uyek][qwit][cvmg]$', word)
]
    

In [9]:
# Show every word that matched the four character classes.
four_letter_wordlist

['pyic', 'reim', 'skim', 'skiv']

In [10]:
# Sorted, de-duplicated tokens of the NLTK nps_chat corpus.
chatwords = sorted(set(nltk.corpus.nps_chat.words()))

In [11]:
# Tokens made up solely of the letters 'h' and 'a' (+ means one or more).
comedy_words = [token for token in chatwords if re.search(r'^[ha]+$', token)]

comedy_words[7:20]

['ahhahahaha',
 'ahhh',
 'ahhhh',
 'ahhhhhh',
 'ahhhhhhhhhhhhhh',
 'h',
 'ha',
 'haaa',
 'hah',
 'haha',
 'hahaaa',
 'hahah',
 'hahaha']

In [12]:
# "mine" with each letter allowed to repeat: every + means one or more
# of the letter just before it.
silly_words_list = [
    token for token in chatwords if re.search(r'^m+i+n+e+$', token)
]

silly_words_list

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [13]:
# De-duplicate the tokens of the NLTK treebank corpus, then sort them in place.
treebankwords = list(set(nltk.corpus.treebank.words()))
treebankwords.sort()

In [14]:
# One or more digits, a literal dot (escaped as \.), then one or more digits.
float_point_numbers = [
    token for token in treebankwords if re.search(r'^[0-9]+\.[0-9]+$', token)
]

float_point_numbers[:10]

['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']

In [15]:
# Upper-case letters followed by a literal '$'. The backslash in \$ turns
# the dollar into an ordinary character; the final, unescaped $ is the
# end-of-string anchor.
dollar_sign_list = [
    token for token in treebankwords if re.search(r'^[A-Z]+\$$', token)
]

dollar_sign_list

['C$', 'US$']

In [16]:
# {n} requires exactly n repetitions of the expression just before it,
# so [0-9]{4} selects tokens made of exactly four digits.

four_digit_list = [
    token for token in treebankwords if re.search(r'^[0-9]{4}$', token)
]

four_digit_list[:10]

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934']

In [17]:
# Four-digit tokens whose first digit is 1 or 2.
year_list = [
    token
    for token in treebankwords
    if re.search(r'^[1-2][0-9][0-9]{2}$', token)
]

year_list[10:20]

['1948',
 '1953',
 '1955',
 '1956',
 '1961',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969']

In [18]:
# Each token is appended once for every pattern it matches, in this order.
digit_patterns = (
    # 1 or 2, one digit, then zero to two more digits: 2- to 4-digit numbers
    r'^[1-2][0-9][0-9]{,2}$',
    # four digits followed by one or two non-digit characters, e.g. '1940s'
    r'^[1-2][0-9][0-9]\d\D{1,2}$',
)

digit_list = [
    token
    for token in treebankwords
    for pattern in digit_patterns
    if re.search(pattern, token)
]

digit_list[60:70]

['1933',
 '1934',
 '1940s',
 '1948',
 '195',
 '1950s',
 '1953',
 '1955',
 '1956',
 '1960s']

In [19]:
# .* matches any character (.) repeated zero or more times (*).
sample = 'deadly'

# With a capturing group ( ... ), findall() returns only the text that the
# group matched.
before = re.findall(r'^.*(ing|ed|ly|ious|ies|ive|es|s|ment)$', sample)
print(before)

# (?: ... ) is a non-capturing group: with no group to report, findall()
# returns the whole match instead.
re.findall(r'^.*(?:ing|ed|ly|ious|ies|ive|es|s|ment)$', sample)

['ly']


['deadly']