# Exercises for `01_regex` lesson

In [1]:
# library imports
import pandas as pd
import numpy as np

import re

# 1. `is_vowel` function
Write a function named `is_vowel`. It should accept a `string` as input and use a `regular expression` to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of `re.search` as a boolean value that indicates whether or not the regular expression matches the given string.

>Generating and testing the code for the function...

In [2]:

# first, testing on a string that contains vowels, but is not a vowel itself
print(re.search(r'aeiou', 'Test: This is a test.'))

None


>`None`: Passed! Now testing on string that is a vowel...

In [3]:
print(re.search(r'aeiou', 'a'))

None


>`None`: Failed. I will try the `|` sign in the regex between each vowel character...

In [4]:
print(re.search(r'a|e|i|o|u', 'a'))

<re.Match object; span=(0, 1), match='a'>


>`Match`: Passed! Now I will include another character that is not a vowel...

In [5]:
print(re.search(r'a|e|i|o|u', 'ab'))

<re.Match object; span=(0, 1), match='a'>


>`Match`: Failed. I will try to add the metacharacter `{n}` with *n* == `1`

In [6]:
print(re.search(r'aeiou', 'ab'))

None


>`None`: Passed! Now I will try it on a non-vowel...

In [7]:
print(re.match(r'[aeiou]', 'ab'))

<re.Match object; span=(0, 1), match='a'>


>`Match`: Failed. I will try using the `{n}` with *n* == `1` so it only matches one instance of the vowel and `$` so that it ends.

In [8]:
print(re.search(r'[aeiou]{1}$', 'ab'))

None


>`None`: Passed! I will try this with 

In [9]:
print(re.search(r'[aeiou]{1}$', 'a'))

<re.Match object; span=(0, 1), match='a'>


>Defining `is_vowel` function...

In [10]:
def is_vowel(string):
    '''
    this function takes in a string and defines a regex statement that, when combined with the 
    re search function, will test the string to see whether it is a single vowel character.
        - if the string is a not a single vowel, it will print: Now a vowel.
        - if the strng is a single vowel, it will print: Vowel!
    '''
    regex = r'[aeiou]{1}$'
    
    # the lines of code below will print the corresponding statements... 
    if re.search(regex, string, re.IGNORECASE) == None:
        print('Not a vowel.')
    
    else:
        print('Vowel!')




>Testing...

In [11]:
# single vowel test
is_vowel('a')

Vowel!


In [12]:
# not single vowel test
is_vowel('ab')

Not a vowel.


In [13]:
is_vowel('A')

Vowel!


>It works!

### Instructor solution

# 2. `is_valid_username` function
Write a function named `is_valid_username` that accepts a `string` as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the `_` character. It should also be no longer than 32 characters. 


Conditions:
1. starts with lowercase letters
2. only consists of:
    - lowercase letters,
    - numbers, or
    - `_` char
3. no longer than 32 chars
<br>
<br>

The function should return either `True` or `False` depending on whether the passed string is a valid username.


>Generating and testing code for the function...<br>
>1. Starting with starts with lowercase letter

In [14]:
# starts with lowercase letter 
re.search(r'^[a-z]', 'user_name')

# Pass >> Match = 'user_name'

<re.Match object; span=(0, 1), match='u'>

In [15]:
# starts with uppercase letter
re.search(r'^[a-z]', 'User_name')

# None >> Pass

>2. only consists of:
>    - lowercase letters,
>    - numbers, or
>    - `_` char

In [16]:
# all lowercase and _ test
re.search(r'^[a-z0-9_]+$', 'user_name')

<re.Match object; span=(0, 9), match='user_name'>

In [17]:
# not all lowercase test
re.search(r'^[a-z0-9_]+$', 'user_Name')

In [18]:
# numbers test
re.search(r'^[a-z0-9_]+$', 'user_name1')

<re.Match object; span=(0, 10), match='user_name1'>

In [19]:
# no _ test
re.search(r'^[a-z0-9_]+$', 'username')

<re.Match object; span=(0, 8), match='username'>

In [20]:
# empty space test
re.search(r'^[a-z0-9_]+$', 'user name')

>3. no longer than 32 chars

In [21]:
def check_len(string, max_len):
    '''
    this function takes in a string and max_len value and returns True if the string is less than
    or equal to the max len of the input string.
    '''
    
    return len(string) <= max_len

In [22]:
check_len('string'*8, 100)

True

In [23]:
check_len('string'*8, 32)

False

>Now that I have a way to test the max_lenth, I can put everything together in a function to test...

In [24]:
def is_valid_user_name(string, max_len):
    '''
    
    '''
    
    # defining regular expression
    regex = r'^[a-z0-9_]+$'
    
    if re.search(regex, string) == None:
        print('Invalid password. Password contains invalid character.')
    
    else:
        if len(string) > max_len:
            print(f'Invalid password. Password cannot be longer than {max_len} characters.')
        else:
            print('Valid password. Password meets criteria!')
    

In [25]:
# testing pw that meets all criteria
is_valid_user_name('user_name', 32)

Valid password. Password meets criteria!


In [26]:
# testing password that contains empty space
is_valid_user_name('user name', 32)

Invalid password. Password contains invalid character.


In [27]:
# testing password that begins with uppercase character
is_valid_user_name('User_name', 32)

Invalid password. Password contains invalid character.


In [28]:
# testing pw that contains uppercase character
is_valid_user_name('user_Name', 32)

Invalid password. Password contains invalid character.


In [29]:
# testing password that contains invalid special character
is_valid_user_name('user_name!', 32)

Invalid password. Password contains invalid character.


In [30]:
# testing password that is longer than max_len
is_valid_user_name('user_name123456789101112', 10)

Invalid password. Password cannot be longer than 10 characters.


>Function tests successful!

# 3. ph# regex
Write a regular expression to capture phone numbers. It should match all of the following:
>`'(210) 867 5309'`<br>
`'+1 210.867.5309'`<br>
`'867-5309'`<br>
`'210-867-530'`<br>

In [35]:
# only 7 numbers

# defining regex
reg = r'\d{7}'

# valid test
re.search(reg, '8675309')

<re.Match object; span=(0, 7), match='8675309'>

In [36]:
# invalidat test
re.search(reg, '8 675309')

In [37]:
# nnn - nnnn

# defining regex
regex = r'\d{3}-\d{4}'

# valid test
re.search(regex, '867-5309')

<re.Match object; span=(0, 8), match='867-5309'>

In [38]:
# invalid test
re.search(regex, '8-67, 5309')

In [41]:
# nnn-nnnn or nnn.nnnn or nnn nnnnn

# defining regex
regex = r'\d{3}[-. ]\d{4}'

# validat test
re.search(regex, '867.5309')

<re.Match object; span=(0, 8), match='867.5309'>

#### Final Regex Breakdown `r"(\+?\d+)?.?(\(?\d{3}\)?)?.?\d{3}.?\d{4}"`
- `(\+?\d+)?` optional group
    - `\+` an actual plus sign: +
    - `?\d+` one or more digits, optional
- `.?` optional . character
- `(\(?\d{3}\)?)?` optional group, with optional subgroup
    - `\(?` actual open parenthesis: (, optional
    - (?\d{3}\)? optional sub-group
        - `\d{3}` three digits
        - `\)` actual closed parenthesis: ), optional
- `.?` actual period: ., optional
- `\d{3}` three digits
- `.?` actual period: ., optional
- `\d{4}` four digits


In [49]:
# assigning regex function to variable
r = re.search 

In [45]:
# regex solution
regex = r'(\+?\d+)?.?(\(?\d{3}\)?)?.?\d{3}.?\d{4}'

<re.Match object; span=(0, 15), match='+1 210.867.5309'>

In [47]:
# test phone numbers

a_test = '(210) 867 5309'
b_test = '+1 210.867.5309'
c_test = '867-5309'
d_test = '210-867-530'

In [50]:
# a test
r(regex, a_test)

<re.Match object; span=(0, 14), match='(210) 867 5309'>

In [51]:
# b test
r(regex, b_test)

<re.Match object; span=(0, 15), match='+1 210.867.5309'>

In [52]:
# c test
r(regex, c_test)

<re.Match object; span=(0, 8), match='867-5309'>

In [53]:
# d test
r(regex, d_test)

# 4. dates regex
Use regular expressions to convert the dates below to the standardized year-month-day format.

>`'02/04/19'`<br>
`'02/05/19'`<br>
`'02/06/19'`<br>
`'02/07/19'`<br>
`'02/08/19'`<br>
`'02/09/19'`<br>
`'02/10/19'`<br>

In [55]:
# creating a list of the dates
dates = ['02/04/19',
            '02/05/19',
            '02/06/19',
            '02/07/19',
            '02/08/19',
            '02/09/19',
            '02/10/19']

In [57]:
# using a dictionary to turn the list into a df
df_dates = pd.DataFrame({'dates': dates})
df_dates

Unnamed: 0,dates
0,02/04/19
1,02/05/19
2,02/06/19
3,02/07/19
4,02/08/19
5,02/09/19
6,02/10/19


In [58]:
# creating a regular expression to format the dates
split = re.compile(r'''(?P<month>\d{2})\
                      (?P<day>\d{2})\
                      (?P<year>\d{2})''', re.VERBOSE)

In [61]:
# using the split pattern on the df and then adding the split columns to original df
df_dates = pd.concat([df_dates, df_dates.dates.str.extract(split)], axis = 1)
df_dates

Unnamed: 0,dates,month,day,year,month.1,day.1,year.1
0,02/04/19,,,,,,
1,02/05/19,,,,,,
2,02/06/19,,,,,,
3,02/07/19,,,,,,
4,02/08/19,,,,,,
5,02/09/19,,,,,,
6,02/10/19,,,,,,


In [60]:
pattern = re.compile(r"""
(?P<month>\d{2})/
(?P<day>\d{2})/
(?P<year>\d{2})
""", re.VERBOSE)

In [62]:
df_dates = pd.concat([df_dates, df_dates.dates.str.extract(split)], axis=1)
df_dates

Unnamed: 0,dates,month,day,year,month.1,day.1,year.1,month.2,day.2,year.2
0,02/04/19,,,,,,,,,
1,02/05/19,,,,,,,,,
2,02/06/19,,,,,,,,,
3,02/07/19,,,,,,,,,
4,02/08/19,,,,,,,,,
5,02/09/19,,,,,,,,,
6,02/10/19,,,,,,,,,


### Not sure why getting Nans?

# 5. logfil regex
Write a regex to extract the various parts of these logfile lines:

`'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58'`<br><hr>
`'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58'`<br><hr>
`'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58'`<br><hr>


In [63]:
# creating a list to hold each string
lines = [
    """GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58""",
    """POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58""",
    """GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58"""
]

lines

['GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
 'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
 'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']

In [69]:
regex = "\}\s(?P<bytes>\d+)\s\""

x = re.compile(regex, re.VERBOSE)

match = re.search(x, lines[0])

match

<re.Match object; span=(65, 75), match='} 510348 "'>

In [70]:
match.group("bytes")

'510348'

In [74]:
log_pattern = re.compile(r"""
(?P<method>GET|POST) 
\s
(?P<path>/[/\w\-\?=]+)
\s
\[(?P<timestamp>.+)\]
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}
\s
(?P<bytes>\d+)
\s
"(?P<user_agent>.+)"
\s
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
$
""", re.VERBOSE)

In [75]:
rows = [re.search(log_pattern, line).groupdict() for line in lines]
rows

[{'method': 'GET',
  'path': '/api/v1/sales?page=86',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '200',
  'bytes': '510348',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'},
 {'method': 'POST',
  'path': '/users_accounts/file-upload',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '201',
  'bytes': '42',
  'user_agent': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  'ip': '97.105.19.58'},
 {'method': 'GET',
  'path': '/api/v1/items?page=3',
  'timestamp': '16/Apr/2019:193453+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '429',
  'bytes': '3561',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'}]

In [76]:
df = pd.DataFrame(rows)
df

Unnamed: 0,method,path,timestamp,http_version,status_code,bytes,user_agent,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


In [77]:
def is_query_string(string):
    return "?" in string

In [78]:
df["has_query_string"] = df["path"].apply(is_query_string)
df

Unnamed: 0,method,path,timestamp,http_version,status_code,bytes,user_agent,ip,has_query_string
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58,True
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58,False
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58,True


# Bonus | `/usr/share/dict/words` mac words
You can find a list of words on your mac at `/usr/share/dict/words`. Use this file to answer the following questions:
1. How many words have at least 3 vowels?
2. How many words have at least 3 vowels in a row?
3. How many words have at least 4 consonants in a row?
4. How many words start and end with the same letter?
5. How many words start and end with a vowel?
6. How many words contain the same letter 3 times in a row?
7. What other interesting patterns in words can you find?

In [80]:
bonus = pd.read_csv('/usr/share/dict/words', header = None)
bonus.columns = ['word']

In [83]:
bonus.head()

Unnamed: 0,word
0,A
1,a
2,aa
3,aal
4,aalii


In [81]:
three_or_more_vowels = bonus.word.str.count(r"[aeiouAEIOU]") >= 3

In [84]:
# How many words have at least 3 vowels
bonus[three_or_more_vowels]

Unnamed: 0,word
4,aalii
6,Aani
7,aardvark
8,aardwolf
9,Aaron
...,...
235874,zymotically
235875,zymotize
235876,zymotoxic
235878,Zyrenian


In [86]:
# How many words have at least 3 vowels in a row?
bonus[bonus.word.str.count(r"[aeiouAEIOU]{3}") > 0]

Unnamed: 0,word
234,Abietineae
235,abietineous
301,ablatitious
434,abranchious
507,absenteeism
...,...
235800,Zygophyceae
235801,zygophyceous
235802,Zygophyllaceae
235803,zygophyllaceous
