# Regex Exercises

## Imports

In [1]:
import pandas as pd
import re

### 1. Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(regexp, subject):
    """
    This function will take in two arguments:
    1. regexp = the regular expression assigned to this variable
    2. subject = whatever you are passing through the regular expression
    """
    if re.search(regexp, subject):
        print('found a vowel!')
    else:
        print('no vowels present')
    

In [3]:
regexp = r'[aeiou\.]' 
subject = 'codeup'

is_vowel(regexp, subject)

found a vowel!


In [4]:
regexp = r'[aeiou\.]' 
subject = 'bbbb'

is_vowel(regexp, subject)

no vowels present


In [5]:
regexp = r'[aeiou\.]' 
subject = 'cttjtjsydfsh'

is_vowel(regexp, subject)

no vowels present


### 2. Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.


>>> is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
False
>>> is_valid_username('codeup')
True
>>> is_valid_username('Codeup')
False
>>> is_valid_username('codeup123')
True
>>> is_valid_username('1codeup')
False

In [6]:
def is_valid_username(regexes, subject):
    """
    This function will take in two arguments and return a bool
    asserting wheter the string fits the username criteria:
    1. regexes = multiple regular expressions
    2. subject = string/characters to test on
    """
    regexp = r'^[a-z][a-z0-9_]{,31}'
    return bool(re.search(regex, string))

In [7]:
#regex that starts with a lowercase letter
r'^[a-z]'

'^[a-z]'

In [8]:
#regex that only consists of lowercase letters
r'[a-z]'

'[a-z]'

In [9]:
#regex that only consists of numbers
r'\d'

'\\d'

In [10]:
#letters, numbers or underscores
r'\w'

'\\w'

In [12]:
#regex no longer than 32 characters
r'{1,31}$'

'{1,31}$'

In [13]:
#test
regexp = r'^[a-z]\w{1,31}$'
subject = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
re.search(regexp, subject)

In [14]:
len(subject)

33

In [15]:
regexp = r'^[a-z][a-z0-9_]{,31}'
subject = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
re.search(regexp, subject)

<re.Match object; span=(0, 32), match='aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'>

### 3. Write a regular expression to capture phone numbers. It should match all of the following:
- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

In [None]:
regexp = r'\w' 
subject = '(210) 867 5309'

re.search(regexp, subject) 

In [None]:
^
(\+\d+)?
(\d{3})?
(\d{3})
(\d{4})
$

In [16]:

df = pd.DataFrame()
df['number'] = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
    '2108675309',
]

In [17]:
phone_regex = re.compile(
'''^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
$''', re.VERBOSE)

In [18]:
df['number'].str.extract(phone_regex)


Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [19]:
pd.concat([df, df['number'].str.extract(phone_regex)], axis=1)


Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


### 4. Use regular expressions to convert the dates below to the standardized year-month-day format.
- 02/04/19
- 02/05/19
- 02/06/19
- 02/07/19
- 02/08/19
- 02/09/19
- 02/10/19

In [20]:
dates = pd.Series(['02/04/19', '02/05/19', '02/06/19', '02/07/19', '02/08/19', '02/09/19', '02/10/19'])

In [21]:
dates.str.replace(r'(\d{2})/(\d{2})/(\d{2})', r'\3/\1/\2', regex=True)

0    19/02/04
1    19/02/05
2    19/02/06
3    19/02/07
4    19/02/08
5    19/02/09
6    19/02/10
dtype: object

In [22]:
date_reg = r'(\d+)/(\d+)/(\d+)'
[re.sub(date_reg, r'20\3-\2-\2', date) for date in dates]

['2019-04-04',
 '2019-05-05',
 '2019-06-06',
 '2019-07-07',
 '2019-08-08',
 '2019-09-09',
 '2019-10-10']

In [23]:
new_list = []
for date in dates:
    new_list.append(re.sub(date_reg, r'20\3-\2-\2', date))
new_list

['2019-04-04',
 '2019-05-05',
 '2019-06-06',
 '2019-07-07',
 '2019-08-08',
 '2019-09-09',
 '2019-10-10']

### 5. Write a regex to extract the various parts of these logfile lines:
- GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
- POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
- GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [24]:
lines = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

In [25]:
# parts:
# GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
# method GET
# path /api/v1/sales?page=86
# timestamp [16/Apr/2019:193452+0000]
# http version HTTP/1.1
# status code {200}
# bytes 510348
# user agent "python-requests/2.21.0"
# ip 97.105.19.58

In [26]:
regexp = r'''
^
(?P<method>GET|POST)
\s
(?P<path>[/\w\-\?=]+)
\s
\[(?P<timestamp>.+)\]
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}
\s
(?P<bytes_out>\d+)
\s
"(?P<user_agent>.+)"
\s
(?P<ip>\d+\.\d+\.\d+\.\d+)
$'''

In [27]:
[re.search(regexp, line, re.VERBOSE).groupdict() for line in lines.strip().split('\n')]


[{'method': 'GET',
  'path': '/api/v1/sales?page=86',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '200',
  'bytes_out': '510348',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'},
 {'method': 'POST',
  'path': '/users_accounts/file-upload',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '201',
  'bytes_out': '42',
  'user_agent': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  'ip': '97.105.19.58'},
 {'method': 'GET',
  'path': '/api/v1/items?page=3',
  'timestamp': '16/Apr/2019:193453+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '429',
  'bytes_out': '3561',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'}]

In [28]:
regex = re.compile(regexp, re.VERBOSE)

df = pd.DataFrame()
df['line'] = lines.strip().split('\n')
df = pd.concat([df, df.line.str.extract(regex)], axis=1)
df

Unnamed: 0,line,method,path,timestamp,http_version,status_code,bytes_out,user_agent,ip
0,GET /api/v1/sales?page=86 [16/Apr/2019:193452+...,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST /users_accounts/file-upload [16/Apr/2019:...,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET /api/v1/items?page=3 [16/Apr/2019:193453+0...,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58
