# Regular Expressions

# Exercises
Using the repo setup directions, set up a new local and remote repository named natural-language-processing-exercises. The local version of your repo should live inside of ~/codeup-data-science. This repo should be named natural-language-processing-exercises.

Save this work in your natural-language-processing-exercises repo. Then add, commit, and push your changes.

Unless a specific file extension is specified, you may do your work either in a python script (.py) or a jupyter notebook (.ipynb).

Do your work for this exercise in a file named regex_exercises.

# Imports

In [111]:
# standard imports
import pandas as pd
import numpy as np

from datetime import datetime

# basic regexes
import re

In [2]:
sentence = 'abc 123745 1bc'

In [3]:
def show_all_matches(regexes, subject, re_length=6):
    """
    Print a small table of what each pattern finds in ``subject``.

    Parameters:
        regexes: iterable of regular-expression pattern strings
        subject: the text every pattern is run against
        re_length: width of the pattern column; the header row is padded
            with ``re_length - 6`` spaces (6 == len('regexp'))

    Returns:
        None. Everything is written to stdout. At most 8 matches are
        shown per pattern, followed by '...' when there were more.
    """
    # Echo the text being searched, framed by blank lines
    print('Sentence:', '', f'    {subject}', '', sep='\n')

    # Header and underline share the same padding so the '|' lines up
    header_pad = ' ' * (re_length - 6)
    print(f' regexp{header_pad} | matches')
    print(f' ------{header_pad} | -------')

    # Left-align each pattern in a column that is re_length wide
    row_template = ' {:<%d} | {!r}' % re_length
    for pattern in regexes:
        found = re.findall(pattern, subject)
        shown = found[:8] + ['...'] if len(found) > 8 else found
        print(row_template.format(pattern, shown))

In [4]:
show_all_matches([
    r'^[aeiou\s\.]', # a lowercase vowel, whitespace, or a period at the very start of the string (the ^ sits outside the brackets, so it anchors rather than negates)
], sentence, re_length=12)


Sentence:

    abc 123745 1bc

 regexp       | matches
 ------       | -------
 ^[aeiou\s\.] | ['a']


## 1. Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicitly mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [5]:
strings = 'a c'

In [6]:
def start_vowel(subject):
    """
    Return the first character of ``subject``, wrapped in a list, when
    that character is a lowercase vowel, a whitespace character, or a
    period. Any other first character (or an empty string) gives [].
    """
    leading_char = re.compile(r'^[aeiou\s\.]')
    return leading_char.findall(subject)

In [7]:
start_vowel(strings)

['a']

In [8]:
def find_all_vowel(subject):
    """
    Collect every lowercase vowel (a, e, i, o, u) found anywhere in
    ``subject``, in order of appearance, and return them as a list.
    """
    vowel_hits = re.finditer(r'[aeiou]', subject)
    return [hit.group(0) for hit in vowel_hits]

In [9]:
find_all_vowel(strings)

['a']

In [10]:
strings = 'a c'

In [11]:
def is_vowel(subject):
    """
    Report whether ``subject`` is exactly one vowel.

    Parameters:
        subject: the string to test

    Returns:
        True when the whole string is a single vowel (a, e, i, o, u in
        either case), otherwise False
    """
    # fullmatch forces the pattern to consume the entire string, so a lone
    # character class accepts 'a' or 'E' and rejects '', 'ae', 'a c' and 'a\n'.
    regexp = r'[aeiouAEIOU]'

    return bool(re.fullmatch(regexp, subject))

In [12]:
is_vowel(strings)

False

## 2. Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.


> is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
False   
> is_valid_username('codeup')
True    
> is_valid_username('Codeup')
False    
> is_valid_username('codeup123')
True   
> is_valid_username('1codeup')
False

In [13]:
subject = 'codeup&'

In [14]:
def is_valid_username(subject):
    """
    Check a candidate username against the following rules:

    - Starts with a lowercase letter
    - Only consists of lowercase letters, numbers or "_"
    - No longer than 32 characters

    Parameters:
        subject: the string to validate

    Returns:
        True or False
    """
    # One leading letter plus up to 31 further characters caps the length at 32.
    # fullmatch is used instead of match with '$' because '$' also matches just
    # before a trailing newline, which would let 'codeup\n' through.
    regexp = r'[a-z][a-z0-9_]{0,31}'

    return bool(re.fullmatch(regexp, subject))

In [15]:
is_valid_username(subject)

False

## 3. Write a regular expression to capture phone numbers. It should match all of the following:


(210) 867 5309   
+1 210.867.5309    
867-5309   
210-867-5309   

In [81]:
phone1 = '(210) 867 5309'

In [91]:
# Phone-number pattern, piece by piece:
#   (\+\d\s)?   optional country code: '+', one digit, one whitespace character
#   (\()?       optional opening parenthesis (this is group 2)
#   (\d{3})?    optional three-digit area code
#   (?(2)\))    a closing parenthesis is required only when group 2 matched
#   [-.\s]?     optional separator: hyphen, period or whitespace
#   (\d{3})     three digits
#   [-.\s]?     optional separator
#   (\d{4})     final four digits
regexp = r"(\+\d\s)?(\()?(\d{3})?(?(2)\))[-.\s]?(\d{3})[-.\s]?(\d{4})"

# re.match anchors only at the start of the string; text after the number is not checked
re.match(regexp, phone1)

<re.Match object; span=(0, 14), match='(210) 867 5309'>

In [92]:
phone2 = '+1 210.876.5309'

In [93]:
# Phone pattern: optional '+N ' country code, optional area code (bare or in
# parentheses), then 3 digits and 4 digits, each optionally preceded by '-', '.' or whitespace
regexp = r"(\+\d\s)?(\()?(\d{3})?(?(2)\))[-.\s]?(\d{3})[-.\s]?(\d{4})"

# Try the pattern on the country-code sample
# NOTE(review): phone2 reads '876' where the exercise text lists '867' — confirm which is intended
re.match(regexp, phone2)

<re.Match object; span=(0, 15), match='+1 210.876.5309'>

In [94]:
phone3 = '867-5309'

In [95]:
# Phone pattern: optional '+N ' country code, optional area code (bare or in
# parentheses), then 3 digits and 4 digits, each optionally preceded by '-', '.' or whitespace
regexp = r"(\+\d\s)?(\()?(\d{3})?(?(2)\))[-.\s]?(\d{3})[-.\s]?(\d{4})"

# Try the pattern on the seven-digit sample; the area-code group is optional, so it can match nothing here
re.match(regexp, phone3)

<re.Match object; span=(0, 8), match='867-5309'>

In [96]:
phone4 = '210-867-5309'

In [97]:
# Phone pattern: optional '+N ' country code, optional area code (bare or in
# parentheses), then 3 digits and 4 digits, each optionally preceded by '-', '.' or whitespace
regexp = r"(\+\d\s)?(\()?(\d{3})?(?(2)\))[-.\s]?(\d{3})[-.\s]?(\d{4})"

# Try the pattern on the hyphen-separated ten-digit sample
re.match(regexp, phone4)

<re.Match object; span=(0, 12), match='210-867-5309'>

In [99]:
phone_numbers = ['(210) 867 5309','+1 210.867.5309','867-5309','210-867-5309','Hi']

In [101]:
# Phone pattern: optional '+N ' country code, optional area code (bare or in
# parentheses; the (?(2)\)) conditional demands ')' only when '(' was seen),
# then 3 digits and 4 digits with optional '-', '.' or whitespace separators.
regexp = re.compile(r"(\+\d\s)?(\()?(\d{3})?(?(2)\))[-.\s]?(\d{3})[-.\s]?(\d{4})")

for number in phone_numbers:
    # fullmatch (rather than match) also rejects extra characters after the
    # number, e.g. '210-867-53099999', instead of accepting the valid prefix.
    match = regexp.fullmatch(number)
    if match:
        print(f"'{number}' is a valid phone number.")
    else:
        print(f"'{number}' is not a valid phone number.")

'(210) 867 5309' is a valid phone number.
'+1 210.867.5309' is a valid phone number.
'867-5309' is a valid phone number.
'210-867-5309' is a valid phone number.
'Hi' is not a valid phone number.


## 4. Use regular expressions to convert the dates below to the standardized year-month-day format.


02/04/19    
02/05/19    
02/06/19   
02/07/19   
02/08/19   
02/09/19   
02/10/19    

In [104]:
# the date that needs to be converted
date = '02/04/19'

In [110]:
# Capture month, day and two-digit year from an MM/DD/YY string
regexp = r"(\d{2})/(\d{2})/(\d{2})"
match = re.match(regexp, date)

# Pull the three captured pieces out by group number
month, day, year = match.group(1, 2, 3)

# Two-digit years are taken to belong to the 2000s
year = f'20{year}'

# Emit the date as YYYY-MM-DD
print('-'.join((year, month, day)))

2019-02-04


In [112]:
dates = ['02/04/19', '02/05/19', '02/06/19', '02/07/19', '02/08/19', '02/09/19', '02/10/19']

In [113]:
def convert_date_format(date):
    """
    Convert a date string from MM/DD/YY to YYYY-MM-DD.

    Parameters:
        date: a string such as '02/04/19'

    Returns:
        The reformatted string (e.g. '2019-02-04'), or None when ``date``
        is not exactly two digits / two digits / two digits.
    """
    # fullmatch requires the whole string to fit the pattern. A prefix match
    # would silently read '02/04/1999' as 02/04/19 and return a 2019 date.
    match = re.fullmatch(r'(\d{2})/(\d{2})/(\d{2})', date)
    if match is None:
        return None

    month, day, year = match.groups()
    # Assume the year is in the 2000s
    return f'20{year}-{month}-{day}'

In [121]:
# Convert all dates
converted_dates = [convert_date_format(date) for date in dates]
converted_dates

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

In [124]:
# Create a DataFrame from the converted dates
df = pd.DataFrame(converted_dates, columns=['Date'])
df

Unnamed: 0,Date
0,2019-02-04
1,2019-02-05
2,2019-02-06
3,2019-02-07
4,2019-02-08
5,2019-02-09
6,2019-02-10


In [125]:
# Check the data type of the 'Date' column
df['Date'].dtype

dtype('O')

## 5. Write a regex to extract the various parts of these logfile lines:

GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58    

POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58  

GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58     

In [142]:
log_lines = [
    'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
    'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
    'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58'
]

This regular expression works as follows:

(\w+) : This matches the HTTP method, which is one or more word characters.    
\s : This matches a space.   
(\S+) : This matches the path, which is one or more non-space characters.    
\s\[(.*?)\]\s : This matches the date and time, which is enclosed in square brackets.   
(\S+) : This matches the HTTP version, which is one or more non-space characters.    
\s\{(\d+)\}\s : This matches the status code, which is enclosed in curly braces.    
(\d+) : This matches the size.    
\s"(.*?)"\s : This matches the user agent, which is enclosed in double quotes.   
(\S+) : This matches the IP address, which is one or more non-space characters.   

In [155]:
# One capture group per field: method, path, [timestamp], HTTP version,
# {status code}, size, "user agent" and IP address
regexp = re.compile(r'(\w+)\s(\S+)\s\[(.*?)\]\s(\S+)\s\{(\d+)\}\s(\d+)\s"(.*?)"\s(\S+)')

# Kept for parity with the notebook state; nothing is appended to it in this cell
log_parts = []

for line in log_lines:
    match = regexp.match(line)
    if not match:
        print('No match')
        continue

    # Unpack the eight captured fields in pattern order
    method, path, date_time, http_version, status_code, size, user_agent, ip = match.groups()
    # The trailing '\n' plus print's own newline leaves a blank line between records
    print(
        f'Method: {method}\n'
        f'Path: {path}\n'
        f'Date and Time: {date_time}\n'
        f'HTTP Version: {http_version}\n'
        f'Status Code: {status_code}\n'
        f'Size: {size}\n'
        f'User Agent: {user_agent}\n'
        f'IP: {ip}\n'
    )

Method: GET
Path: /api/v1/sales?page=86
Date and Time: 16/Apr/2019:193452+0000
HTTP Version: HTTP/1.1
Status Code: 200
Size: 510348
User Agent: python-requests/2.21.0
IP: 97.105.19.58

Method: POST
Path: /users_accounts/file-upload
Date and Time: 16/Apr/2019:193452+0000
HTTP Version: HTTP/1.1
Status Code: 201
Size: 42
User Agent: User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36
IP: 97.105.19.58

Method: GET
Path: /api/v1/items?page=3
Date and Time: 16/Apr/2019:193453+0000
HTTP Version: HTTP/1.1
Status Code: 429
Size: 3561
User Agent: python-requests/2.21.0
IP: 97.105.19.58



In [149]:
def extract_log_parts(log_lines):
    """
    Split each log line into its captured fields.

    Parameters:
    log_lines (list): A list of log line strings.

    Returns:
    list: One tuple per line that matched the pattern, holding
    (method, path, date_time, http_version, status_code, size, user_agent, ip)
    as strings. Lines that do not match are left out.
    """
    # One capture group per field, in the order listed in the docstring
    line_pattern = re.compile(r'(\w+)\s(\S+)\s\[(.*?)\]\s(\S+)\s\{(\d+)\}\s(\d+)\s"(.*?)"\s(\S+)')

    # Try every line lazily, then keep the groups of the ones that matched
    attempts = (line_pattern.match(entry) for entry in log_lines)
    return [hit.groups() for hit in attempts if hit]

In [158]:
df_logs = pd.DataFrame(extract_log_parts(log_lines), columns=['method', 'path','date_time','http_version','status_code','size','user_agent','ip'])

In [159]:
df_logs

Unnamed: 0,method,path,date_time,http_version,status_code,size,user_agent,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58
