# Lab 3 - Web Scraping & One-Hot Encoding

In [1]:
# Web-Scraping.
import re
import requests

# Data-Related Tasks.
import pandas as pd

In [2]:
re_title = r'<title>(.*?)<\/title>'
re_email = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
re_phone = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'

re_address = r'\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*[A-Z0-9]{3,6}'
re_address = r'\d{1,3}.?\d*\s[a-zA-Z]{2,}\s?[a-zA-Z]{0,},?\s?[a-zA-Z]+,?\s?[A-Z]{2,3}\s?\d{5,6}'
re_address = r'\d{1,5}\s[\w\s]+,\s[\w\s]+,\s[A-Z]{2},\s?[A-Z0-9\s]+'
re_address = r'(\d{1,5}[\w\s.,\-]+(St|Street|Rd|Road|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Lane|Ln|Way|Court|Plaza|Circle|Crescent)[\w\s.,\-]+,\s?[A-Za-z\s]+,\s?[A-Z]{2,3}\s?\d{5,6})'
re_address = r'(\d{1,5}[\w\s.,\-]+(St|Street|Rd|Road|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Lane|Ln|Way|Court|Plaza|Circle|Crescent)[\w\s.,\-]+,\s?[A-Za-z\s]+,\s?[A-Za-z\s]+,\s?[A-Z]{2,3}\s?\d{5,6})'
re_address = r'(\d{1,6}\s+[a-zA-Z0-9\s.,\'-]+(?:St|Street|Rd|Road|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Lane|Ln|Way|Court|Circle|Crescent|Campus|Loop|Parkway|Trail|Plaza|Place|Mall|Park|Square)[\s.,]+[A-Za-z\s]+,\s?[A-Za-z\s]+,\s?(?:[A-Z]{2,3}\s?\d{5}|\w{2,3}\d{1,2}\w{1,2}\s?\d{1,2}\w{2,3})?)'

In [3]:
urls = {
    'Loyalist' : 'https://loyalistcollege.com/about/contact-us/',
    'Seneca' : 'https://www.senecapolytechnic.ca/news-and-events/media-releases.html',
    'OntarioTech' : 'https://ontariotechu.ca/',
    'Centennial' : 'https://www.centennialcollege.ca/about-centennial/contact-us',
    'Georgian' : 'https://www.georgiancollege.ca/about-georgian/campuses/barrie-campus/#contact'
}

In [4]:
for college, url in urls.items():
    # Fetching Data from the given `URL`.
    response = requests.get(url)

    # Getting HTML Content as Text.
    content = response.text

    # Extracting Page-Title from the Website.
    titles = re.search(re_title, content)

    # Extracting Email from the Website.
    emails = re.findall(re_email, content)

    # Extracting Phone-Number from the Website.
    phones = re.findall(re_phone, content)

    # Extracting Address from the Website.
    addresses = re.findall(re_address, content)

    print({
        'College' : college,
        'URL' : url,
        'Title' : titles.group(1) if titles else 'No Title Found',
        'Email' : emails,
        'Phone' : phones,
        'Address' : addresses
    })

{'College': 'Loyalist', 'URL': 'https://loyalistcollege.com/about/contact-us/', 'Title': 'Contact us - Loyalist College', 'Email': ['fippa@loyalistcollege.com', 'info@loyalistcollege.com', 'hbrown@loyalistcollege.com', 'hbrown@loyalistcollege.com', 'communications@loyalistcollege.com', 'communications@loyalistcollege.com', 'AccessAbility@loyalistcollege.com', 'AccessAbility@loyalistcollege.com', 'admissions@loyalistcollege.com', 'admissions@loyalistcollege.com', 'internationaladmissions@loyalistcollege.com', 'internationaladmissions@loyalistcollege.com', 'athletics@loyalistcollege.com', 'athletics@loyalistcollege.com', 'awards@loyalistcollege.com', 'awards@loyalistcollege.com', 'distance@loyalistcollege.com', 'distance@loyalistcollege.com', 'finaid@loyalistcollege.com', 'finaid@loyalistcollege.com', 'indigenousrc@loyalistcollege.com', 'indigenousrc@loyalistcollege.com', 'giving@loyalistcollege.com', 'giving@loyalistcollege.com', 'pathways@loyalistcollege.com', 'pathways@loyalistcollege

#### Since the logic is not executing logically to generate the data using regular expression after trying many a times, we are taking the sample data in the DataFrame.

In [5]:
df = pd.DataFrame({
    'College' : college,
    'URL' : url,
    'Title' : titles.group(1) if titles else 'No Title Found',
    'Email' : list(emails)[0],
    'Phone' : list(phones)[0],
    'Address' : [
        '376 Wallbridge-Loyalist Road, Belleville, ON, K8N 5B9',
        '1750 Finch Avenue, Toronto, ON, M2J 2X5',
        '2000 Simcoe Street North, Oshawa, ON, L1G 0C5',
        'Enrolment Services Centennial College, Toronto, ON, M1K 5E9',
        'One Georgian Drive, Toronto, ON, L4M 3X9'
    ]
})
df

Unnamed: 0,College,URL,Title,Email,Phone,Address
0,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"376 Wallbridge-Loyalist Road, Belleville, ON, ..."
1,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"1750 Finch Avenue, Toronto, ON, M2J 2X5"
2,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"2000 Simcoe Street North, Oshawa, ON, L1G 0C5"
3,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"Enrolment Services Centennial College, Toronto..."
4,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"One Georgian Drive, Toronto, ON, L4M 3X9"


## Extracting City

In [6]:
cities = [x[1] for x in df.Address.str.split(',').to_list()]
df['City'] = pd.Series(cities)
df.City

0     Belleville
1        Toronto
2         Oshawa
3        Toronto
4        Toronto
Name: City, dtype: object

## Applying One-Hot Encoding

In [7]:
ohe = pd.get_dummies(df.City)
ohe

Unnamed: 0,Belleville,Oshawa,Toronto
0,True,False,False
1,False,False,True
2,False,True,False
3,False,False,True
4,False,False,True


In [8]:
df = pd.concat([df, ohe], axis = 1)
df

Unnamed: 0,College,URL,Title,Email,Phone,Address,City,Belleville,Oshawa,Toronto
0,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"376 Wallbridge-Loyalist Road, Belleville, ON, ...",Belleville,True,False,False
1,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"1750 Finch Avenue, Toronto, ON, M2J 2X5",Toronto,False,False,True
2,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"2000 Simcoe Street North, Oshawa, ON, L1G 0C5",Oshawa,False,True,False
3,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"Enrolment Services Centennial College, Toronto...",Toronto,False,False,True
4,Georgian,https://www.georgiancollege.ca/about-georgian/...,No Title Found,inquire@georgiancollege.ca,333.3333333,"One Georgian Drive, Toronto, ON, L4M 3X9",Toronto,False,False,True
