## Task 1

### (a) Crawl Web Page

In [20]:
from requests import get

# SFU Computing Science faculty listing page that the rest of Task 1 parses.
url = 'https://www.sfu.ca/computing/people/faculty.html'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of saving an error page as if it were data.
response = get(url, timeout=30)
response.raise_for_status()

def write_to_file(response_text, file_name):
    """Save downloaded page text under the notebook's ``pages`` prefix.

    The prefix belongs on the *path*: the cells that read these files open
    ``'pages\\\\' + file_name``. The previous version prepended the prefix to
    the file content instead, so the file ended up in the wrong place and
    started with junk text.

    Args:
        response_text: Text to store (the body of an HTTP response).
        file_name: Name of the file to create under the ``pages`` prefix.
    """
    import os

    # Spelled exactly like the readers in this notebook spell it.
    output_path = 'pages\\' + file_name

    # Create the target folder on platforms where the backslash is a path
    # separator; elsewhere dirname() is empty and nothing needs creating.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager flushes and closes the handle even on error.
    with open(output_path, 'w') as output_file:
        output_file.write(response_text)
    
# Keep a local copy of the faculty listing page for the extraction step.
write_to_file(response_text=response.text, file_name='csfaculty.html')

### (b) Extract Structured Data

In [21]:
from bs4 import BeautifulSoup
import pandas as pd

# Read the saved listing page. The context manager closes the handle, which
# the bare open() call never did.
with open('pages\\csfaculty.html', 'r') as file:
    file_content = file.read()

# Every <div class="textimage section"> is treated as one faculty entry by the
# extraction loop that follows.
html_soup = BeautifulSoup(file_content, 'html.parser')
faculty_container = html_soup.find_all('div', class_ = 'textimage section')

# One accumulator per output column; each gets its own independent list.
faculty_names, faculty_ranks, faculty_areas = [], [], []
faculty_profiles, faculty_homepages = [], []

def prep_string(input_string):
    """Normalise a label for comparison.

    Splits on any whitespace, joins the pieces with single underscores and
    upper-cases the result, so differences in spacing or letter case between
    link labels do not matter.
    """
    words = input_string.split()
    joined = "_".join(words)
    return joined.upper()
    
from urllib.parse import urljoin

# Site root used to turn relative profile links into absolute URLs.
SFU_BASE_URL = 'http://www.sfu.ca'

for faculty in faculty_container:
    # The <h4> carries the name followed by the rank.
    heading = faculty.h4
    if heading is None:
        # No name heading means this container is not a faculty entry.
        # Skipping it keeps all column lists the same length.
        continue

    # Newlines are treated like commas so "name\nrank" and "name, rank" both split.
    name_and_rank = heading.text.replace('\n', ',').split(',')

    # get faculty name
    faculty_name = name_and_rank[0].strip()
    faculty_names.append(faculty_name.title())

    # get faculty rank; a heading without a separator has no rank part, so
    # record it as empty instead of raising IndexError
    faculty_rank = name_and_rank[1].strip() if len(name_and_rank) > 1 else ''
    faculty_ranks.append(faculty_rank.title())

    # get areas: the first <p> holds "Area: a; b; c" when it exists
    area = ''
    area_paragraph = faculty.p
    if area_paragraph is not None:
        area = area_paragraph.text.title().replace('Area:', '').strip()
    area = area.replace(';', ',')
    faculty_areas.append(area)

    # get contact information and home page links
    links = faculty.find_all('a', class_ = '')

    profile_link = ''
    homepage_link = ''

    # the links carry no distinguishing HTML attribute, so they are told
    # apart by their visible label
    for link in links:

        # anchors without an href cannot contribute a URL
        if not link.has_attr('href'):
            continue

        label = prep_string(link.text)

        if label == 'PROFILE_&_CONTACT_INFORMATION':
            # urljoin resolves relative paths against the site root and leaves
            # absolute URLs (http or https) untouched. The previous prefix
            # test glued the root onto any href that did not start with
            # exactly 'http://www.sfu.ca/'.
            profile_link = urljoin(SFU_BASE_URL, link['href'])

        elif label == 'HOME_PAGE':
            homepage_link = link['href']

    faculty_profiles.append(profile_link)
    faculty_homepages.append(homepage_link)


# Assemble the scraped columns into one table, one row per faculty entry.
df_faculty = pd.DataFrame(dict(
    name=faculty_names,
    rank=faculty_ranks,
    areas=faculty_areas,
    profile=faculty_profiles,
    homepage=faculty_homepages,
))

# Saved without the row index so the CSV holds only the five data columns.
df_faculty.to_csv('csfaculty.csv', index=False)

### Interesting Finding

In [22]:
from dataprep.eda import plot
import pandas as pd

# Load the scraped faculty table and hand the whole frame to dataprep's plot().
df = pd.read_csv(filepath_or_buffer="csfaculty.csv")
plot(df)

## Task 2

In [32]:
from requests import get 
from time import sleep
from random import randint


# Reload the faculty table that Task 1 saved to disk.
df_faculty = pd.read_csv(filepath_or_buffer='csfaculty.csv')


# function to save file
def write_to_file(response_text, file_name):
    """Save downloaded page text under the notebook's ``pages`` prefix.

    The prefix belongs on the *path*: the cell that parses the profile pages
    opens ``'pages\\\\' + file_name``. The previous version prepended the
    prefix to the file content instead, so the file ended up in the wrong
    place and started with junk text.

    Args:
        response_text: Text to store (the body of an HTTP response).
        file_name: Name of the file to create under the ``pages`` prefix.
    """
    import os

    # Spelled exactly like the readers in this notebook spell it.
    output_path = 'pages\\' + file_name

    # Create the target folder on platforms where the backslash is a path
    # separator; elsewhere dirname() is empty and nothing needs creating.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager flushes and closes the handle even on error.
    with open(output_path, 'w') as output_file:
        output_file.write(response_text)

    
# iterate through faculties
# Saved file names are whatever follows this prefix in the profile URL. The
# prefix is 43 characters long, which is where the former hard-coded [43:]
# slice came from.
PROFILE_URL_PREFIX = 'http://www.sfu.ca/computing/people/faculty/'

# Seconds to wait for a response before giving up on a profile page.
REQUEST_TIMEOUT = 30

# visit only the faculty rows that actually have a profile link
for url in df_faculty['profile'].dropna():

    # get profile page; the timeout keeps one stalled request from hanging
    # the whole crawl
    response = get(url, timeout=REQUEST_TIMEOUT)

    # write to output, named after the part of the URL behind the prefix
    write_to_file(response.text, url[len(PROFILE_URL_PREFIX):])

    # pause a random 1-4 seconds between requests
    sleep(randint(1, 4))

print('faculty pages saved successfuly.')

Yagiz Aksoy html page saved.
Brad Bart html page saved.
Angel Chang html page saved.
Sheelagh Carpendale html page saved.
Leonid Chindelevitch html page saved.
Diana Cukierman html page saved.
James P. Delgrande html page saved.
Toby Donaldson html page saved.
John Edgar html page saved.
Martin Ester html page saved.
Brian Fraser html page saved.
Yasutaka Furukawa html page saved.
Uwe Glässer html page saved.
Mohamed Hefeeda html page saved.
Harinder Singh Khangura html page saved.
Anne Lavergne html page saved.
Maxwell Libbrecht html page saved.
Jiangchuan (Jc) Liu html page saved.
David Mitchell html page saved.
Jian Pei html page saved.
Fred Popowich html page saved.
Arrvindh Shriraman html page saved.
William (Nick) Sumner html page saved.
Manolis Savva html page saved.
Igor Shinkar html page saved.
Ping Tan html page saved.
Eugenia Ternovska html page saved.
Keval Vora html page saved.
Ke Wang html page saved.
Tianzheng Wang html page saved.
Kay C. Wiese html page saved.
Kangkang 

In [69]:
from time import sleep
from random import randint 
from bs4 import BeautifulSoup
from requests import get 
import pandas as pd 


# Reload the faculty table that Task 1 saved to disk.
df_faculty = pd.read_csv(filepath_or_buffer='csfaculty.csv')


# this function finds the first graduation year from <li> tags
def _min_grad_year(entries):
    """Return the earliest graduation year found in ``entries``, or 9999.

    Shared parser for both public helpers so list items and paragraph lines
    are interpreted identically.

    Args:
        entries: Iterable of strings, one education record each.

    Returns:
        The smallest year found, or the sentinel 9999 when no entry yields one.
    """
    grad_year_min = 9999

    for entry in entries:
        # Periods are dropped first so a trailing "1999." still ends in digits.
        # Only the text after the last comma is inspected.
        last_field = entry.replace('.', '').split(',')[-1].strip()

        # The year is expected as the final four characters of that field.
        candidate = last_field[-4:]

        # Require a full four-character year. isdecimal() (unlike isdigit())
        # only accepts characters that int() is able to parse.
        if len(candidate) == 4 and candidate.isdecimal():
            grad_year_min = min(grad_year_min, int(candidate))

    return grad_year_min


# this function finds the first graduation year from <li> tags
def get_min_gradyear_from_li(educations_list):
    """Return the earliest graduation year among ``<li>`` education items.

    Args:
        educations_list: Iterable of objects exposing a ``.text`` string, one
            per education record.

    Returns:
        The smallest year found, or 9999 when none of the items ends in a year.
    """
    return _min_grad_year(education.text for education in educations_list)


# this function finds the first graduation year from <p> tag
def get_min_gradyear_from_p(educations_paragraph):
    """Return the earliest graduation year in a paragraph of education lines.

    Args:
        educations_paragraph: Object exposing a ``.text`` string whose lines
            (separated by newlines) are individual education records, or
            ``None`` when the section has no paragraph.

    Returns:
        The smallest year found, or 9999 when there is no paragraph or no line
        ends in a year.
    """
    # The caller passes ``parent.p``, which is None when no <p> exists.
    if educations_paragraph is None:
        return 9999

    return _min_grad_year(educations_paragraph.text.split('\n'))


# arrays of data to generate
# One accumulator per output column: faculty name and earliest graduation year.
faculty_names, faculty_gradyears = [], []


# iterate through faculties
for index, row in df_faculty.iterrows():

    # add faculty name to array
    faculty_names.append(row['name'])

    # 9999 is the "no graduation year found" sentinel
    grad_year_min = 9999

    # if faculty has profile page then start process
    if (not pd.isnull(row.profile)):

        # The saved file is named after everything behind the 43-character
        # prefix 'http://www.sfu.ca/computing/people/faculty/' of the URL.
        file_name = row['profile'][43:]

        # the context manager closes the handle; the bare open() leaked one
        # file handle per faculty member
        with open('pages\\' + file_name, 'r') as file:
            file_content = file.read()

        # parse html
        html_soup = BeautifulSoup(file_content, 'html.parser')

        # candidate content sections of the profile page
        sections = html_soup.find_all('div', 'text parbase section')

        for section in sections:

            # look through the section's <h2> titles for 'Education'
            for header in section.find_all('h2'):

                if header.text.strip().title() != 'Education':
                    continue

                # Use the parent of the header that matched. The previous
                # code took section.h2, i.e. the first <h2> in the section,
                # which is a different element whenever 'Education' is not
                # the first title.
                parent = header.parent

                # some pages list education as <li> items, others as a <p>
                educations = parent.find_all('li')

                if len(educations) > 0:
                    grad_year_min = get_min_gradyear_from_li(educations)
                elif parent.p is not None:
                    # only parse a paragraph that exists; with neither <li>
                    # nor <p> the value found so far is kept
                    grad_year_min = get_min_gradyear_from_p(parent.p)

    # add grad year to array
    faculty_gradyears.append(grad_year_min)
    

# create pandas data frame
# Two-column table: faculty name and the earliest graduation year found.
df_gradyear = pd.DataFrame(dict(name=faculty_names, gradyear=faculty_gradyears))

# 9999 is the "no year found" sentinel; store it as an empty value instead.
df_gradyear['gradyear'] = df_gradyear['gradyear'].apply(
    lambda year: '' if year == 9999 else year
)

# save dataframe to output (the row index is written as the first column)
df_gradyear.to_csv('faculty_grad_year.csv')

# print job completed
print('data saved to faculty_grad_year.csv')

data saved to faculty_grad_year.csv


In [68]:
from dataprep.eda import plot
import pandas as pd

# NOTE(review): 2020 + 23 looks like "reference year + assumed age at first
# graduation" -- confirm the intended meaning of both constants.
df = pd.read_csv("faculty_grad_year.csv").assign(
    age=lambda frame: 2020 + 23 - frame["gradyear"]
)

plot(df, "age")