# Scraping UNSW Courses

In [213]:
from bs4 import BeautifulSoup # BeautifulSoup is a Python library for pulling data out of HTML and XML files
import urllib.request # Python module for fetching URLs
from IPython.display import HTML
import pandas as pd

In [214]:
# Download the 2020 subject-search index page. Reading inside a `with` block
# closes the HTTP response as soon as the body has been consumed instead of
# leaving that to garbage collection.
with urllib.request.urlopen('http://timetable.unsw.edu.au/2020/subjectSearch.html') as response:
    html = response.read()

# Parse the raw HTML with the lxml parser
soup = BeautifulSoup(html, "lxml")

# Preview the first 100 characters of the pretty-printed document
print(soup.prettify()[0:100])

<html>
 <head>
  <title>
   Class Search by Teaching Period
  </title>
  <link href="../layout/2020/


In [215]:
# Collect the partial link of every subject-area page found on the index page.
#
# Rows are read from each 'formBody' cell, first the 'rowLowlight' rows and
# then the 'rowHighlight' rows. Only anchors that actually carry an href are
# kept: an anchor without one would contribute None, which the scraping loop
# would later turn into a request for '<base url>None'.
subjectHrefs = []
for list_td in soup.find_all('td', attrs={'class': "formBody"}):
    for rowClass in ("rowLowlight", "rowHighlight"):
        for list_tr in list_td.find_all('tr', attrs={'class': rowClass}):
            for list_a in list_tr.find_all('a', href=True):
                subjectHrefs.append(list_a['href'])

# Build the one-column DF (column '0'), remove duplicates and sort it in
# alphanumeric order. Naming the column at construction time means an empty
# scrape produces an empty DF instead of a length-mismatch error when the
# column label is assigned afterwards.
endLinks = (
    pd.DataFrame({'0': subjectHrefs})
    .drop_duplicates()
    .sort_values(by='0')
)

In [217]:
# Scrape every course of every subject area into one record per course.
#
# Each record is assembled in one place: the listing row supplies the course
# code, name and units of credit, and the detail page linked from that same row
# supplies faculty, school, campus and career. Because all fields of a course
# travel together, sorting or de-duplicating the table can never attach the
# detail values to a different course.

BASE_URL = 'http://timetable.unsw.edu.au/2020/'
ROW_CLASSES = ('rowLowlight', 'rowHighlight')
UNIVERSITY = 'UNSW'

# Output column -> label cell that precedes its value on a course detail page.
# The trailing space is part of the label text as it appears on the page.
DETAIL_LABELS = {
    'Faculty': 'Faculty ',
    'School': 'School ',
    'Campus': 'Campus ',
    'Career': 'Career ',
}

COLUMNS = ['Course Code', 'Course Name', 'UoC', 'Subject Area', 'University',
           'Faculty', 'School', 'Campus', 'Career']


def _fetch_soup(url):
    """Download `url` and return it parsed with lxml; the response is closed."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), 'lxml')


def _form_cells(page):
    """Return the text of every <td> nested inside the page's 'formBody' cells."""
    return [cell.text
            for body in page.find_all('td', attrs={'class': 'formBody'})
            for cell in body.find_all('td')]


def _value_after(cells, label):
    """Return the cell that follows the first cell equal to `label`.

    Returns None when the label is absent (or is the last cell), so a single
    incomplete page leaves a gap in the table instead of aborting the scrape.
    """
    try:
        return cells[cells.index(label) + 1]
    except (ValueError, IndexError):
        return None


def _course_rows(area_page):
    """Yield (data_cell_texts, href) for each listing row of a subject-area page.

    Rows are visited per 'formBody' cell, 'rowLowlight' rows first and then
    'rowHighlight' rows. `href` is the first link found in the row, or None.
    """
    for body in area_page.find_all('td', attrs={'class': 'formBody'}):
        for row_class in ROW_CLASSES:
            for row in body.find_all('tr', attrs={'class': row_class}):
                cells = [cell.text
                         for cell in row.find_all('td', attrs={'class': 'data'})]
                anchor = row.find('a', href=True)
                yield cells, (anchor['href'] if anchor is not None else None)


def _course_details(href):
    """Fetch a course detail page and return its faculty/school/campus/career."""
    if href is None:
        return dict.fromkeys(DETAIL_LABELS)
    detail_cells = _form_cells(_fetch_soup(BASE_URL + str(href).strip()))
    return {column: _value_after(detail_cells, label)
            for column, label in DETAIL_LABELS.items()}


# Course code -> record. The first occurrence of a code wins, and a code that
# has already been recorded is not fetched again.
records = {}

# Scan website links for all different subject areas
for area_link in endLinks.iloc[:, 0].dropna():
    area_page = _fetch_soup(BASE_URL + str(area_link).strip())
    subject_area = _value_after(_form_cells(area_page), 'Subject Area ')

    for cells, href in _course_rows(area_page):
        # A course row carries Course Code, Course Name and Units of Credit
        # as its first three 'data' cells; anything shorter is not a course.
        if len(cells) < 3:
            continue
        course_code, course_name, units = cells[:3]
        if course_code in records:
            continue

        record = {
            'Course Code': course_code,
            'Course Name': course_name,
            'UoC': units,
            'Subject Area': subject_area,
            'University': UNIVERSITY,
        }
        record.update(_course_details(href))
        records[course_code] = record

# DF Formatting: fixed column order, sorted by course code, fresh index
df = (
    pd.DataFrame(list(records.values()), columns=COLUMNS)
    .sort_values(by='Course Code')
    .reset_index(drop=True)
)

print(df)

     Course Code                                 Course Name UoC  \
0       ACCT1501      Accounting and Financial Management 1A   6   
1       ACCT1511      Accounting and Financial Management 1B   6   
2       ACCT2101                        Industry Placement 1  12   
3       ACCT2522                     Management Accounting 1   6   
4       ACCT2542  Corporate Financial Reporting and Analysis   6   
...          ...                                         ...  ..   
4328    ZZSC5806     Regression Analysis for Data Scientists   6   
4329    ZZSC5836            Data Mining and Machine Learning   6   
4330    ZZSC5855   Multivariate Analysis for Data Scientists   6   
4331    ZZSC5905   Statistical Inference for Data Scientists   6   
4332    ZZSC9001                 Foundations of Data Science   6   

             Subject Area University               Faculty  \
0              Accounting       UNSW  UNSW Business School   
1              Accounting       UNSW  UNSW Business School 

In [218]:
# Save the course table as an Excel workbook (default sheet name, index
# column included); pandas opens and closes the writer for the given path.
df.to_excel('UNSW Course List.xlsx')

In [None]:
# Converting DataFrame to dictionary ({column -> {index -> value}}).
# 'df' is not a valid `orient`; older pandas only accepted it because it
# prefix-matched 'dict', and current pandas raises ValueError for it.
dictDF = df.to_dict(orient='dict')