In [1]:
# CSV writing with correct quoting of fields that contain commas (comes with Python by default)
import csv

# For adding a time stamp to the names of the saved data files
import datetime as dt

# Regular expressions (comes with Python by default so does not need to be included in requirements file)
import re

# Package for HTTP requests
# (Installed as standard with Anaconda, otherwise it will usually need to be installed)
import requests as rq

In [2]:
# Fetch the CAO points page.
# The timeout stops the cell from hanging indefinitely if the server never
# answers, and raise_for_status() aborts on an HTTP error (4xx/5xx) so that an
# error page is not saved and parsed as if it were the points list.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
resp.raise_for_status()

In [3]:
# Current date and time, used as a time stamp for the saved files
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS so it can be embedded in a file name
nowstr = format(now, '%Y%m%d_%H%M%S')

In [4]:
# Where the untouched HTML download is stored; the time stamp keeps each run's file separate
path = f'data/cao2021new_{nowstr}.html'

In [5]:
# To fix wrong encoding:
original_encoding = resp.encoding

# Change to cp1252
resp.encoding = 'cp1252'

In [6]:
# Save the original html file.
# The encoding is given explicitly: without it open() falls back to the
# platform default, so the saved file would differ between machines and the
# write can fail on characters the default code page cannot represent.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [1]:
# To loop through the lines one by one, using iter_lines() function

# The regular expressions package makes it possible to find a defined pattern of text.
# Looking at the data, all relevant lines containing CAO points information begin with a course identifier consisting of two letters and three digits.
# For example __AL801__ Software Design for Virtual Reality and Gaming
# The match() function looks for lines where the beginning of the line matches the regular expression pattern
# fullmatch() function looks for all lines where the whole string matches the regular expression pattern
# see https://docs.python.org/3/library/re.html 
# [A-Z] = any capital letters, {2} is a quantifier, so any two capital letters
# . (dot) is the wildcard: it matches any single character (letter, digit, punctuation, etc.) except a newline
# * is a quantifier (0 or more)
# + is a quantifier (1 or more)
# ? is a quantifier (0 or 1)
# \* matches a literal asterisk (the backslash escapes it; the letter r in front of the pattern keeps Python from interpreting the backslash itself)

# Round brackets group characters , for example ([A-Z]{2}[0-9]{3}) = group 1
# Square brackets = Anything in square brackets [A-Z] = any capital letter from A-Z
# Curly brackets are a quantifier
# line.decode() converts a bytes object into a string; the encoding passed to it (cp1252 in this notebook) determines how special characters such as letters with accents are decoded

# Compiling the regular expression for matching lines so that it can be reused when matching the lines
# The letter r at the beginning makes the literal a raw string, meaning that backslashes are not interpreted by Python, so \* reaches the regular expression engine unchanged
# re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

In [7]:
# A course line is a five-character course code (two capital letters followed
# by three digits, captured as group 1) and then the rest of the line (group 2).
course_pattern = r'([A-Z]{2}[0-9]{3})(.*)'

# Compile once so the same pattern object can be reused for every line
re_course = re.compile(course_pattern)

In [25]:
def points_to_array(s):
    """Split one points entry into its points, portfolio and random parts.

    Parameters
    ----------
    s : str
        A single points entry as it appears in the list, for example
        '300', '451*' or '#904'. May be an empty string when no points
        are listed for the round.

    Returns
    -------
    list of str
        [points, portfolio, random] where points is every digit found in
        s joined together, portfolio is '#' if s starts with '#' (else '')
        and random is '*' if s ends with '*' (else '').
    """
    # startswith()/endswith() are safe on an empty string, whereas indexing
    # s[0] or s[-1] raises IndexError when a round has no points listed.
    portfolio = '#' if s.startswith('#') else ''
    random = '*' if s.endswith('*') else ''
    # Keep only the digits, dropping the '#' and '*' markers
    points = ''.join(ch for ch in s if ch.isdigit())

    return [points, portfolio, random]

In [38]:
def points_to_array(s):
    """Break a points entry into [points, portfolio, random].

    s is one entry from the points list (e.g. '300', '451*', '#904', or an
    empty string). The result holds the digits of s, then '#' when s begins
    with '#', then '*' when s ends with '*'; parts that do not apply are ''.
    """
    # An empty entry has no first or last character to inspect
    if not len(s):
        return ['', '', '']

    leading_hash = '#' if s[0] == '#' else ''
    trailing_star = '*' if s[-1] == '*' else ''
    digits = ''.join(filter(str.isdigit, s))
    return [digits, leading_hash, trailing_star]

In [9]:
# Destination of the csv output. This reuses the name `path`, so the HTML path
# assigned earlier is no longer available after this cell runs.
# NOTE(review): 'cao02021' and the missing separator before the time stamp look
# like typos - confirm before renaming, in case other code expects this name.
path = f'data/cao02021_new_csv{nowstr}.csv'

In [10]:
# Keep track of how many courses we process
no_lines = 0

# Open the csv file for writing.
# - encoding is explicit so the file does not depend on the platform default
# - newline='' leaves line endings to the csv writer
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that contains a comma, so a course title
    # with a comma in it stays in one column; ','.join() would split it and
    # give that row more columns than the others.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through the lines of the response
    for line in resp.iter_lines():
        # Decode the raw bytes as cp1252
        dline = line.decode('cp1252')
        # Match only the lines representing courses
        if re_course.fullmatch(dline):
            # Add one to the lines counter
            no_lines += 1
            # The course code
            course_code = dline[:5]
            # The course title (fixed-width column)
            course_title = dline[7:57]
            # Round one and two points
            course_points = re.split(' +', dline[60:])
            print(course_points)
            # Pad to two entries so a line with fewer than two points
            # fields cannot raise IndexError
            round_one, round_two = (course_points + ['', ''])[:2]
            # Write the fields as one csv row
            writer.writerow([course_code, course_title, round_one, round_two])

# Print the total number of processed lines
print(f"Total number of lines is {no_lines}.")

['300', '']
['313', '']
['350', '']
['321', '']
['328', '']
['', '']
['327', '']
['451*', '444']
['440*', '431']
['356', '']
['346', '']
['357', '']
['324', '']
['325', '']
['346', '']
['477', '476*']
['338', '']
['306', '']
['297', '']
['309', '']
['302', '']
['336', '']
['300', '299']
['309', '']
['304', '']
['308', '']
['301', '']
['#575', '']
['#747', '']
['306', '']
['484*', '467*']
['307', '']
['260', '']
['#700', '']
['292', '']
['250', '']
['270', '']
['270', '']
['266', '']
['307', '']
['430', '423']
['388', '']
['451', '']
['272', '']
['295', '']
['293', '']
['292', '']
['291', '']
['260', '251']
['291', '283']
['465', '']
['330', '328']
['280', '']
['371', '359']
['318', '']
['292', '']
['246', '']
['290', '']
['360', '358']
['247', '']
['269', '']
['#700', '']
['272', '']
['270', '']
['319', '']
['263', '235']
['262', '230']
['243', '224']
['443', '']
['431', '']
['434', '']
['396', '']
['336', '']
['390', '']
['365', '']
['#904', '#904']
['#1028', '']
['#525', '']
['350', 

<br>
**NB.**: It was verified as of 03/11/2021 that there were exactly 949 courses on the CAO 2021 points list

***

In [37]:
# Keep track of how many courses we process
no_lines = 0

# Open the csv file for writing.
# - encoding is explicit so the file does not depend on the platform default
# - newline='' leaves line endings to the csv writer
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that contains a comma, so a course title
    # with a comma in it stays in one column; ','.join() would split it and
    # give that row more columns than the others.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through the lines of the response
    for line in resp.iter_lines():
        # Decode the raw bytes as cp1252
        dline = line.decode('cp1252')
        # Match only the lines representing courses
        if re_course.fullmatch(dline):
            # Add one to the lines counter
            no_lines += 1
            # The course code
            course_code = dline[:5]
            # The course title, strip() removes white spaces
            course_title = dline[7:57].strip()
            # Round one and two points
            course_points = re.split(' +', dline[60:])
            print(course_points)
            print(course_points[0])
            # Pad to two entries so a line with fewer than two points
            # fields cannot raise IndexError
            round_one, round_two = (course_points + ['', ''])[:2]
            # Each round becomes three fields: points, portfolio, random
            points_1 = points_to_array(round_one)
            points_2 = points_to_array(round_two)
            # Comma-joined forms, kept for the progress output below
            course_points_1 = ",".join(points_1)
            course_points_2 = ",".join(points_2)
            print(course_points_1, course_points_2)

            linesplit = [course_code, course_title, course_points_1, course_points_2]
            print(linesplit)
            # Write the eight individual fields; passing the pre-joined
            # strings would make the writer quote them as single columns
            writer.writerow([course_code, course_title, *points_1, *points_2])

# Print the total number of processed lines
print(f"Total number of lines is {no_lines}.")

['300', '']
300
300,, ,,
['AL801', 'Software Design for Virtual Reality and Gaming', '300,,', ',,']
['313', '']
313
313,, ,,
['AL802', 'Software Design in Artificial Intelligence for Clo', '313,,', ',,']
['350', '']
350
350,, ,,
['AL803', 'Software Design for Mobile Apps and Connected Devi', '350,,', ',,']
['321', '']
321
321,, ,,
['AL805', 'Computer Engineering for Network Infrastructure', '321,,', ',,']
['328', '']
328
328,, ,,
['AL810', 'Quantity Surveying', '328,,', ',,']
['', '']

,, ,,
['AL811', 'Civil Engineering', ',,', ',,']
['327', '']
327
327,, ,,
['AL820', 'Mechanical and Polymer Engineering', '327,,', ',,']
['451*', '444']
451*
451,,* 444,,
['AL830', 'General Nursing', '451,,*', '444,,']
['440*', '431']
440*
440,,* 431,,
['AL832', 'Mental Health Nursing', '440,,*', '431,,']
['356', '']
356
356,, ,,
['AL835', 'Pharmacology', '356,,', ',,']
['346', '']
346
346,, ,,
['AL836', 'Nutrition and Health Science', '346,,', ',,']
['357', '']
357
357,, ,,
['AL837', 'Sports Science wit