# CAO Points Notebook

***

Import all the necessary packages

In [1]:
# Package for HTTP requests
import requests as rq

# Regular expressions package
import re

# Dates and time
import datetime as dt

In [2]:
# Address of the CAO points page to download
url = 'http://www2.cao.ie/points/l8.php'

# Fetch the CAO points URL. A timeout is passed because requests otherwise
# waits indefinitely when the server stops responding.
resp = rq.get(url, timeout=30)

# Fail here on an HTTP error status (4xx/5xx) rather than saving and
# parsing an error page further down.
resp.raise_for_status()

# Display the response object as the cell output
resp

<Response [200]>

<br>

## Save the original data
***

In [3]:
# Capture the current local date and time
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS so it can be embedded in file names
now_string = format(now, '%Y%m%d_%H%M%S')

In [4]:
# Path of the file that receives the original HTML, stamped with the
# current timestamp
filepath = f'data/cao2021_{now_string}.html'

In [5]:
# The encoding the page has to be decoded with
new_encoding = 'cp1252'

# The server reports the wrong encoding; remember what it reported
orig_encoding = resp.encoding

# Override the reported encoding with cp1252
resp.encoding = new_encoding

In [6]:
# Save the original page as an HTML file.
# The encoding is given explicitly so the write does not depend on the
# platform default; UTF-8 can represent every character in resp.text.
with open(filepath, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [7]:
# Pattern that recognises a course code: two capital letters followed by
# three digits
re_all_courses = re.compile(r'[A-Z]{2}[0-9]{3}')

# Pattern for a complete course line: code, two spaces, title, two spaces,
# a three-digit figure with an optional '*', an optional second three-digit
# figure preceded by one space, then any trailing spaces
re_courses = re.compile(
    r'([A-Z]{2}[0-9]{3})  (.*)  ([0-9]{3}\*?)( [0-9]{3})? *'
)

In [33]:
# Define function for splitting the lines

def split_dline(input_line):
    """Convert one course line into a single line of CSV.

    The line is split on runs of three or more spaces (two are not enough,
    because some course titles contain two consecutive spaces). The first
    piece holds the course code and the title, e.g.
    'AL801  Software Design for Virtual Reality and Gaming'; every further
    piece becomes its own CSV field.

    Fields are written with the csv module, so a field containing a comma
    or a double quote is quoted instead of silently adding extra columns.
    Fields without such characters are written exactly as they are.

    Parameters:
        input_line: the decoded text of one course line.

    Returns:
        The CSV row as a string terminated by a newline character.
    """
    # Function-scope imports keep this definition self-contained.
    import csv
    import io

    columns = re.split('   +', input_line)

    # First piece is 'CODE  Title': the code is the first 5 characters and
    # the title starts at index 7, after the two separating spaces.
    code_title = columns[0]
    code = code_title[:5]
    title = code_title[7:]

    # When nothing follows the title, still emit one empty trailing field so
    # the row keeps the 'code,title,' shape.
    rest = columns[1:] or ['']

    buffer = io.StringIO()
    csv.writer(buffer, lineterminator='\n').writerow([code, title, *rest])
    return buffer.getvalue()

In [34]:
# Create a path for the csv file, stamped with the current timestamp
filepath = 'data/cao2021_' + now_string + '.csv'

# Number of matching (course) lines written to the csv file
no_lines = 0

# Open the csv file for writing. The encoding is given explicitly so the
# output does not depend on the platform default; the decoded text may
# contain non-ASCII characters.
with open(filepath, 'w', encoding='utf-8') as f:
    # Loop through the lines of the response content; each line is decoded
    # explicitly with the corrected encoding.
    for line in resp.iter_lines():
        dline = line.decode(new_encoding)

        # Check if the line starts with two capital letters followed by
        # three numbers: ('[A-Z]{2}[0-9]{3}')
        matched = re_all_courses.match(dline)
        if matched:
            f.write(split_dline(dline))

            # Count the number of matching lines
            no_lines += 1

print("Number of courses found {}".format(no_lines))

Number of courses found 949


## References

# End
