# CAO Points Notebook

***

Import all the necessary packages

In [1]:
# Package for HTTP requests
import requests as rq

# Regular expressions package
import re

# Import Pandas package
import pandas as pd

# Dates and time
import datetime as dt

# For comparing sequences (string for example)
import difflib

# For downloading files from the web
import urllib.request as urlrq

#### Create a now_string

This variable stores the current date and time as a string. It is used as a timestamp in the file names when saving the original data files.

In [25]:
# Get the current local date and time
now = dt.datetime.now()

# Format it as YYYYMMDD_HHMMSS so it can be embedded in file names
now_string = now.strftime('%Y%m%d_%H%M%S')

### Read original data from the CAO website

#### Read the 2019 points


1. Points for 2019 year downloaded from http://www.cao.ie/index.php?page=points&p=2019 in .pdf file format
2. PDF file opened using Adobe Acrobat DC
3. Exported to .xlsx file format using Acrobat DC
4. Pandas DataFrame created from the .xlsx format


In [3]:
# Load the 2019 points from the locally exported spreadsheet, skipping the first 10 rows.
# A forward slash is used as the path separator: it works on every platform and
# avoids the invalid '\c' escape sequence that a backslash would create here.
df19 = pd.read_excel('data/cao2019_20211129_180145.xlsx', skiprows=10)

In [4]:
# Display the 2019 dataframe (long frames are shown truncated)
df19

Unnamed: 0,Course Code,Unnamed: 1,INSTITUTION and COURSE,Unnamed: 3,EOS,Mid
0,,,Athlone Institute of Technology,,,
1,AL801,,Software Design with Virtual Reality and Gaming,,304,328.0
2,AL802,,Software Design with Cloud Computing,,301,306.0
3,AL803,,Software Design with Mobile Apps and Connected...,,309,337.0
4,AL805,,Network Management and Cloud Infrastructure,,329,442.0
...,...,...,...,...,...,...
960,WD200,,Arts (options),,221,296.0
961,WD210,,Software Systems Development,,271,329.0
962,WD211,,Creative Computing,,275,322.0
963,WD212,,Recreation and Sport Management,,274,311.0


In [5]:
# List the column names of the 2019 dataframe
df19.columns

Index(['Course Code', 'Unnamed: 1', 'INSTITUTION and COURSE', 'Unnamed: 3',
       'EOS', 'Mid'],
      dtype='object')

In [12]:
# Discard the two unnamed columns, keeping only the named ones
df19 = df19.drop(columns=['Unnamed: 1', 'Unnamed: 3'])

In [15]:
# Rows without a course code carry the institution names, so selecting them
# gives every institution together with its position in the original dataframe.

# As per: https://datatofish.com/rows-with-nan-pandas-dataframe/#:~:text=%20Steps%20to%20select%20all%20rows%20with%20NaN,NaN%20under%20a%20single%20DataFrame%20column%20More%20
hei = df19.loc[df19['Course Code'].isna()]

#### Read the 2021 points

In [2]:
# URL of the page with the 2021 CAO points
url = 'http://www2.cao.ie/points/l8.php'

# Fetch the CAO points URL.
# A timeout (in seconds) is passed so that the cell fails with an exception
# instead of waiting forever if the server never answers.
resp = rq.get(url, timeout=30)

# Display the response object to check the HTTP status code
resp

<Response [200]>

#### Read the 2020 data

In [23]:
# Read the 2020 points spreadsheet directly from the CAO website.
# NOTE: despite the '21' suffix in their names, url21 and df21 refer to the 2020 file.
url21 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'
# Skip the first 10 rows of the sheet when loading it
df21 = pd.read_excel(url21, skiprows=10)

##### Download the original 2020 file to the disk

In [28]:
# Build a timestamped file path for the local copy of the original 2020 spreadsheet
filepath = 'data/cao2020_' + now_string + '.xlsx'

In [32]:
# Download the spreadsheet at url21 and store it at filepath
urlrq.urlretrieve(url21, filepath)

('data/cao2020_20211128_151211.xlsx',
 <http.client.HTTPMessage at 0x253cde0df70>)

In [4]:
# Check the shape (rows, columns) of the dataframe
df21.shape

(1464, 23)

In [5]:
# Show the first 5 rows of the dataframe
df21.head()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,


The columns named Column1-Column8 serve no purpose, so they need to be removed. First I will list all the columns in the dataframe, and then use that list as a reference for removing them:

In [6]:
# List the columns in the dataframe
df21.columns

Index(['CATEGORY (i.e.ISCED description)', 'COURSE TITLE', 'COURSE CODE2',
       'R1 POINTS', 'R1 Random *', 'R2 POINTS', 'R2 Random*', 'EOS',
       'EOS Random *', 'EOS Mid-point', 'LEVEL', 'HEI', 'Test/Interview #',
       'avp', 'v', 'Column1', 'Column2', 'Column3', 'Column4', 'Column5',
       'Column6', 'Column7', 'Column8'],
      dtype='object')

In [7]:
# The dataframe has 23 columns and the last 8 (Column1-Column8) are not needed,
# so drop the columns at positions 15 to 22 (the slice end, 23, is exclusive).
unwanted_columns = df21.columns[15:23]
df21 = df21.drop(columns=unwanted_columns)

In [8]:
# Check the head of the dataframe to make sure the correct columns were removed
df21.head()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
0,Business and administration,International Business,AC120,209,,,,209,,280,8,American College,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,8,American College,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,


In [10]:
# Check the end of the dataframe to make sure all the rows were loaded
df21.tail()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,7,Waterford Institute of Technology,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,8,Waterford Institute of Technology,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,8,Waterford Institute of Technology,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,8,Waterford Institute of Technology,,,
1463,Engineering and engineering trades,Mechanical and Manufacturing Engineering,WD230,253,,,,253,,369,8,Waterford Institute of Technology,,,


In [18]:
# Spot check: make sure that Excel row 765 was imported correctly.
# Recalculate the index by subtracting 12: 10 for the skipped rows, 1 for the row used
# as the header, and 1 because pandas numbers rows from 0 while Excel numbers them from 1.
df21.iloc[765-12]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [21]:
# Check the unique values of the 'HEI' column
df21['HEI'].unique()

array(['American College', 'National College of Art and Design',
       'Athlone Institute of Technology', 'St. Angela`s College',
       'Irish College of Humanities & Applied Sciences',
       'University College Cork (NUI)', 'Marino Institute of Education',
       'Cork Institute of Technology', 'CCT College Dublin',
       'Institute of Technology, Carlow', 'Dublin Business School',
       'Dublin City University', 'Dundalk Institute of Technology',
       'Dun Laoghaire Institute of Art, Design and Technology',
       'University College Dublin (NUI)', 'Dorset College',
       'Galway-Mayo Institute of Technology', 'Galway Business School',
       'Griffith College', 'National University of Ireland, Galway',
       'ICD Business School', 'Limerick Institute of Technology',
       'University of Limerick', 'Letterkenny Institute of Technology',
       'Maynooth University', 'Mary Immaculate College',
       "Pontifical University, St Patrick's College",
       'National College of 

In [33]:
# Build a timestamped file path for the CSV export of the 2020 dataframe
# NOTE(review): there is no separator between 'pandas' and the timestamp - confirm this is intended
filepath = 'data/cao2020_pandas' + now_string + '.csv'

In [36]:
# Save the pandas dataframe as a CSV file, leaving out the index
df21.to_csv(filepath, index=False)

<br>

## Save the original data
***

In [None]:
# Build a timestamped file path for the original 2021 HTML page
filepath = 'data/cao2021_' + now_string + '.html'

In [None]:
# The server uses the wrong encoding, so we need to fix it.
# Keep the encoding that was originally set on the response
orig_encoding = resp.encoding

# We need to use the 'cp1252' encoding
new_encoding = 'cp1252'

# Change the response encoding to cp1252
resp.encoding = new_encoding

In [None]:
# Save the text of the original HTML page to the file at filepath
with open(filepath, 'w') as f:
    f.write(resp.text)

<br>

## Clean the data
***

In [None]:
# Compile a regular expression that matches a run of one or more digits
points = re.compile('[0-9]+')

In [None]:
def extract_points(chunk):
    """Separate the points number in a field from any markers around it.

    Parameters
    ----------
    chunk : str
        One field of a course line, e.g. a points value that may carry
        special-requirement markers such as * or #.

    Returns
    -------
    tuple of (str, str)
        The first run of digits found in chunk, and the ndiff entries by
        which chunk differs from that number (with the '+ ' prefix removed),
        joined by single spaces.

    Raises
    ------
    AttributeError
        If chunk contains no digits, because the regex search returns None.
    """
    found = points.search(chunk)
    number = found.group(0)

    # Compare the number with the full field character by character to pick
    # out whatever surrounds it.
    # Approach taken from:
    # https://stackoverflow.com/questions/17904097/python-difference-between-two-strings
    extras = []
    for entry in difflib.ndiff(number, chunk):
        # Entries starting with a space are common to both strings
        if entry[0] == ' ':
            continue
        extras.append(entry.replace('+ ', ''))

    return number, ' '.join(extras)

In [None]:
# Define the function for splitting the lines

def _round_fields(columns, position):
    """Return [points, special] for the column at position, or ["n/a", ""] if unavailable."""
    try:
        # Look the column up first so a missing column is reported as IndexError
        field = columns[position]
        pts, spec = extract_points(field)
    except (IndexError, AttributeError):
        # IndexError: the line has no column at this position.
        # AttributeError: extract_points found no digits in the column.
        return ["n/a", ""]
    return [pts, spec]


def split_dline(input_line):
    """Convert one course line into a comma separated line.

    The line is split on runs of 3 or more spaces (2 spaces are not enough,
    as some course descriptions contain 2 consecutive spaces). The first part
    holds the course code and title, in the format
    'AL801  Software Design for Virtual Reality and Gaming'; the second and
    third parts, when present, are passed to extract_points.

    Parameters
    ----------
    input_line : str
        A decoded line starting with a course code.

    Returns
    -------
    str
        'code,title,points1,special1,points2,special2' followed by a newline.
        A points value that is missing or has no digits is written as 'n/a'
        with an empty special field, so every line has exactly six fields.
        NOTE(review): fields are not quoted, so a comma inside a title adds
        an extra field - confirm how the resulting file is read.
    """
    columns = re.split('   +', input_line)

    code_title = columns[0]

    # The code is the first 5 characters; the title starts at index 7
    row = [code_title[0:5], code_title[7:]]

    # Points and special requirements for the two points columns
    row += _round_fields(columns, 1)
    row += _round_fields(columns, 2)

    return ','.join(row) + '\n'

In [None]:
# Compile the regular expression for matching course lines
#re_courses = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)  ([0-9]{3}\*?)( [0-9]{3})? *')
# The pattern in use matches two capital letters followed by three digits
re_all_courses = re.compile('[A-Z]{2}[0-9]{3}')

In [None]:
# Timestamped destination for the CSV version of the 2021 points
filepath = 'data/cao2021_' + now_string + '.csv'

# Running total of the course lines written to the file
no_lines = 0

# Go through the response line by line and write every course line to the CSV file
with open(filepath, 'w') as f:
    for line in resp.iter_lines():

        # The lines arrive as bytes, so decode them first
        dline = line.decode(new_encoding)

        # Only lines starting with two capital letters followed by three
        # digits ('[A-Z]{2}[0-9]{3}') are course lines; skip everything else
        matched = re_all_courses.match(dline)
        if not matched:
            continue

        # Write the split line into the CSV file and count it
        f.write(split_dline(dline))
        no_lines += 1

print("Number of courses found {}".format(no_lines))

## References

# End
