# CAO Points Notebook

***

Import all the necessary packages

In [1]:
# Package for HTTP requests
import requests as rq
# Regular experssions package
import re
# Import Pandas package
import pandas as pd
# Dates and time
import datetime as dt
# For comparing sequences (string for example)
import difflib
# For downloading files form the web
import urllib.request as urlrq

#### Create a now_string

This variable will be used to store the current date and time and used when saving the original data files with a timestamp

In [2]:
# get the current date and time
now = dt.datetime.now()

now_string = now.strftime('%Y%m%d_%H%M%S')

## Read original data from the CAO website
****

<br>

## **Read the 2019 points**


1. Points for 2019 year downloaded from http://www.cao.ie/index.php?page=points&p=2019 in .pdf file format
2. PDF file opened using Adobe Acrobat DC
3. Exported to .xlsx file format using Acrobat DC 'Export to' option
4. Pandas DataFrame created from saved .xlsx file

****

In [169]:
df19 = pd.read_excel('data\cao2019_20211129_180145.xlsx', skiprows=10, dtype=str)

In [170]:
#Check the head of the dataframe
df19.head()

Unnamed: 0,Course Code,Unnamed: 1,INSTITUTION and COURSE,Unnamed: 3,EOS,Mid
0,,,Athlone Institute of Technology,,,
1,AL801,,Software Design with Virtual Reality and Gaming,,304.0,328.0
2,AL802,,Software Design with Cloud Computing,,301.0,306.0
3,AL803,,Software Design with Mobile Apps and Connected...,,309.0,337.0
4,AL805,,Network Management and Cloud Infrastructure,,329.0,442.0


In [171]:
# List columns in imported dataframe
df19.columns

Index(['Course Code', 'Unnamed: 1', 'INSTITUTION and COURSE', 'Unnamed: 3',
       'EOS', 'Mid'],
      dtype='object')

In [172]:
# Drop the columns that hold no data.
df19 = df19.drop(['Unnamed: 1', 'Unnamed: 3'], axis=1)

In [173]:
#Check the head of the dataframe after removing Unnamed columns:
df19.head()

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
0,,Athlone Institute of Technology,,
1,AL801,Software Design with Virtual Reality and Gaming,304.0,328.0
2,AL802,Software Design with Cloud Computing,301.0,306.0
3,AL803,Software Design with Mobile Apps and Connected...,309.0,337.0
4,AL805,Network Management and Cloud Infrastructure,329.0,442.0


In [174]:
#Check the tail of the dataframe to make sure all the courses were imported:
df19.tail()

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
960,WD200,Arts (options),221,296
961,WD210,Software Systems Development,271,329
962,WD211,Creative Computing,275,322
963,WD212,Recreation and Sport Management,274,311
964,WD230,Mechanical and Manufacturing Engineering,273,348


In [175]:
#institute names are stored in the rows that don't have any values in the 'Course code' column
df19[df19['Course Code'].isna()]

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
0,,Athlone Institute of Technology,,
28,,"Institute of Technology, Carlow",,
62,,"Carlow College, St. Patrick`s",,
66,,Cork Institute of Technology,,
110,,University College Cork (NUI),,
173,,American College,,
176,,CCT College Dublin,,
178,,Marino Institute of Education,,
185,,Dublin Business School,,
209,,Dublin City University,,


In [176]:
#Let's remove these rows:
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
print("Shape of the dataFrame with HEI names: {}".format(df19.shape))
df19 = df19.dropna(axis=0, subset=['Course Code'])
print("Shape of the dataFrame without HEI names: {}".format(df19.shape))

Shape of the dataFrame with HEI names: (965, 4)
Shape of the dataFrame without HEI names: (930, 4)


In [177]:
# Check that all the rows without values defined in 'Course Code' column are removed
df19[df19['Course Code'].isna()]

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid


In [178]:
# 1. Filter df19 dataframe to include only EOS points that start with '#'
# 2. Once it's filtered, iterate through it to remove the '#' from the poitns value
# 3. Add the '#' to the 'Test/Interview #' column

# As per https://stackoverflow.com/questions/62397170/python-pandas-how-to-select-rows-where-objects-start-with-letters-pl
# List all the courses that start with '#'

df19['2019 Interview'] = 0

for index, row in df19[df19['EOS'].str.startswith('#', na=False)].iterrows():
    points = df19.loc[index, 'EOS'][1:]
    
    df19.loc[index, 'EOS'] = points
    df19.loc[index, '2019 Interview'] = 1

In [179]:
# Check that a new column 'Test/Interview #' was created and populated correctly and that # was removed from the points value
df19[df19['2019 Interview']==1].head(5)

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid,2019 Interview
24,AL861,Animation and Illustration (portfolio),615,899,1
25,AL863,Graphic and Digital Design (portfolio),703,898,1
31,CW038,"Art (portfolio, Wexford)",700,700,1
58,CW858,"Sports Management and Coaching (options, portf...",700,700,1
74,CR121,Music at CIT Cork School of Music,633,1052,1


In [180]:
# Display 5 top courses that have '*' at the end of the points 
df19[df19['EOS'].str.endswith('*', na=False)].head(5)

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid,2019 Interview
129,CK201,Commerce,465*,489,0
163,CK704,Occupational Therapy,532*,554,0
166,CK707,Medical and Health Sciences,510*,543,0
172,CK791,Medicine - Graduate Entry (GAMSAT required),58*,59,1
179,CM001,Education - Primary Teaching,452*,462,0


In [181]:
# 1. Filter df19 dataframe to include only EOS points that have '*' at the end
# 2. Once it's filtered, iterate through it to remove the '*' from the poitns value
# 3. Add the '*' to the 'R1 Random *' column

df19['2019 R1 Random'] = 0

for index, row in df19[df19['EOS'].str.endswith('*', na=False)].iterrows():
    points = df19.loc[index, 'EOS'][:-1]
    
    df19.loc[index, 'EOS'] = points
    df19.loc[index, '2019 R1 Random'] = 1

In [182]:
# Check that a new column 'R1 Random *' was created and populated correctly and that * was removed from the points value
df19[df19['2019 R1 Random']==1].head(5)

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid,2019 Interview,2019 R1 Random
129,CK201,Commerce,465,489,0,1
163,CK704,Occupational Therapy,532,554,0,1
166,CK707,Medical and Health Sciences,510,543,0,1
172,CK791,Medicine - Graduate Entry (GAMSAT required),58,59,1,1
179,CM001,Education - Primary Teaching,452,462,0,1


In [183]:
# Display 5 top courses that have '*' at the end of the 'Mid' column values
df19[df19['Mid'].str.endswith('*', na=False)]

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid,2019 Interview,2019 R1 Random


In [184]:
# Create a filepath with a current timestamp for the pandas data
filepath = 'data/cao2019_pandas' + now_string + '.csv'

# save the pandas dataframe as a csv file
df19.to_csv(filepath, index=False)

<br>

## **Read the 2020 points**
****

In [None]:
# Read the file directly from the CAO website
url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'
df20 = pd.read_excel(url, skiprows=10)

##### Download the original 2020 file to the disk

In [None]:
# Create a filepath with a current timestamp for the original data
filepath = 'data/cao2020_' + now_string + '.xlsx'

In [None]:
urlrq.urlretrieve(url, filepath)

In [None]:
# Check the shape of the dataframe
df20.shape

In [24]:
# Show 5 first rows of the dataframe
df20.head()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,


Columns named Column1-Column8 serve no purpose, they need to be removed. First I will list all the columns in the dataframe and then use it as a reference for removing them:

In [25]:
# List the columns in the datafarme
df20.columns

Index(['CATEGORY (i.e.ISCED description)', 'COURSE TITLE', 'COURSE CODE2',
       'R1 POINTS', 'R1 Random *', 'R2 POINTS', 'R2 Random*', 'EOS',
       'EOS Random *', 'EOS Mid-point', 'LEVEL', 'HEI', 'Test/Interview #',
       'avp', 'v', 'Column1', 'Column2', 'Column3', 'Column4', 'Column5',
       'Column6', 'Column7', 'Column8'],
      dtype='object')

In [26]:
# There are 23 columns in the dataframe, we need to remove 8, so we'll remove the columns from 15 to 23:
df20 = df20.drop(df20.columns[15:23], axis=1)

In [27]:
# Check the head of the dataframe to make sure correct columns were removed
df20.head()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
0,Business and administration,International Business,AC120,209,,,,209,,280,8,American College,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,8,American College,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,


In [28]:
# Check the end of the dataframe, to make sure all the rows were loaded
df20.tail()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,7,Waterford Institute of Technology,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,8,Waterford Institute of Technology,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,8,Waterford Institute of Technology,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,8,Waterford Institute of Technology,,,
1463,Engineering and engineering trades,Mechanical and Manufacturing Engineering,WD230,253,,,,253,,369,8,Waterford Institute of Technology,,,


In [29]:
# Spot check, make sure that Excels row 765 was imported correctly. 
# Recalculate the index by substracting 10 skipped rows and allow for the fact that panda numbers rows from 0 and Excel from 1 plus allow for the first row being used as header
df20.iloc[765-12]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [30]:
# Create a filepath with a current timestamp for the pandas data
filepath = 'data/cao2020_pandas' + now_string + '.csv'

In [31]:
# save the pandas dataframe as a csv file
df20.to_csv(filepath, index=False)

### Create a dataframe that holds all the names of Higher Education Institutions and first 2 letter of their courses

In [32]:
# Stire unique values of the 'HEI' column:
hei = pd.DataFrame(df20['HEI'].unique(), columns=['HEI'])

In [33]:
#https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

for index, row in hei.iterrows():
    
    code = df20[df20['HEI']==row['HEI']]['COURSE CODE2'].iloc[0]
    hei.loc[index, 'Code'] = code[0:2]

In [237]:
# check the hei dataframe:
hei[hei.duplicated('Code')]

Unnamed: 0,HEI,Code


In [238]:
hei.set_index('Code')

Unnamed: 0_level_0,HEI
Code,Unnamed: 1_level_1
AC,American College
AD,National College of Art and Design
AL,Athlone Institute of Technology
AS,St. Angela`s College
CI,Irish College of Humanities & Applied Sciences
CK,University College Cork (NUI)
CM,Marino Institute of Education
CR,Cork Institute of Technology
CT,CCT College Dublin
CW,"Institute of Technology, Carlow"


<br>

## Read the 2021 points
****


In [35]:
# URL of the page with the 2021 CAO points
url = 'http://www2.cao.ie/points/l8.php'

# Fetch the CAO points URL
resp = rq.get(url)

resp

<Response [200]>

<br>

## Save the original data
***

In [36]:
# Create a filepath with a current timestamp for the original data
filepath = 'data/cao2021_' + now_string + '.html'

In [37]:
# Server uses wron encoding, we need to fix it
orig_encoding = resp.encoding

# We need to use 'cp1252' endcoding
new_encoding = 'cp1252'

#change to cp1252
resp.encoding = new_encoding

In [38]:
# Save the original html file in csv format
with open(filepath, 'w') as f:
    f.write(resp.text)

<br>

## Clean the data
***

In [39]:
# compile regular expression to find all the numbers in the String
points = re.compile('[0-9]+')

In [40]:
def extract_points(chunk):
    # Match all the numbers in the string
    pnt = points.search(chunk)

    # Find the difference between the starting string and the found numbers
    # In order to extract any special requirements like *, # etc
    # Solution found on:
    # https://stackoverflow.com/questions/17904097/python-difference-between-two-strings
    diff = [li.replace('+ ','') for li in difflib.ndiff(pnt.group(0), chunk) if li[0] != ' ']
    
    return pnt.group(0), ' '.join(diff)

In [41]:
# Define the function for splitting the lines

def split_dline(input_line):
    
    result = []
    
    # split the line by 3 or more spaces (2 spaces are not enough as there are 2 spaces in some of the courses descriptions)
    space_separated = re.split('   +', input_line)
    
    # The first string in the resulting list is the course code and title, in the format: 'AL801  Software Design for Virtual Reality and Gaming'
    code_title = space_separated[0]   
    
    # The code is first 5 characters 
    result.append(code_title[0:5])
            
    # The title starts at 7th location
    result.append(code_title[7:len(code_title)])
    
    result.append(space_separated[1])
    
    try:
        result.append(space_separated[2]) 
    except IndexError:
        result.append("")
    
    '''
    
    # Append Round 1 points limits and extract any special requirements
    try:
        pts, spec = extract_points(space_separated[1])
        
        result.append(pts)
        result.append(spec)     
        
    except AttributeError:
        result.append("n/a")
        result.append("")
        
    try:     
        pts, spec = extract_points(space_separated[2])
        
        result.append(pts)
        result.append(spec)
        
    except IndexError:
        result.append("n/a")
    except AttributeError:
        result.append("n/a")
    
    '''
    
    # Join code, title an the rest of the line into one coma separated line:
    result = ','.join(result)  + '\n'
    
    
    
    return result

In [42]:
# Compile the reqular expression for matching lines
#re_courses = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)  ([0-9]{3}\*?)( [0-9]{3})? *')
re_all_courses = re.compile('[A-Z]{2}[0-9]{3}')

In [43]:
# Create a path for csv file
filepath21 = 'data/cao2021_' + now_string + '.csv'

# loop through the lines of the sesponse content
no_lines = 0

# Open the csv file for writing
with open(filepath21, 'w') as f:
    
    # Add first line with column titles
    # Use the same column titles as in the 2020 Excel file
    f.write("COURSE CODE2, COURSE TITLE, R1 POINTS, R2 POINTS \n")
    
    for line in resp.iter_lines():
        
        dline = line.decode(new_encoding)
        
        # Check if line starts with two capital letters followed by three numbers: ('[A-Z]{2}[0-9]{3}')
        matched = re_all_courses.match(dline)
        if matched:    
            
            # write a slit line into the csv file
            f.write(split_dline(dline))
            
            # count number of matching lines
            no_lines = no_lines + 1

print("Number of courses found {}".format(no_lines))

Number of courses found 949


In [44]:
# Create a Pandas dataframe from the saved csv file

df21 = pd.read_csv(filepath21, encoding='cp1252')
df21.head(5)

Unnamed: 0,COURSE CODE2,COURSE TITLE,R1 POINTS,R2 POINTS
0,AL801,Software Design for Virtual Reality and Gaming,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructure,321,
4,AL810,Quantity Surveying,328,


## Concatenate data frames

In [90]:
print(df21.columns)
df21.head(5)

Index(['COURSE CODE2', ' COURSE TITLE', ' R1 POINTS', ' R2 POINTS '], dtype='object')


Unnamed: 0,COURSE CODE2,COURSE TITLE,R1 POINTS,R2 POINTS
0,AL801,Software Design for Virtual Reality and Gaming,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructure,321,
4,AL810,Quantity Surveying,328,


In [122]:
df21.columns=['COURSE CODE', 'COURSE TITLE', 'R1 2021', 'R2 2021']

In [123]:
print(df20.columns)
df20.head(5)

Index(['CATEGORY (i.e.ISCED description)', 'COURSE TITLE', 'COURSE CODE',
       'R1 POINTS', 'R1 Random *', 'R2 POINTS', 'R2 Random*', 'EOS',
       'EOS Random *', 'EOS Mid-point', 'LEVEL', 'HEI', 'Test/Interview #',
       'avp', 'v'],
      dtype='object')


Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
0,Business and administration,International Business,AC120,209,,,,209,,280,8,American College,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,8,American College,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,


In [124]:
df20.columns=['CATEGORY (i.e.ISCED description)', 'COURSE TITLE', 'COURSE CODE', 'R1 2020', 'R1 Random *', 'R2 P2020', 'R2 Random*', 
              'EOS', 'EOS Random *', 'EOS Mid-point', 'LEVEL', 'HEI', 'Test/Interview #', 'avp', 'v']

In [187]:
print(df19.columns)
df19.head(5)

Index(['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid', '2019 Interview',
       '2019 R1 Random'],
      dtype='object')


Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid,2019 Interview,2019 R1 Random
1,AL801,Software Design with Virtual Reality and Gaming,304,328,0,0
2,AL802,Software Design with Cloud Computing,301,306,0,0
3,AL803,Software Design with Mobile Apps and Connected...,309,337,0,0
4,AL805,Network Management and Cloud Infrastructure,329,442,0,0
5,AL810,Quantity Surveying,307,349,0,0


In [188]:
# Rename the columns in the df19 dataFarme to match column names in df20
df19.columns=['COURSE CODE','COURSE TITLE', 'R1 2019', 'R2 2019', '2019 Interview', '2019 R1 Random']
print(df19.columns)
df19

Index(['COURSE CODE', 'COURSE TITLE', 'R1 2019', 'R2 2019', '2019 Interview',
       '2019 R1 Random'],
      dtype='object')


Unnamed: 0,COURSE CODE,COURSE TITLE,R1 2019,R2 2019,2019 Interview,2019 R1 Random
1,AL801,Software Design with Virtual Reality and Gaming,304,328,0,0
2,AL802,Software Design with Cloud Computing,301,306,0,0
3,AL803,Software Design with Mobile Apps and Connected...,309,337,0,0
4,AL805,Network Management and Cloud Infrastructure,329,442,0,0
5,AL810,Quantity Surveying,307,349,0,0
...,...,...,...,...,...,...
960,WD200,Arts (options),221,296,0,0
961,WD210,Software Systems Development,271,329,0,0
962,WD211,Creative Computing,275,322,0,0
963,WD212,Recreation and Sport Management,274,311,0,0


In [240]:
# cancatenate all the dataframes
allcourses = pd.concat([df19[['COURSE CODE', 'COURSE TITLE']], df20[['COURSE CODE', 'COURSE TITLE']], df21[['COURSE CODE', 'COURSE TITLE']]])

# check if there are some row that don't have any value in 'COURSE CODE2' column
allcourses[allcourses['COURSE CODE'].isna()]

Unnamed: 0,COURSE CODE,COURSE TITLE


In [241]:
# check if there are some row that don't have any value in 'COURSE CODE2' column
allcourses[ allcourses['COURSE TITLE'].isna()]

Unnamed: 0,COURSE CODE,COURSE TITLE


In [242]:
# Remove duplicate course codes

print(allcourses.shape)
allcourses = allcourses.drop_duplicates(subset=['COURSE CODE'])
print(allcourses.shape)

(3343, 2)
(1651, 2)


In [243]:
# ignore_index=True to re_index the dataframe after sorting
allcourses = allcourses.sort_values('COURSE CODE', ignore_index=True)

## Join the data frames

In [244]:
# As per:
# https://stackoverflow.com/questions/36505847/substring-of-an-entire-column-in-pandas-dataframe
allcourses['Code'] = allcourses['COURSE CODE'].str[0:2]
allcourses

Unnamed: 0,COURSE CODE,COURSE TITLE,Code
0,AC120,International Business,AC
1,AC137,Liberal Arts,AC
2,AD101,First Year Art & Design (Common Entry),AD
3,AD102,Graphic Design and Moving Image Design,AD
4,AD103,Textile & Surface Design and Jewellery & Objects,AD
...,...,...,...
1646,WD211,Creative Computing,WD
1647,WD212,Recreation and Sport Management,WD
1648,WD230,Mechanical and Manufacturing Engineering,WD
1649,WD231,Early Childhood Care and Education,WD


In [248]:
# Add Higher Education Institution name to the Corse Code df

allcourses = allcourses.join(hei.set_index('Code'), on='Code')
allcourses

Unnamed: 0,COURSE CODE,COURSE TITLE,Code,HEI
0,AC120,International Business,AC,American College
1,AC137,Liberal Arts,AC,American College
2,AD101,First Year Art & Design (Common Entry),AD,National College of Art and Design
3,AD102,Graphic Design and Moving Image Design,AD,National College of Art and Design
4,AD103,Textile & Surface Design and Jewellery & Objects,AD,National College of Art and Design
...,...,...,...,...
1646,WD211,Creative Computing,WD,Waterford Institute of Technology
1647,WD212,Recreation and Sport Management,WD,Waterford Institute of Technology
1648,WD230,Mechanical and Manufacturing Engineering,WD,Waterford Institute of Technology
1649,WD231,Early Childhood Care and Education,WD,Waterford Institute of Technology


In [253]:
# Remove the column Code as it was needed only to join the hei table

allcourses = allcourses.drop('Code', axis=1)


KeyError: "['Code'] not found in axis"

In [255]:
allcourses[allcourses['HEI'].isna()]

Unnamed: 0,COURSE CODE,COURSE TITLE,HEI
84,BN101,Business,
85,BN103,Business and Information Technology,
86,BN104,Computing (Information Technology),
87,BN107,Applied Social Studies in Social Care,
88,BN108,Engineering (Common Entry with Award options),
...,...,...,...
1199,TA323,Computing with IT Management,
1200,TA326,DNA and Forensic Analysis,
1201,TA327,Sports Science and Health,
1202,TA328,Computing with Language (French/German/Spanish),


In [256]:
allcourses.set_index('COURSE CODE')

Unnamed: 0_level_0,COURSE TITLE,HEI
COURSE CODE,Unnamed: 1_level_1,Unnamed: 2_level_1
AC120,International Business,American College
AC137,Liberal Arts,American College
AD101,First Year Art & Design (Common Entry),National College of Art and Design
AD102,Graphic Design and Moving Image Design,National College of Art and Design
AD103,Textile & Surface Design and Jewellery & Objects,National College of Art and Design
...,...,...
WD211,Creative Computing,Waterford Institute of Technology
WD212,Recreation and Sport Management,Waterford Institute of Technology
WD230,Mechanical and Manufacturing Engineering,Waterford Institute of Technology
WD231,Early Childhood Care and Education,Waterford Institute of Technology


In [257]:
df19.set_index('COURSE CODE')

Unnamed: 0_level_0,COURSE TITLE,R1 2019,R2 2019,2019 Interview,2019 R1 Random
COURSE CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AL801,Software Design with Virtual Reality and Gaming,304,328,0,0
AL802,Software Design with Cloud Computing,301,306,0,0
AL803,Software Design with Mobile Apps and Connected...,309,337,0,0
AL805,Network Management and Cloud Infrastructure,329,442,0,0
AL810,Quantity Surveying,307,349,0,0
...,...,...,...,...,...
WD200,Arts (options),221,296,0,0
WD210,Software Systems Development,271,329,0,0
WD211,Creative Computing,275,322,0,0
WD212,Recreation and Sport Management,274,311,0,0


In [258]:
allcourses = allcourses.join(df19[['R1 2019', 'R2 2019', '2019 Interview', '2019 R1 Random']], how="left")

In [259]:
allcourses.set_index('COURSE CODE')

Unnamed: 0_level_0,COURSE TITLE,HEI,R1 2019,R2 2019,2019 Interview,2019 R1 Random
COURSE CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AC120,International Business,American College,,,,
AC137,Liberal Arts,American College,304,328,0.0,0.0
AD101,First Year Art & Design (Common Entry),National College of Art and Design,301,306,0.0,0.0
AD102,Graphic Design and Moving Image Design,National College of Art and Design,309,337,0.0,0.0
AD103,Textile & Surface Design and Jewellery & Objects,National College of Art and Design,329,442,0.0,0.0
...,...,...,...,...,...,...
WD211,Creative Computing,Waterford Institute of Technology,,,,
WD212,Recreation and Sport Management,Waterford Institute of Technology,,,,
WD230,Mechanical and Manufacturing Engineering,Waterford Institute of Technology,,,,
WD231,Early Childhood Care and Education,Waterford Institute of Technology,,,,


Index(['COURSE CODE', 'COURSE TITLE', 'R1 2019', 'R2 2019', '2019 Interview',
       '2019 R1 Random'],
      dtype='object')

## References

# End
