In [1]:
import requests
import pandas as pd
import json
import html
from bs4 import BeautifulSoup
course_schedule = 'spring_2021_cs.html'
catalog = 'cs_catalog.html'

In [2]:
def extract_data_from_report3(filename):
    return BeautifulSoup(open(filename), "html.parser")
cs_spring = extract_data_from_report3(course_schedule)

Getting Course Sections and Numbers

In [3]:
courses = cs_spring.select('.schedule-listing')[0]
course_nums_html = courses.select('.expand')
course_nums = []
for course in course_nums_html:
    course_nums.append(course['href'][1:-5].upper())

Course Names

In [4]:
course_names = []
for name in [name_tag.text for name_tag in course_nums_html]:
    course_names.append(' '.join(name.strip().split()))

Getting Professor names

In [5]:
profs_html = courses.find_all('span', {'class' : 'col-xs-12 col-sm-2'})
profs_office_hours = []
for prof in range(0, len(profs_html), 3):
    profs_office_hours.append(' '.join(profs_html[prof].get_text().strip().split()))
office_hours = ['Office', 'Hours']
profs = [prof.replace('Office', '').replace('Hours', '').strip() for prof in profs_office_hours]

Getting Time of classes

In [6]:
course_times_html = courses.find_all('span', {'class' : 'col-xs-12 col-sm-2'})
course_times = []
for time in range(1, len(course_times_html), 3):
    course_times.append(' '.join(course_times_html[time].get_text().strip().split()))
courses = pd.DataFrame(list(zip(course_nums, course_names, profs, course_times)), columns =['Number-Section', 'Name', 'Instructor', 'Time'])

In [7]:
courses = pd.DataFrame(list(zip(course_nums, course_names, profs, course_times)), columns =['Number-Section', 'Name', 'Instructor', 'Time'])
courses['Number'] = courses.apply(lambda courses: courses['Number-Section'][:-4], axis = 1)
courses['Section'] = courses.apply(lambda courses: courses['Number-Section'][-3:], axis = 1)
del courses['Number-Section']

In [8]:
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        152 non-null    object
 1   Instructor  152 non-null    object
 2   Time        152 non-null    object
 3   Number      152 non-null    object
 4   Section     152 non-null    object
dtypes: object(5)
memory usage: 6.1+ KB


In [9]:
courses.head().append(courses.tail())

Unnamed: 0,Name,Instructor,Time,Number,Section
0,PAC II*,Mohamed Zahran,T 6:00-8:30PM,CSCI-GA1144,1
1,PAC II Recitation,Gurkirat Singh Bajwa,R 7:10-8:00PM,CSCI-GA1144,2
2,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:00PM,CSCI-GA1170,1
3,Fundamental Algorithms Recitation,Alex Bienstock Charles Peyser Fengyuan Liu,R 8:10-9:00PM,CSCI-GA1170,2
4,Fundamental Algorithms Recitation,Harish Karthikeyan,R 8:10-9:00PM,CSCI-GA1170,3
147,Special Topics: Natural Language Processing,Adam Meyers,TR 9:30-10:45AM,CSCI-UA0480,57
148,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 3:30-4:45PM,CSCI-UA0480,69
149,Special Topics: Algorithmic Problem Solving,Joanna Klukowska,MW 11:00-12:15PM,CSCI-UA0480,521
150,Special Topics: Algorithmic Problem Solving,Kunal Khatri,W 3:30-4:45PM,CSCI-UA0480,522
151,Special Topics: Algorithmic Problem Solving,Samasth Ananda,W 3:30-4:45PM,CSCI-UA0480,523


In [10]:
courses.sample(5)

Unnamed: 0,Name,Instructor,Time,Number,Section
77,Intro To Computer Programming (No Prior Experi...,Craig Kapp,MW 3:30-4:45PM,CSCI-UA0002,9
73,Intro To Computer Programming (No Prior Experi...,Hasan Aljabbouli,MW 12:30-1:45PM,CSCI-UA0002,5
146,Special Topics: Parallel Computing,Mohamed Zahran,MW 2:00-3:15PM,CSCI-UA0480,51
19,Big Data Application Development,Ulrich Finkler,W 7:10-9:00PM,CSCI-GA2437,1
46,Special Topics: Design and Analysis of Algorit...,Manuel Charlemagne,W 9:30-12:00PM,CSCI-GA3033,117


### Catalog

Course Numbers

In [11]:
with open(catalog, "r", encoding="UTF-8") as f:
    contents = f.read()
    catalog_html = BeautifulSoup(contents, 'html.parser')
catalog_courses = catalog_html.find_all('div', {'class' : 'row'})[0].find_all('li', {'class':'col-sm-12'})
catalog_numbers = []
for course in catalog_courses:
    catalog_numbers.append(course.get_text().strip()[:12])

Course Points

In [12]:
catalog_names_html = catalog_html.find_all('p', {'class': 'bold'})
catalog_credits = []
for course in range(1, len(catalog_names_html), 3):
    catalog_credits.append(catalog_names_html[course].get_text().strip().split()[0])
# I had to replace this specific index because otherwise, the points would simply be "Points." instead of the intended 1-12 the HTML denotes. 
catalog_credits[catalog_credits.index('Points.')] = '1-12'

Prerequisites

In [13]:
catalog_prereqs = []
for course in range(2, len(catalog_names_html), 3):
    catalog_prereqs.append(catalog_names_html[course].get_text().strip()[15:])

In [14]:
catalog = pd.DataFrame(list(zip(catalog_numbers, catalog_prereqs, catalog_credits)), columns =['Number', 'Prerequisites', 'Points'])

In [15]:
catalog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Number         100 non-null    object
 1   Prerequisites  100 non-null    object
 2   Points         100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [16]:
catalog.head().append(catalog.tail())

Unnamed: 0,Number,Prerequisites,Points
0,CSCI-GA.1133,,4
1,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4
2,CSCI-GA.1170,At least one year of experience with a high-le...,3
3,CSCI-GA.1180,,3
4,CSCI-GA.2110,Students taking this class should already have...,3
95,CSCI-UA.0897,Restricted to declared computer science majors...,1
96,CSCI-UA.0898,Restricted to declared computer science majors...,1
97,CSCI-UA.0997,Permission of the department. Does not satisfy...,1
98,CSCI-UA.0998,Permission of the department. Does not satisfy...,1
99,FRSEM-UA.059,"Some programming experience in Python, Java, J...",4


In [17]:
catalog.sample(5)

Unnamed: 0,Number,Prerequisites,Points
58,CSCI-GA.3840,Approval of a faculty adviser and the Director...,3
56,CSCI-GA.3812,For MS in IS students: Successful completion o...,3
15,CSCI-GA.2390,Strong mathematical background and instructor ...,3
79,CSCI-UA.0436,Computer Systems Organization (CSCI-UA 201) an...,4
66,CSCI-UA.0003,This course is intended for students with limi...,4


In [18]:
courses['Number'] = courses['Number'].str.replace('GA', 'GA.')
courses['Number'] = courses['Number'].str.replace('UA', 'UA.')

In [19]:
course_catalog = pd.merge(courses, catalog, on="Number", how='left')[['Number', 'Name', 'Instructor', 'Time', 'Prerequisites', 'Points']]
course_catalog

Unnamed: 0,Number,Name,Instructor,Time,Prerequisites,Points
0,CSCI-GA.1144,PAC II*,Mohamed Zahran,T 6:00-8:30PM,CSCI-GA 1133 or departmental permission.,4
1,CSCI-GA.1144,PAC II Recitation,Gurkirat Singh Bajwa,R 7:10-8:00PM,CSCI-GA 1133 or departmental permission.,4
2,CSCI-GA.1170,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:00PM,At least one year of experience with a high-le...,3
3,CSCI-GA.1170,Fundamental Algorithms Recitation,Alex Bienstock Charles Peyser Fengyuan Liu,R 8:10-9:00PM,At least one year of experience with a high-le...,3
4,CSCI-GA.1170,Fundamental Algorithms Recitation,Harish Karthikeyan,R 8:10-9:00PM,At least one year of experience with a high-le...,3
...,...,...,...,...,...,...
147,CSCI-UA.0480,Special Topics: Natural Language Processing,Adam Meyers,TR 9:30-10:45AM,Topics determine prerequisites.,4
148,CSCI-UA.0480,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 3:30-4:45PM,Topics determine prerequisites.,4
149,CSCI-UA.0480,Special Topics: Algorithmic Problem Solving,Joanna Klukowska,MW 11:00-12:15PM,Topics determine prerequisites.,4
150,CSCI-UA.0480,Special Topics: Algorithmic Problem Solving,Kunal Khatri,W 3:30-4:45PM,Topics determine prerequisites.,4


## Conclusion

* Not really many anomalies to be honest. It was all pretty straightforward.
* I did have to find clever ways to loop through different p tags because I found no way of getting just the prerequisites, say, or any other data for that matter, so I had to use the first description's index I found and loop by 4 because those were usually the amount of data points for every course.
* I also described this above, there was one course where the course points were '1 - 12' and I had to manually change it by code to '1-12' for it to be kept correctly, because else, the points were recorded as 'Points.' and that wouldn't help anyone
* How = left made it so that all columns of courses df were kept and not only those that appeared in catalog, there were many duplicate class numbers so this allowed the pd module to loop through all the rows of the "left" dataframe and merge them with rows from the "right" dataframe