# Part 1 - Combine Course Info with Requirements
## Import Libraries

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

## 1. Read the course schedule into a DataFrame

In [2]:
# Parse the saved schedule page. The encoding is passed explicitly so that the
# decoded text does not depend on the platform's default text encoding.
# NOTE(review): assumes the saved HTML file is UTF-8 — confirm against the file.
with open('NYU_CS_Course_Schedule.html', 'r', encoding='utf-8') as HTMLFile:
    dom = BeautifulSoup(HTMLFile, "html.parser")

In [3]:
# Locate the element carrying the 'schedule-listing' class; its <li> children
# are read in the following cell.
schedule = dom.select_one('.schedule-listing')
# select_one returns None when nothing matches; stop here with a clear message
# instead of an AttributeError further down.
assert schedule is not None, "No element with class 'schedule-listing' found in the schedule HTML"

In [4]:
# Collect every <li> under the schedule container; each one is parsed as one course row below.
courses = schedule.select('li')

In [5]:
# Turn every <li> into one row: [Number-Section, Name, Instructor, Time].
course_schedule_data = []
for course in courses:
    course_data = []
    # Each column's info is stored in a 'span' -> select them
    span_lists = course.select('span')
    # Only the first four spans are needed (Number-Section, Name, Instructor, Time)
    for i, span in enumerate(span_lists[:4]):
        # Extract the text of the span. Some text contains runs of spaces:
        # collapse them with re.sub(' +', ' ') and split the result into lines.
        text_info = re.sub(' +', ' ', span.text.strip()).split('\n')
        # After collapsing, lines other than the first can still start with a
        # single space and blank lines are just ' '. Strip every line and drop
        # the empty ones so stored values carry no stray whitespace
        # (otherwise two instructors are joined as "A,  B" with a double space).
        text_info = [line.strip() for line in text_info if line.strip()] or ['']
        # Index 2 is the 'Instructor' column. When it spans several lines,
        # keep the first and the last instructor, separated by a comma.
        if i == 2 and len(text_info) > 1:
            text_info = [text_info[0] + ", " + text_info[-1]]
        # For every column keep only the first element:
        # - Number-Section: only the first listed Number-Section (mainly CSCI)
        # - Name / Time: only the first line
        # - Instructor: the (possibly combined) entry built above
        course_data.append(text_info[0])
    # Replace the invisible zero-width space character with an empty string
    course_data = list(map(lambda c: c.replace('\u200b',""),course_data))
    course_schedule_data.append(course_data)

In [6]:
# Build the schedule DataFrame from the parsed rows (a list of lists),
# naming the columns in the order the fields were collected.
schedule_columns = ['Number-Section', 'Name', 'Instructor', 'Time']
course_schedule = pd.DataFrame(data=course_schedule_data, columns=schedule_columns)

In [7]:
# Show the freshly built schedule frame (still with the combined 'Number-Section' column).
course_schedule

Unnamed: 0,Number-Section,Name,Instructor,Time
0,CSCI-GA.1170-001,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:10PM
1,CSCI-GA.1170-002,Fundamental Algorithms Recitation,Aditya Pandey,R 8:10-9:00PM
2,CSCI-GA.1180-001,Mathematical Techniques For CS Applications,Parijat Dube,W 7:10-9:10PM
3,CSCI-GA.2110-001,Programming Languages,Cory Plock,M 4:55-6:55PM
4,CSCI-GA.2110-002,Programming Languages Recitation,Jahnavi Pothineni,R 7:10-8:00PM
...,...,...,...,...
146,CSCI-UA.0480-061,Special Topics:,Joanna Klukowska,MW 12:30-1:45PM
147,CSCI-UA.0480-063,Special Topics: Introduction to Computer Security,Joseph Bonneau,MW 2:00-3:15PM
148,CSCI-UA.0480-069,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 12:30-1:45PM
149,CSCI-UA.0480-072,Special Topics:,Lerrel Pinto,TR 3:30-4:45PM


In [8]:
# Split 'Number-Section' (e.g. 'CSCI-GA.1170-001') into the course number and the
# three-digit section. The pattern is a raw string so that '\d' reaches the regex
# engine as written instead of being an invalid string escape sequence.
course_schedule[['Number', 'Section']] = course_schedule['Number-Section'].str.extract(r'(.+)-(\d{3})')

In [9]:
# Keep only the final columns, in display order. Selecting the columns already
# leaves 'Number-Section' behind, so no separate in-place drop is needed; this
# also lets the cell be re-run without a KeyError on the missing column.
course_schedule = course_schedule[['Number', 'Section', 'Name', 'Instructor', 'Time']]

In [10]:
# Show the schedule frame after splitting 'Number-Section' into 'Number' and 'Section'.
course_schedule

Unnamed: 0,Number,Section,Name,Instructor,Time
0,CSCI-GA.1170,001,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:10PM
1,CSCI-GA.1170,002,Fundamental Algorithms Recitation,Aditya Pandey,R 8:10-9:00PM
2,CSCI-GA.1180,001,Mathematical Techniques For CS Applications,Parijat Dube,W 7:10-9:10PM
3,CSCI-GA.2110,001,Programming Languages,Cory Plock,M 4:55-6:55PM
4,CSCI-GA.2110,002,Programming Languages Recitation,Jahnavi Pothineni,R 7:10-8:00PM
...,...,...,...,...,...
146,CSCI-UA.0480,061,Special Topics:,Joanna Klukowska,MW 12:30-1:45PM
147,CSCI-UA.0480,063,Special Topics: Introduction to Computer Security,Joseph Bonneau,MW 2:00-3:15PM
148,CSCI-UA.0480,069,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 12:30-1:45PM
149,CSCI-UA.0480,072,Special Topics:,Lerrel Pinto,TR 3:30-4:45PM


In [11]:
# Summarize the schedule frame: row count, column names, non-null counts and dtypes.
course_schedule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Number      151 non-null    object
 1   Section     151 non-null    object
 2   Name        151 non-null    object
 3   Instructor  151 non-null    object
 4   Time        151 non-null    object
dtypes: object(5)
memory usage: 6.0+ KB


In [12]:
# First five schedule rows.
course_schedule.head(5)

Unnamed: 0,Number,Section,Name,Instructor,Time
0,CSCI-GA.1170,1,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:10PM
1,CSCI-GA.1170,2,Fundamental Algorithms Recitation,Aditya Pandey,R 8:10-9:00PM
2,CSCI-GA.1180,1,Mathematical Techniques For CS Applications,Parijat Dube,W 7:10-9:10PM
3,CSCI-GA.2110,1,Programming Languages,Cory Plock,M 4:55-6:55PM
4,CSCI-GA.2110,2,Programming Languages Recitation,Jahnavi Pothineni,R 7:10-8:00PM


In [13]:
# Last five schedule rows.
course_schedule.tail(5)

Unnamed: 0,Number,Section,Name,Instructor,Time
146,CSCI-UA.0480,61,Special Topics:,Joanna Klukowska,MW 12:30-1:45PM
147,CSCI-UA.0480,63,Special Topics: Introduction to Computer Security,Joseph Bonneau,MW 2:00-3:15PM
148,CSCI-UA.0480,69,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 12:30-1:45PM
149,CSCI-UA.0480,72,Special Topics:,Lerrel Pinto,TR 3:30-4:45PM
150,CSCI-UA.0480,73,Special Topics: Randomized Algorithms,Richard Cole,MW 9:30-10:45AM


In [14]:
# Five randomly chosen schedule rows. A fixed random_state makes the sample
# identical on every Restart & Run All.
course_schedule.sample(5, random_state=0)

Unnamed: 0,Number,Section,Name,Instructor,Time
39,CSCI-GA.3205,1,Applied Cryptography & Network Security,Mazdak Zamani,T 4:55-6:55PM
92,CSCI-UA.0102,4,Data Structures - Recitation,Chris Davidson,W 9:30-10:45AM
147,CSCI-UA.0480,63,Special Topics: Introduction to Computer Security,Joseph Bonneau,MW 2:00-3:15PM
60,CSCI-UA.0002,4,Intro To Computer Programming (No Prior Experi...,Mihir Patil,MW 9:30-10:45AM
42,CSCI-GA.3813,2,Adv Lab MS *,STAFF,-


## 2. Read the course catalog into a DataFrame

In [15]:
# Parse the saved catalog page. The encoding is passed explicitly so that the
# decoded text does not depend on the platform's default text encoding.
# NOTE(review): assumes the saved HTML file is UTF-8 — confirm against the file.
with open('NYU_CS_Course_Catalog.html', 'r', encoding='utf-8') as HTMLFile2:
    dom2 = BeautifulSoup(HTMLFile2, 'html.parser')

In [16]:
# Locate the element carrying the 'courses-listing' class; its <li> children
# are read in the following cell.
catalog = dom2.select_one('.courses-listing')
# select_one returns None when nothing matches; stop here with a clear message
# instead of an AttributeError further down.
assert catalog is not None, "No element with class 'courses-listing' found in the catalog HTML"

In [17]:
# Collect every <li> under the catalog container; each one is parsed as one catalog row below.
courses2 = catalog.select('li')

In [18]:
# Build one [Number, Points, Prereqs] row per catalog <li>.
course_catalog_data = []
for catalog_item in courses2:
    catalog_row = []
    # The fields live in 'p' tags; only the first three are needed
    # (course title, points, prerequisites).
    for i, p in enumerate(catalog_item.select('p')[:3]):
        # Collapse runs of spaces, then split into lines; only the first line is used below.
        text_info = re.sub(' +', ' ', p.text.strip()).split('\n')
        if i == 0:
            # Course title: keep just the first space-separated token (the course number)
            catalog_row.append(text_info[0].split(" ")[0])
        elif i == 1:
            # Points: keep whatever precedes ' Points' (e.g. '3' or '1 - 4');
            # use the 'NOT SPECIFIED' marker when the pattern does not match
            point_list = re.findall('(.+) (Points)', text_info[0])
            catalog_row.append(point_list[0][0] if point_list else "NOT SPECIFIED")
        else:
            # Prerequisites: keep the content without the 'Prerequisites: ' label
            catalog_row.append(text_info[0].replace("Prerequisites: ", ""))
    course_catalog_data.append(catalog_row)

In [19]:
# Build the catalog DataFrame from the parsed rows; the column names follow
# the order in which the fields were collected (number, points, prerequisites).
catalog_columns = ['Number', 'Points', 'Prereqs']
course_catalog = pd.DataFrame(data=course_catalog_data, columns=catalog_columns)

In [20]:
# Reorder the columns so that 'Prereqs' comes before 'Points'.
course_catalog = course_catalog[['Number', 'Prereqs', 'Points']]

In [21]:
# Show the catalog frame after reordering its columns.
course_catalog

Unnamed: 0,Number,Prereqs,Points
0,CSCI-GA.1133,,4
1,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4
2,CSCI-GA.1170,At least one year of experience with a high-le...,3
3,CSCI-GA.1180,,3
4,CSCI-GA.2110,Students taking this class should already have...,3
...,...,...,...
91,CSCI-UA.0897,Restricted to declared computer science majors...,1 - 4
92,CSCI-UA.0898,Restricted to declared computer science majors...,1 - 4
93,CSCI-UA.0997,Permission of the department. Does not satisfy...,1 - 4
94,CSCI-UA.0998,Permission of the department. Does not satisfy...,1 - 4


In [22]:
# Summarize the catalog frame: row count, column names, non-null counts and dtypes.
course_catalog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Number   96 non-null     object
 1   Prereqs  96 non-null     object
 2   Points   96 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB


In [23]:
# First five catalog rows.
course_catalog.head(5)

Unnamed: 0,Number,Prereqs,Points
0,CSCI-GA.1133,,4
1,CSCI-GA.1144,CSCI-GA 1133 or departmental permission.,4
2,CSCI-GA.1170,At least one year of experience with a high-le...,3
3,CSCI-GA.1180,,3
4,CSCI-GA.2110,Students taking this class should already have...,3


In [24]:
# Last five catalog rows.
course_catalog.tail(5)

Unnamed: 0,Number,Prereqs,Points
91,CSCI-UA.0897,Restricted to declared computer science majors...,1 - 4
92,CSCI-UA.0898,Restricted to declared computer science majors...,1 - 4
93,CSCI-UA.0997,Permission of the department. Does not satisfy...,1 - 4
94,CSCI-UA.0998,Permission of the department. Does not satisfy...,1 - 4
95,FRSEM-UA.0597,"Some programming experience in Python, Java, J...",4


In [25]:
# Five randomly chosen catalog rows. A fixed random_state makes the sample
# identical on every Restart & Run All.
course_catalog.sample(5, random_state=0)

Unnamed: 0,Number,Prereqs,Points
21,CSCI-GA.2437,"Prerequisites include experience with Hadoop, ...",3
46,CSCI-GA.3110,Permission of the instructor for master’s stud...,4
45,CSCI-GA.3033,Prerequisites vary according to topic.,3
72,CSCI-UA.0310,Data Structures (CSCI-UA 102); Discrete Mathem...,4
61,MAINT-GA.4747,,NOT SPECIFIED


## 3. Put together both DataFrames

In [26]:
# Left join on 'Number': start from the schedule rows and attach the catalog
# columns (Prereqs, Points) of the course with the same number.
CS_course = course_schedule.merge(course_catalog, how='left', on='Number')

In [27]:
# Show the merged frame (schedule columns followed by Prereqs and Points).
CS_course

Unnamed: 0,Number,Section,Name,Instructor,Time,Prereqs,Points
0,CSCI-GA.1170,001,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:10PM,At least one year of experience with a high-le...,3
1,CSCI-GA.1170,002,Fundamental Algorithms Recitation,Aditya Pandey,R 8:10-9:00PM,At least one year of experience with a high-le...,3
2,CSCI-GA.1180,001,Mathematical Techniques For CS Applications,Parijat Dube,W 7:10-9:10PM,,3
3,CSCI-GA.2110,001,Programming Languages,Cory Plock,M 4:55-6:55PM,Students taking this class should already have...,3
4,CSCI-GA.2110,002,Programming Languages Recitation,Jahnavi Pothineni,R 7:10-8:00PM,Students taking this class should already have...,3
...,...,...,...,...,...,...,...
146,CSCI-UA.0480,061,Special Topics:,Joanna Klukowska,MW 12:30-1:45PM,Topics determine prerequisites.,4
147,CSCI-UA.0480,063,Special Topics: Introduction to Computer Security,Joseph Bonneau,MW 2:00-3:15PM,Topics determine prerequisites.,4
148,CSCI-UA.0480,069,Special Topics: Agile Software Development and...,Amos Bloomberg,MW 12:30-1:45PM,Topics determine prerequisites.,4
149,CSCI-UA.0480,072,Special Topics:,Lerrel Pinto,TR 3:30-4:45PM,Topics determine prerequisites.,4


In [28]:
# Raise pandas' 'display.max_rows' option to 200 for the rest of the session.
pd.set_option('display.max_rows', 200)

In [29]:
# Render the merged frame. `display` is not imported in this notebook;
# presumably it is the built-in provided by IPython/Jupyter — confirm when running elsewhere.
display(CS_course)

Unnamed: 0,Number,Section,Name,Instructor,Time,Prereqs,Points
0,CSCI-GA.1170,1,Fundamental Algorithms,Yevgeniy Dodis,T 7:10-9:10PM,At least one year of experience with a high-le...,3
1,CSCI-GA.1170,2,Fundamental Algorithms Recitation,Aditya Pandey,R 8:10-9:00PM,At least one year of experience with a high-le...,3
2,CSCI-GA.1180,1,Mathematical Techniques For CS Applications,Parijat Dube,W 7:10-9:10PM,,3
3,CSCI-GA.2110,1,Programming Languages,Cory Plock,M 4:55-6:55PM,Students taking this class should already have...,3
4,CSCI-GA.2110,2,Programming Languages Recitation,Jahnavi Pothineni,R 7:10-8:00PM,Students taking this class should already have...,3
5,CSCI-GA.2130,1,Compiler Construction,Joseph Tassarotti,M 7:10-9:10PM,"CSCI-GA 1170, CSCI-GA 2110, and CSCI-GA 2250.",3
6,CSCI-GA.2250,1,Operating Systems,Hubertus Franke,M 7:10-9:10PM,,3
7,CSCI-GA.2250,2,Operating Systems,Yang Tang,T 4:55-6:55PM,,3
8,CSCI-GA.2262,1,Data Communications and Networks,Jean-Claude Franchitti,M 7:10-9:10PM,Students must have a working knowledge of fund...,3
9,CSCI-GA.2270,1,Computer Graphics,Daniele Panozzo,R 4:55-6:55PM,CSCI-GA 1170 and CSCI-UA 140 (or an equivalent...,3


## 4. Conclusion

I ran into several unexpected data formats.

1. When parsing the course schedule page, the text inside some `span` elements contains runs of unnecessary spaces, which made it hard to extract only the content. To deal with this problem I used the `sub` function of the regular-expression module (`re`) to collapse every run of consecutive spaces into a single space (' +' -> ' '): `re.sub(' +', ' ', span.text.strip())`. After that, I split the result on `\n` and take the first piece: `text_info = re.sub(' +', ' ', span.text.strip()).split('\n')`. The reason I take only the first piece is that the remaining pieces are mostly either whitespace only or additional information that we do not really need, so I decided to drop them. However, the `Instructor` column (index 2) sometimes contains unnecessary whitespace followed by a second professor, and I noticed that splitting on `\n` then returns `[first professor, ' ', ' ', second professor]`. Therefore, when handling the instructor column, if the split produces more than one piece (assuming no more than two professors), I join the first and last elements of the list with a comma: `if i == 2 and len(text_info) > 1: text_info = [text_info[0] + ", " + text_info[-1]]`.

2. In addition, the data sometimes contains the invisible zero-width space character `\u200b`. To deal with this issue, I used the `replace` method to replace `\u200b` with an empty string: `course_data = list(map(lambda c: c.replace('\u200b',""),course_data))`.

3. When parsing the course catalog page, some text inside the `p` elements also contains unnecessary spaces. I handled this the same way as before: `text_info = re.sub(' +', ' ', p.text.strip()).split('\n')`. The other columns were all in the expected format, but the `Points` column was harder to handle: some courses have a variable number of points, and some do not list any points at all. My approach was to first extract everything before the string `Points` using the `findall` function of the regular-expression module (`re`): `point_list = re.findall('(.+) (Points)', text_info[0])`. Then, if there is information before the `Points` string (either a single number or a range such as `1 - 4`), I keep that information; if there is nothing before the `Points` string, I store `NOT SPECIFIED` to indicate that the course does not specify its credits.

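Finally, for these particular DataFrames we used `how='left'` when merging. With a left join, every row of the first (left) DataFrame, `course_schedule`, is kept, and the columns of the second (right) DataFrame, `course_catalog`, are attached to the rows whose key, given by the `on` parameter (`Number`), matches. Schedule rows without a matching catalog entry are still kept, with `NaN` in the catalog columns, while catalog courses that do not appear in the schedule are left out.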

In [30]:
# Preview the first schedule rows. The frame is left as the cell's last
# expression so the notebook renders it as a table instead of wrapped plain text.
course_schedule.head()

         Number Section                                         Name  \
0  CSCI-GA.1170     001                       Fundamental Algorithms   
1  CSCI-GA.1170     002            Fundamental Algorithms Recitation   
2  CSCI-GA.1180     001  Mathematical Techniques For CS Applications   
3  CSCI-GA.2110     001                        Programming Languages   
4  CSCI-GA.2110     002             Programming Languages Recitation   

          Instructor           Time  
0     Yevgeniy Dodis  T 7:10-9:10PM  
1      Aditya Pandey  R 8:10-9:00PM  
2       Parijat Dube  W 7:10-9:10PM  
3         Cory Plock  M 4:55-6:55PM  
4  Jahnavi Pothineni  R 7:10-8:00PM  
