# Part 1: Getting Rweb Courses



In [2]:
from requests import request, get, RequestException
from requests.cookies import RequestsCookieJar

from bs4 import BeautifulSoup
import html
import pandas as pd
import time


### Getting cookies and total number of courses
- First make an initial request to get the cookies
- Make another request to get the total number of courses in the Fall 2024 term

In [53]:
# Tokens and cookies

# Seconds to wait on any single request before giving up; without a timeout a
# stalled server would block the kernel indefinitely.
requestTimeout = 30

# NOTE(review): JSESSIONID is not referenced again in the visible cells; the
# cookie jar used below is filled from the POST response instead. Kept so any
# cell that still reads the name keeps working.
JSESSIONID = get("https://registrationssb.ucr.edu", timeout=requestTimeout).cookies["JSESSIONID"]

# Headers for the search requests
jar = RequestsCookieJar()
headers = {
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
}
term = "202440"

# Select the term; the cookies returned by this POST are reused by every
# later search request.
r = request(
    "POST",
    "https://registrationssb.ucr.edu/StudentRegistrationSsb/ssb/term/search?mode=search",
    data={"term": term},
    timeout=requestTimeout,
)
r.raise_for_status()  # fail here rather than on a confusing JSON/KeyError later

jar.update(r.cookies)

# URL parameters

pageOffset = 0 #initial val
pageMaxSize = 500

# Initial request to get totalCount of courses
url = f"https://registrationssb.ucr.edu/StudentRegistrationSsb/ssb/searchResults/searchResults?&txt_term={term}&pageOffset={pageOffset}&pageMaxSize={pageMaxSize}&sortColumn=subjectDescription&sortDirection=asc"
response = request("GET", url, headers=headers, cookies=jar, timeout=requestTimeout)
response.raise_for_status()
totalCount = response.json()["totalCount"]
print("totalCount:", totalCount)



totalCount: 10545


#### Getting the CRNs
The Course Reference Numbers (CRNs) are essential because they are required parameters for the API used to fetch the course descriptions.

In [None]:
# Page through the search results, pageMaxSize records at a time, collecting
# every section returned for the selected term into `courses`.
pageMaxSize = 500  # max request size
courses = []
pageOffset = 0

while True:
    print(len(courses))  # progress: number of records collected so far
    url = f"https://registrationssb.ucr.edu/StudentRegistrationSsb/ssb/searchResults/searchResults?&txt_term={term}&startDatepicker=&endDatepicker=&pageOffset={pageOffset}&pageMaxSize={pageMaxSize}&sortColumn=subjectDescription&sortDirection=asc"
    try:
        # The timeout keeps a stalled connection from hanging the kernel; the
        # resulting Timeout is a RequestException and is handled below.
        response = request("GET", url, headers=headers, cookies=jar, timeout=30)
        response.raise_for_status()
        new_courses = response.json()["data"]
        
        if not new_courses:
            print("No more courses available.")
            break
        
        courses.extend(new_courses)
        
        # A short page means the server has nothing left to return.
        if len(new_courses) < pageMaxSize:
            print("Fetched all available data.")
            break
        
        pageOffset += pageMaxSize
        
        # Stop once the count reported by the earlier totalCount request is reached.
        if len(courses) >= totalCount:
            break

    except RequestException as e:
        # Give up on the first failure; `courses` keeps whatever was collected.
        print(f"An error occurred: {e}")
        break

print(f"Total courses fetched: {len(courses)}")


In [13]:
# Sanity check: the number of records collected must equal the totalCount
# reported by the initial search request.
print(
    "Got all the courses"
    if len(courses) == totalCount
    else "Error: did not get all courses"
)

Got all the courses


#### Removes duplicate courses (discussions and labs)

In [29]:
# Keep only the first record seen for each subjectCourse value; later records
# with the same subjectCourse are skipped.
uniqueCourses = {}

for course in courses:
    if course["subjectCourse"] not in uniqueCourses:
        subjectCourse = course["subjectCourse"]
        # Text fields arrive HTML-escaped, so decode the entities before storing.
        uniqueCourses[subjectCourse] = dict(
            courseReferenceNumber=course["courseReferenceNumber"],
            subjectDescription=html.unescape(course["subjectDescription"]),
            courseTitle=html.unescape(course["courseTitle"]),
        )

print("Number of uniqure courses:", len(uniqueCourses))

Number of uniqure courses: 1685


#### Getting each course description
- Pass the CRN for a course into the POST request in order to fetch the html for the course description
- Then use beautifulsoup to parse the html
- Store data into `output` dictionary
- Estimated wait time: 5 mins

In [54]:
import requests
from bs4 import BeautifulSoup
import time

# Shared state for the description download below:
#   output      - maps each course key to its collected fields
#   counter     - number of descriptions fetched successfully
#   max_retries - maximum number of attempts per course when a request times out
output, counter = {}, 0
max_retries = 5

# Function to fetch course description
def fetch_course_description(course, term, headers, timeout=30):
    """POST to the getCourseDescription endpoint for one course.

    Args:
        course: Mapping that contains a "courseReferenceNumber" entry.
        term: Term code sent as the "term" form field.
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before the attempt counts as
            timed out. Without a timeout the request could block forever and
            the retry branch below would never run.

    Returns:
        The response object on success, or None when every attempt timed out
        or a non-timeout request error occurred.
    """
    crn = course["courseReferenceNumber"]
    retries = 0
    # max_retries is read from module scope.
    while retries < max_retries:
        try:
            response = requests.post(
                "https://registrationssb.ucr.edu/StudentRegistrationSsb/ssb/searchResults/getCourseDescription",
                headers=headers,
                data={"term": term, "courseReferenceNumber": crn},
                timeout=timeout,
            )
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Timeout occurred for course {crn}. Retrying...")
            retries += 1
            # Exponential backoff; skipped after the final attempt because no
            # further request follows it.
            if retries < max_retries:
                time.sleep(2 ** retries)
        except requests.exceptions.RequestException as e:
            # Non-timeout failures (HTTP errors, connection errors) are not retried.
            print(f"Request failed for course {crn}: {e}")
            break
    return None

# Download and parse the description of every unique course, recording the
# result in `output` and counting the successes.
for course_key, course in uniqueCourses.items():
    response = fetch_course_description(course, term, headers)

    if not response:
        # No usable response came back for this course; report it and move on.
        print(f"Failed to fetch description for course {course['courseReferenceNumber']} after {max_retries} retries.")
        continue

    # The endpoint answers with HTML; keep only its text, trimmed and lower-cased.
    soup = BeautifulSoup(response.text, "html.parser")
    description = soup.get_text().strip().lower()
    output[course_key] = {
        "courseReferenceNumber": course["courseReferenceNumber"],
        "courseTitle": course["courseTitle"],
        "subjectDescription": course["subjectDescription"],
        "description": description,
    }
    counter += 1

print(f"Number of course descriptions fetched: {counter}")

Number of course descriptions fetched: 1685


#### Write data to CSV file
- Clean the data and write it to a file called `courses.csv`
- Removing first 2 sentences from course description in order to remove number of units and prerequisites

In [None]:
# One row per entry in `output`; the dictionary keys become the index, which is
# written to the CSV as the "courseNumber" column.
# NOTE(review): the markdown above mentions removing the first two sentences of
# each description, but no such step is visible in this cell - confirm.
df = pd.DataFrame.from_dict(output, orient="index")
df.index.name = "courseNumber"

df.to_csv("courses.csv")

# Every unique course should have produced exactly one row.
if len(uniqueCourses) != len(df):
    print("Error occured: All courses are not in CSV file")
else:
    print("All courses are in CSV file")