In [1]:
# Libraries
import os
import pandas as pd
import glob
import json
import re
import numpy as np

In [2]:
# Scraped input files are globbed from DATA_PATH; merged CSVs are written to MERGED_DATA_PATH.
DATA_PATH = os.path.join("..", "data")
MERGED_DATA_PATH = os.path.join(DATA_PATH, "merged")

# Field delimiter used when the merged CSV files are written and read back.
CSV_SEPARATOR = "|"

# Program code -> program name.
PROGRAMS = {
    901: "INFORMATION SYSTEMS",
    902: "COGNITIVE SCIENCES",
    903: "MODELLING AND SIMULATION",
    904: "INFORMATICS ONLINE",
    905: "SOFTWARE MANAGEMENT",
    906: "MEDICAL INFORMATICS",
    908: "BIOINFORMATICS",
    909: "MULTIMEDIA INFORMATICS",
    910: "CYBER SECURITY",
    911: "DATA INFORMATICS",
}

In [3]:
# Catalogue files listing every course opened by a department
# (file names end in "all_courses.csv").
courses_opened_by_a_specific_department_path = os.path.join(DATA_PATH, "*all_courses.csv")
courses_opened_by_a_specific_department = glob.glob(
    courses_opened_by_a_specific_department_path
)

# Show the first few matches, followed by one blank line.
print(courses_opened_by_a_specific_department[:3], end="\n\n")

['../data/908_all_courses.csv', '../data/911_all_courses.csv', '../data/901_all_courses.csv']



In [4]:
# Per-course detail files (JSON),
# e.g. course content, course objectives, learning outcomes...
course_contents_path = os.path.join(DATA_PATH, "*.json")
course_contents = glob.glob(course_contents_path)

# Show the first few matches as a sanity check.
print(course_contents[:3])

['../data/9060537.json', '../data/9030515.json', '../data/9020515.json']


In [5]:
# Per-department course lists for the 2024-2025 fall semester.
# The "20241" file-name prefix means "Semester: 2024-2025 Fall".
courses_opened_in_2024_2025_Fall_path = os.path.join(DATA_PATH, "20241-*.csv")
courses_opened_in_2024_2025_Fall = glob.glob(
    courses_opened_in_2024_2025_Fall_path
)

# Show the first few matches as a sanity check.
print(courses_opened_in_2024_2025_Fall[:3])

['../data/20241-904.csv', '../data/20241-910.csv', '../data/20241-911.csv']


### Scraped Data Contents
![Scraped data contents](img/scraped_data.png)

In [6]:
def trim_before_pipe(str):
    """Return the part of ``str`` that follows its first '|' character.

    A string without any pipe character is returned unchanged.
    """
    _head, separator, remainder = str.partition('|')
    # An empty separator means no pipe was found, so keep the whole input.
    return remainder if separator else str


def trim_buzz_words(str):
    """Remove the boilerplate heading phrases from ``str`` and return the result.

    The phrases are removed one after another, in the order listed below.
    Note that the first phrase includes its leading space.
    """
    headings = (
        " Course Syllabus",
        "Course Objectives",
        "Course Learning Outcomes",
    )
    for heading in headings:
        str = str.replace(heading, "")
    return str


def find_in(program, collection):
    """Return the paths in ``collection`` whose file name belongs to ``program``.

    A path is kept when its file name either
      * starts with the program code (e.g. ``901_all_courses.csv``,
        ``9010537.json``), or
      * contains the code as a standalone number that is not part of a longer
        digit run (e.g. ``20241-901.csv``).

    A plain substring test on the whole path would also match unrelated files
    such as ``9029010.json`` for program 901, so only the file name is
    inspected and the code must be anchored as described above.

    Parameters
    ----------
    program : int or str
        Program code; it is converted with ``str()`` before matching.
    collection : iterable of str
        File paths to filter.

    Returns
    -------
    list of str
        Matching paths, in their original order.
    """
    code = str(program)
    # The code must not be preceded or followed by another digit.
    standalone_code = re.compile(rf"(?<!\d){re.escape(code)}(?!\d)")

    def belongs_to_program(path):
        file_name = os.path.basename(path)
        return file_name.startswith(code) or standalone_code.search(file_name) is not None

    return [path for path in collection if belongs_to_program(path)]

def get_course_name(df_all_courses, course_code):
    """Look up a course's name by searching for its code in the ``URL`` column.

    Parameters
    ----------
    df_all_courses : pandas.DataFrame
        Must provide the columns ``URL`` and ``Course Name``.
    course_code : str or int
        Course code to search for; it is converted with ``str()``.

    Returns
    -------
    The ``Course Name`` of the first row whose URL contains the code, or an
    empty string when no row matches.
    """
    # regex=False: the code is matched literally, not as a regular expression.
    # na=False: rows with a missing URL count as non-matching instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    matches = df_all_courses["URL"].str.contains(
        str(course_code), regex=False, na=False)

    if not matches.any():
        return ""

    return df_all_courses.loc[matches, "Course Name"].iloc[0]



def merge_data(all_courses_file, semester_courses_file, courses_contents_files):
    """Merge a semester course list with the per-course detail JSON files.

    Parameters
    ----------
    all_courses_file : str
        CSV path (read with pandas' default separator); used only to look up
        the name of a course through ``get_course_name``.
    semester_courses_file : str
        CSV path (read with pandas' default separator) of the semester's
        courses; it must contain a ``Course Code`` column.
    courses_contents_files : iterable of str
        JSON paths, one per course. Each file must contain ``Course Code``;
        files for courses missing from the semester list must also contain
        ``Offered Semester``, ``ECTS Credit`` and ``Course Coordinator``.

    Returns
    -------
    pandas.DataFrame
        The semester rows extended with the detail columns listed in
        ``attributes``, followed by one extra row for every course that has a
        JSON file but does not appear in the semester list. Detail values
        absent from a JSON file are filled with "Not Available".
    """
    # Detail columns added to the semester data.
    attributes = ["Language of Instruction",
                  "Level of Study",
                  "Course Objectives",
                  "Course Learning Outcomes",
                  "Course Content",
                  "Prerequisite"]
    # Free-text fields that carry heading boilerplate and a "...|" prefix.
    text_fields = ("Course Objectives", "Course Learning Outcomes")

    df_all_courses = pd.read_csv(all_courses_file)
    df_merged = pd.read_csv(semester_courses_file)

    for attr in attributes:
        df_merged[attr] = None

    # Rows for courses that are not in the semester list, keyed by course code.
    # They are appended in a single concat after the loop.
    extra_rows = {}

    for content_file in courses_contents_files:
        # Keep the file open only while it is being parsed.
        with open(content_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        course_code = int(data["Course Code"])

        details = {key: data.get(key, "Not Available") for key in attributes}
        for field in text_fields:
            details[field] = trim_before_pipe(trim_buzz_words(details[field]))

        # The row mask does not depend on the attribute, so compute it once.
        in_semester = df_merged["Course Code"] == course_code

        if in_semester.any():
            for key, value in details.items():
                df_merged.loc[in_semester, key] = value
        elif course_code in extra_rows:
            # A second file for an already collected course only refreshes
            # its detail columns.
            extra_rows[course_code].update(details)
        else:
            # Course is not offered in this semester: build a new row that
            # reuses the cleaned details, so a missing detail key falls back
            # to "Not Available" here as well instead of raising KeyError.
            row = dict.fromkeys(df_merged.columns, np.nan)
            row.update(details)
            row.update({
                "Course Code": course_code,
                "Semester": data["Offered Semester"],
                "ECTS Credit": data["ECTS Credit"],
                "Course Name": get_course_name(df_all_courses, data["Course Code"]),
                "Instructor Name": data["Course Coordinator"],
            })
            extra_rows[course_code] = row

    if extra_rows:
        df_merged = pd.concat(
            [df_merged, pd.DataFrame(list(extra_rows.values()))],
            ignore_index=True)

    return df_merged

In [7]:
# Write one merged CSV per program into MERGED_DATA_PATH.
# Create the output directory first so that to_csv does not fail when the
# notebook is run on a fresh checkout.
os.makedirs(MERGED_DATA_PATH, exist_ok=True)

for program in PROGRAMS:
    # Catalogue file of the program, e.g. 901_all_courses.csv
    all_courses_file = find_in(
        program, courses_opened_by_a_specific_department)[0]

    # Semester course list of the program, e.g. 20241-901.csv
    semester_courses_file = find_in(
        program, courses_opened_in_2024_2025_Fall)[0]

    # Course-detail JSON files matched for the program
    courses_contents_files = find_in(program, course_contents)

    merged_data = merge_data(
        all_courses_file, semester_courses_file, courses_contents_files)
    filename = os.path.join(MERGED_DATA_PATH, f"{program}-merged.csv")
    merged_data.to_csv(filename, sep=CSV_SEPARATOR, index=False)


In [8]:
# Collect the merged CSV files found in MERGED_DATA_PATH and display the list.
merged_path = os.path.join(MERGED_DATA_PATH, "*merged.csv")
merged_files = glob.glob(merged_path)
merged_files

['../data/merged/902-merged.csv',
 '../data/merged/905-merged.csv',
 '../data/merged/908-merged.csv',
 '../data/merged/911-merged.csv',
 '../data/merged/903-merged.csv',
 '../data/merged/906-merged.csv',
 '../data/merged/910-merged.csv',
 '../data/merged/901-merged.csv',
 '../data/merged/904-merged.csv',
 '../data/merged/909-merged.csv']

In [9]:
# Preview the first five rows of the first merged file returned by glob.
pd.read_csv(merged_files[0], sep=CSV_SEPARATOR).head(n=5)

Unnamed: 0,Semester,Program Code,Program Short Name,Course Code,Course Name,Credit,ECTS Credit,Course Section,Capacity,Day1,Start Hour1,End Hour1,Instructor Name,Instructor Title,Language of Instruction,Level of Study,Course Objectives,Course Learning Outcomes,Course Content,Prerequisite
0,2024-2025 Fall,902.0,COGS,9020501,ALGORITHMIC STRUCTURES IN COGNITION,3.0,8.0,1.0,25.0,Wednesday,14:40,17:30,UMUT ÖZGE,Assist.Prof.Dr,English,Graduate,PLEASE VISIT THE COURSE WEBSITE AND COME TO TH...,"By the end of the semester, a successful stude...",Natural language and linguistic knowledge. Lan...,Not Available
1,2024-2025 Fall,902.0,COGS,9020507,COGNITIVE SCIENCE PRIMER I,1.0,5.0,1.0,25.0,Monday,12:40,13:30,MURAT PERİT ÇAKIR,Assoc.Prof.Dr.,English,Graduate,,,The course covers topics which are considered ...,Not Available
2,2024-2025 Fall,902.0,COGS,9020515,ARTIFICIAL INTELLIGENCE FOR COGNITIVE SCIENCE,3.0,8.0,1.0,15.0,Wednesday,08:40,11:30,BARBAROS YET,Assoc.Prof.Dr.,English,Graduate,"At the end of this course, the students will k...",The student will have learned the techniques f...,Fundamental Techniques of Artificial Intellige...,Not Available
3,2024-2025 Fall,902.0,COGS,9020532,THEORETICAL LINGUISTICS,3.0,8.0,1.0,25.0,Thursday,14:40,17:30,UMUT ÖZGE,Assist.Prof.Dr,English,Graduate,PLEASE VISIT THE COURSE WEBSITE AND COME TO TH...,By the end of the course students are expected...,"A survey of the history of linguistics, sound-...",Not Available
4,2024-2025 Fall,902.0,COGS,9020536,RESEARCH METHODS AND STATISTICS FOR COGNITIVE ...,3.0,8.0,1.0,15.0,Tuesday,11:40,14:30,MURAT PERİT ÇAKIR,Assoc.Prof.Dr.,English,Graduate,The major objective of this course is to enabl...,At the end of the course students will be able...,Research methods: The students will be introdu...,Not Available
