# What are the key trends and factors influencing the enrolment in undergraduate programs at LSE? 


## Data Acquisition

### Data from the LSE course calendar: student numbers for each course over the years

In [128]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_course_numbers(url, year, wait_seconds=0.4):
    """Scrape the student number of every course listed on a course-guide index page.

    The index page at ``url`` is downloaded and the text of each link inside its
    ``right-container`` div is reduced to a course code (the first six characters
    of the link text, stripped). For every code, the course guide of the
    ``year``-``year + 1`` calendar is opened in Chrome and the second paragraph
    of the ``keyFacts-Content`` div is read; the text after its last ``:`` is
    stored as that course's student number.

    Parameters
    ----------
    url : str
        Address of the index page that lists the courses.
    year : int
        First year of the academic calendar whose course guides are visited
        (``year=2023`` reads the ``calendar2023-2024`` pages).
    wait_seconds : float, default 0.4
        How long to wait for the key-facts paragraph to appear on each course
        page before treating the course as missing for that year.

    Returns
    -------
    pandas.DataFrame
        Indexed by course code, one row per course whose key-facts paragraph
        was found. The scraped text is stored as-is (a string such as a number
        or ``"Unavailable"``). The frame is empty if nothing was found.

    Raises
    ------
    requests.HTTPError
        If the index page responds with an error status.
    ValueError
        If the index page has no ``right-container`` div.
    """
    # Function-scope import so the fix does not depend on the notebook's import cell.
    from selenium.common.exceptions import WebDriverException

    # A timeout stops a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find("div", attrs={"class": "right-container"})
    if content is None:
        raise ValueError(f"no 'right-container' div found at {url}")

    course_list = []
    for link in content.find_all("a", href=True):
        text = link.text.strip()
        if len(text) > 2:
            # Codes are taken from the first six characters of the link text.
            course_list.append(text[0:6].strip())
    # The same course can be linked more than once; visit each code only once
    # while keeping the order of first appearance.
    course_list = list(dict.fromkeys(course_list))

    # NOTE(review): the driver path is Windows-specific and relative to the
    # working directory; left unchanged to keep the existing setup working.
    service = Service("chromedriver.exe")
    driver = webdriver.Chrome(service=service)

    course_enrolment = {}
    try:
        for course in course_list:
            course_url = f"https://www.lse.ac.uk/resources/calendar{year}-{year+1}/courseGuides/{course[0:2]}/{year}_{course}.htm"
            driver.get(course_url)
            try:
                total_students = WebDriverWait(driver, wait_seconds).until(
                    EC.presence_of_element_located((By.XPATH, "//div[@id='keyFacts-Content']/p[2]"))
                ).text
            except WebDriverException:
                # Best effort: a course without the key-facts paragraph (for
                # example one not offered that year) is skipped. Only Selenium
                # errors are swallowed, so Ctrl-C still interrupts the scrape.
                continue
            course_enrolment[course] = total_students.split(":")[-1].strip()
    finally:
        # Always close the browser, even when the scrape is interrupted or fails.
        driver.quit()

    return pd.DataFrame.from_dict(course_enrolment, orient="index")
# Scrape one enrolment table per calendar year, newest first (2024 down to 2016).
# Every call reads its course list from the same 2024-2025 undergraduate index;
# only the calendar year used for the individual course-guide pages changes.
# NOTE(review): confirm that reusing the 2024-2025 course list for earlier years is intended.
undergraduate_index = "https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/undergraduate.htm"
(
    course_numbers_2024_df,
    course_numbers_2023_df,
    course_numbers_2022_df,
    course_numbers_2021_df,
    course_numbers_2020_df,
    course_numbers_2019_df,
    course_numbers_2018_df,
    course_numbers_2017_df,
    course_numbers_2016_df,
) = [
    get_course_numbers(undergraduate_index, calendar_year)
    for calendar_year in range(2024, 2015, -1)
]
print("doneeeeeeeeeeeeeeeeeeeeeeeee")

cant find LSE100
cant find LSE100
cant find LSE100
cant find AC205
cant find AC206
cant find AN253
cant find AN273
cant find AN285
cant find AN286
cant find AN287
cant find AN288
cant find AN379
cant find AN389
cant find AN390
cant find AN395
cant find AN3A1
cant find DS205
cant find EC328
cant find FM210
cant find FM211
cant find FM214
cant find FM215
cant find FM310
cant find FM311
cant find GV333
cant find GV362
cant find GV3L6
cant find HY248
cant find HY249
cant find HY344
cant find IR374
cant find IR392
cant find LL150
cant find LL200
cant find LL213
cant find LL216
cant find LL217
cant find LL220
cant find LL224
cant find LL225
cant find LL228
cant find LL229
cant find LL243
cant find LL244
cant find LL245
cant find LL279
cant find LL280
cant find LL304
cant find LL306
cant find LL307
cant find LL332
cant find LSE100
cant find LSE100
cant find LSE100
cant find MA221
cant find PH241
cant find SO243
cant find ST101
cant find ST111
cant find ST360
cant find AC105
cant find AC106
ca

Data cleaning: many courses are missing student numbers for some years, so one option is to keep only the courses that have recorded student numbers for at least four years and base the analysis on those.

In [137]:
# Print the scraped tables for the 2021 and 2023 calendars one after the other.
for enrolment_frame in (course_numbers_2021_df, course_numbers_2023_df):
    print(enrolment_frame)

                 0
AC102          579
AC103          369
AC311          120
AC312          132
AC331          179
...            ...
ST311  Unavailable
ST312  Unavailable
ST326           60
ST327           62
ST330           74

[439 rows x 1 columns]
                 0
AC102          591
AC103          224
AC105  Unavailable
AC106  Unavailable
AC311          146
...            ...
ST313           13
ST314  Unavailable
ST326           53
ST327           61
ST330           67

[511 rows x 1 columns]


## Data from Tableau: number of students per year (to be used for dropout rates)

In [15]:
import tabula # you need to first install it
import pandas as pd
import warnings
# Silence every warning from here on; note that this filter is process-wide,
# not limited to the PDF parsing below.
warnings.simplefilter('ignore')

# Read all tables from each year's PDF (2016-2023) and stack them into a
# single DataFrame per year, keyed by year.
student_data_dict = {}
for year in range(2016, 2024):
    pdf_tables = tabula.read_pdf(f"data/student_data_{year}.pdf", pages='all')
    student_data_dict[year] = pd.concat(pdf_tables, ignore_index=True)

    

In [17]:
# Print the whole year -> DataFrame mapping as plain text (one frame per year).
print(student_data_dict)

{2016:                      BA in Anthropology and Law    1st Year    17  \
0                                           NaN    2nd Year  20.0   
1                                           NaN  Final Year  16.0   
2                                           NaN       Total  53.0   
3    BSc in Business Mathematics and Statistics    1st Year  22.0   
4                                           NaN    2nd Year  34.0   
..                                          ...         ...   ...   
131                                         NaN       Total   NaN   
132                                         NaN    1st Year   NaN   
133                                         NaN    2nd Year   NaN   
134                                         NaN  Final Year   NaN   
135                                         NaN       Total   NaN   

    BSc in Accounting and Finance    129  
0                             NaN    NaN  
1                             NaN    NaN  
2                             NaN  

In [None]:
# TODO: each value in student_data_dict is one year's DataFrame; it is probably best to load them into separate SQL tables and then combine them with SQL.

## Data from HESA (Higher Education Statistics Agency): already clean

In [41]:
# Load one staff table per year (2016-2023), keyed by year. The first 14
# lines of each CSV are skipped before the header row is read.
staff_data_dict = {}
for year in range(2016, 2024):
    staff_csv_path = f"data/staff_data_{year}.csv"
    staff_data_dict[year] = pd.read_csv(staff_csv_path, skiprows=14)



In [43]:
# Display the table stored under the 2018 key as a spot check of the loaded data.
staff_data_dict[2018]

Unnamed: 0,UKPRN,HE Provider,"Managers, directors and senior officials",Professional occupations,Associate professional occupations,Clerical and manual occupations,Total academic staff
0,10007783.0,The University of Aberdeen,0,1425,5,0,1430
1,10007849.0,Abertay University,0,210,5,0,215
2,10007856.0,Aberystwyth University,0,745,5,0,750
3,10000163.0,AECC University College,0,55,0,0,55
4,10000291.0,Anglia Ruskin University,10,870,20,0,900
...,...,...,...,...,...,...,...
162,10007833.0,Wrexham University,0,205,0,0,205
163,10007657.0,Writtle University College,0,100,0,0,105
164,10007713.0,York St John University,10,355,15,0,380
165,10007167.0,The University of York,0,1935,0,0,1935


In [None]:
# TODO: extract the LSE row from each year's DataFrame using SQL, then show the trend as a bar chart or line plot.