In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import pickle

In [3]:
with open('example.htm', 'r') as file:
    soup = BeautifulSoup(file.read(),'lxml')

In [54]:
def gen_database(filename: str,season: str) -> dict:
    ''' 
    Creates a dictionary of units with their statistics.

    :param filename: str filepath to read from.
    :param save_filename: str filepath to write to.
    :param sem: adds the semester and year in which the unit was done. e.g 2020_S1
    :output: dictionary.

    '''
    
    with open(filename, "r") as f:
        contents = f.read()
        soup = BeautifulSoup(contents, "lxml")

    database = {}

    for article in soup.find_all("article"):
        # invited number first
        base = article.find(
            "div", attrs={"class": "CrossCategoryBlockRow TableContainer"}).find("tbody")
        
        invited = int(
            base
            .find("tr", attrs={'class': 'CondensedTabularOddRows'})
            .find('td')
            .text)
        
        responded = int(
            base
            .find("tr", attrs={'class': 'CondensedTabularEvenRows'})
            .find('td')
            .text)

        if responded <= 1: continue

        entry = {}
        entry["Responses"] = responded
        entry["Invited"] = invited
        entry["Season"] = season
        entry['Response Rate'] = responded/invited*100

        # Full unit code
        code = article.find("table").find_all("tr")[3].text

        # Filter out MALAYSIA, COMPOSITE, ALFRED, SAFRICA
        if any(location in code for location in ["MALAYSIA","ALFRED","SAFRICA","FLEXIBLE"]):
            continue

        entry["code"] = code
        entry["unit_code"] = code.split("_")[0][1:]
        # Do not display on datatable, used only for queries
        try:
            entry["Level"] = int(entry["unit_code"][3]) 
        except ValueError: 
            entry["Level"] = 0
        scores = []
        # Response categories, retrieve all tables
        for item_num,divs in enumerate(
            article.find_all("div", attrs={"class": "FrequencyBlock_HalfMain"})):

            score_table = divs.find_all("table")[1].tbody.find_all("tr") # Split by stats and chart

            # Extract the means and medians from their td element
            mean, median = list(
                map(lambda x: x.find("td").text,
                    score_table
                )
            )[1:3]

            # Attempt conversion, not sure if this activates...?
            try:
                mean, median = float(mean), float(median)
                entry[f'I{item_num+1}'] = [mean,median]
                scores.append([mean,median])
            except ValueError:
                print(f"score could not be converted: {code}, {mean}, {median}")

        entry['agg_score'] = [sum(map(
            lambda item:item[measure],scores))/len(scores)
            for measure in range(2)
            ]

        database[code] = entry
        # Serialize after each point
        '''
        with open(save_filename, "wb") as f:
            pickle.dump(database, f, pickle.HIGHEST_PROTOCOL)
        '''

    return pd.DataFrame(database).T

In [55]:
thing = gen_database('batch_conversion.htm','2020_S1')


In [19]:
with open('test.pkl','wb') as file:
    pickle.dump(thing,file,pickle.HIGHEST_PROTOCOL)

In [20]:
with open('test.pkl','rb') as file:
    db = pickle.load(file)

In [2]:
from setu import gen_database

In [3]:
setu_2019 = gen_database("D://Programming//Python//data_analysis//SETool//conversion//2019_S1_SETU.html","setudb_2019_S1.pkl","2019_S1")

In [7]:
setu_2019.columns

Index(['Responses', 'Invited', 'Season', 'Response Rate', 'code', 'unit_code',
       'Level', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10',
       'I11', 'I12', 'I13', 'agg_score'],
      dtype='object')

In [9]:
setu_2019['unit_code']

\nACB1100_PENINSULA_ESP-EC_ON_S1-01\n       ACB1100
\nACB1100_PENINSULA_ON-CAMPUS_ON_S1-01\n    ACB1100
\nACB1200_PENINSULA_ON-CAMPUS_ON_S1-01\n    ACB1200
\nACB2020_PENINSULA_ON-CAMPUS_ON_S1-01\n    ACB2020
\nACB2491_PENINSULA_ON-CAMPUS_ON_S1-01\n    ACB2491
                                             ...   
\nVCO3206_CAULFIELD_ON-CAMPUS_ON_S1-01\n    VCO3206
\nVCO3403_CAULFIELD_ON-CAMPUS_ON_S1-01\n    VCO3403
\nVPR1011_CAULFIELD_ON-CAMPUS_ON_S1-01\n    VPR1011
\nVPR2011_CAULFIELD_ON-CAMPUS_ON_S1-01\n    VPR2011
\nVPR3011_CAULFIELD_ON-CAMPUS_ON_S1-01\n    VPR3011
Name: unit_code, Length: 1733, dtype: object

In [10]:
with open("D://Programming//Python//data_analysis//SETool//assets//setudb_total.pkl",'rb') as file:
    total_db = pickle.load(file)

In [13]:
total = pd.concat([total_db,setu_2019]).reset_index(drop=True)

In [14]:
with open('setudb_2019S1-2020S1.pkl','wb') as file:
    pickle.dump(total, file, pickle.HIGHEST_PROTOCOL)

In [16]:
total[total['unit_code']=='MTH2140']

Unnamed: 0,Responses,Invited,Season,Response Rate,code,unit_code,Level,I1,I2,I3,...,I5,I6,I7,I8,I9,I10,I11,I12,I13,agg_score
3235,5,22,2020_S1,22.727273,MTH2140_CLAYTON_ON-CAMPUS_ON_S1-01,MTH2140,2,"[4.4, 4.67]","[4.6, 4.88]","[4.4, 4.33]",...,"[4.2, 4.67]","[4.2, 4.25]","[3.8, 4.0]","[4.0, 4.0]","[4.0, 4.0]","[4.0, 4.0]","[4.0, 4.0]","[4.2, 4.25]","[4.0, 4.0]","[4.107692307692308, 4.183076923076923]"
6831,10,24,2021_S1,41.666667,MTH2140_CLAYTON_ON-CAMPUS_ON_S1-01,MTH2140,2,"[4.2, 4.5]","[4.4, 4.33]","[4.3, 4.3]",...,"[4.3, 4.3]","[4.2, 4.25]","[4.3, 4.3]","[4.1, 4.17]","[4.3, 4.5]","[4.1, 4.1]","[4.1, 4.1]","[3.9, 4.0]","[3.89, 4.0]","[4.168461538461538, 4.2323076923076925]"
8553,11,33,2019_S1,33.333333,\nMTH2140_CLAYTON_ON-CAMPUS_ON_S1-01\n,MTH2140,2,"[4.64, 4.71]","[4.55, 4.58]","[4.73, 4.81]",...,"[4.45, 4.58]","[4.45, 4.58]","[4.36, 4.4]","[4.36, 4.4]","[4.64, 4.71]","[4.45, 4.58]","[4.55, 4.58]","[4.27, 4.19]","[4.11, 4.13]","[4.47, 4.525384615384615]"


In [17]:
total.shape

(8857, 21)