## Preliminary study of the REF 2021 submissions data

In [1]:
import sys
import pandas as pd

# make the project's source directory importable; this has to run before the
# two local imports below (presumably `read_write` and `codebook` live in
# ../src/ relative to this notebook -- confirm)
sys.path.append('../src/')
import read_write as rw
import codebook as cb

# set to True to print details of the processing (e.g. which file was read
# and which columns were dropped in the cells below)
verbose = True

## Explore the `Outputs` table

In [7]:
# load the raw `Outputs` extract into `dset`
sheet_name = "Outputs"
fname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"

# no index column is taken from the file; column 0 is forced to `str`
read_options = {"index_col": None, "dtype": {0: str}}
dset = pd.read_csv(fname, **read_options)

if verbose:
    print(f"Read {fname}")


Read ../data/raw/extracted/Outputs.csv


## Explore the `ResearchGroups` table

In [6]:
# load the raw `ResearchGroups` extract and discard the columns listed below
sheet_name = "ResearchGroups"
columns_to_drop = [
    'Multiple submission letter',
    'Multiple submission name',
    'Joint submission',
]
fname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"

# no index column is taken from the file; column 0 is forced to `str`
dset = pd.read_csv(fname, dtype={0: str}, index_col=None)
if verbose:
    print(f"Read {fname}")

# `errors='raise'` is the pandas default, so a missing column raises KeyError
dset = dset.drop(columns=columns_to_drop)

if verbose:
    # single call, newline-separated: blank line, header, detail, blank line
    print(
        "",
        "Processed the data to",
        f"- drop {columns_to_drop}",
        "",
        sep="\n",
    )

# get counts of unique values
n_records = len(dset)

# columns to count, in the same order as the names unpacked below
unique_count_columns = (
    cb.COL_PANEL_CODE,
    cb.COL_UOA_NUMBER,
    cb.COL_UOA_NAME,
    cb.COL_INST_CODE,
    cb.COL_INST_NAME,
    cb.COL_RG_CODE,
    cb.COL_RG_NAME,
)

# dropna=False counts a missing value as one distinct value,
# which is what len(Series.unique()) does
(
    n_panels,
    n_uoa_numbers,
    n_uoa_names,
    n_inst_codes,
    n_inst_names,
    n_rg_codes,
    n_rg_names,
) = (dset[column].nunique(dropna=False) for column in unique_count_columns)

# simple validation: the number of distinct codes must equal the number of
# distinct names, for institutions and for units of assessment
count_checks = (
    (
        n_inst_codes != n_inst_names,
        f"number of institution codes ({n_inst_codes}) "
        f"does not match number of institution names ({n_inst_names})",
    ),
    (
        n_uoa_numbers != n_uoa_names,
        f"number of UOA codes ({n_uoa_numbers}) "
        f"does not match number of UOA names ({n_uoa_names})",
    ),
)

# keep only the messages of the checks that failed, in check order
errors = [message for failed, message in count_checks if failed]
validated = not errors

print(f"Data validated : {validated}")
if errors:
    print("ERRORS:")
    print(*(f"- {message}" for message in errors), sep="\n")


# some info about the data: one line per counter, preceded by a blank line
summary_lines = [
    f"Records      : {n_records}",
    f"Panels       : {n_panels}",
    f"UOA          : {n_uoa_numbers}",
    f"RG types     : {n_rg_codes}",
    f"Institutions : {n_inst_codes}",
    f"RG groups    : {n_rg_names}",
]
print()
print("\n".join(summary_lines))

# assign the panel names by looking each panel code up in cb.PANEL_NAMES
panel_codes = dset[cb.COL_PANEL_CODE]
dset[cb.COL_PANEL_NAME] = panel_codes.map(cb.PANEL_NAMES)

# section header, preceded by a blank line
print(
    "",
    "Submissions by panel split by UOA",
    "---------------------------------",
    sep="\n",
)

# one table per panel: number of rows for each UOA name within that panel
for panel_code, panel_name in cb.PANEL_NAMES.items():
    print(f"\nPanel {panel_code} : {panel_name}")
    panel_rows = dset[panel_codes == panel_code]
    uoa_counts = panel_rows.value_counts(cb.COL_UOA_NAME)
    dset_stats = uoa_counts.to_frame(name='Submissions')
    print(dset_stats.to_string())

Read ../data/raw/extracted/ResearchGroups.csv

Processed the data to
- drop ['Multiple submission letter', 'Multiple submission name', 'Joint submission']

Data validated : True

Records      : 2036
Panels       : 4
UOA          : 34
RG types     : 32
Institutions : 83
RG groups    : 1788

Submissions by panel split by UOA
---------------------------------

Panel A : Medicine, health and life sciences
                                                            Submissions
Unit of assessment name                                                
Allied Health Professions, Dentistry, Nursing and Pharmacy          214
Psychology, Psychiatry and Neuroscience                             119
Biological Sciences                                                 105
Clinical Medicine                                                    95
Public Health, Health Services and Primary Care                      57
Agriculture, Food and Veterinary Sciences                            41

Panel B : Physical