## Preliminary study of the REF 2021 submissions data

In [1]:
# import the necessary packages
import sys
import pandas as pd
import importlib

# set the path to the source directory
sys.path.append('../src/')
import read_write as rw
from codebook import panel_names, uoa_names

# Verbosity switch: forwarded to the import helper below and checked again
# by later cells before they print details of the processing.
verbose = True

# Open the REF 2021 submissions workbook through the project's read_write
# helper.
# NOTE(review): `dobj` is later used as `dobj.parse(sheet_name, ...)`, so it
# is presumably a pandas ExcelFile-like handle -- confirm in src/read_write.py.
dobj = rw.import_excel(verbose=verbose)

Imported ../data/raw/REF-2021-Submissions-All-2022-07-27.xlsx
Datasets
- Outputs
- ImpactCaseStudies
- ResearchDoctoralDegreesAwarded
- ResearchIncome
- ResearchIncomeInKind
- ResearchGroups
- InstitutionCovid19Statements


## Explore the `ResearchGroups` table

In [4]:
# Sheet to explore and the submission-level columns not needed for it.
sheet_name = "ResearchGroups"
columns_to_drop = [
    'Multiple submission letter',
    'Multiple submission name',
    'Joint submission',
]

# Parse the sheet (column labels are taken from row index 4, 'NA' cells are
# read as missing) and discard the unwanted columns in the same expression,
# so re-running the cell always starts from a freshly parsed frame.
dset = (
    dobj
    .parse(sheet_name, header=4, index_col=None, na_values=['NA'])
    .drop(columns=columns_to_drop)
)

if verbose:
    # one call, three lines: heading, dropped columns, trailing blank line
    print("Processed the data to", f"- drop {columns_to_drop}", "", sep="\n")

# Column labels of the ResearchGroups sheet, grouped as code/name pairs.
col_panel_code, col_panel_name = 'Main panel', 'Main panel name'
col_uoa_number, col_uoa_name = 'Unit of assessment number', 'Unit of assessment name'
col_inst_code, col_inst_name = 'Institution UKPRN code', 'Institution name'
col_rg_code, col_rg_name = 'Research group code', 'Research group name'

# Number of rows, followed by the number of distinct values per column.
# dropna=False keeps a missing value as one distinct entry, exactly as
# counting the elements of .unique() would.
n_records = len(dset)
n_panels = dset[col_panel_code].nunique(dropna=False)
n_uoa_numbers = dset[col_uoa_number].nunique(dropna=False)
n_uoa_names = dset[col_uoa_name].nunique(dropna=False)
n_inst_codes = dset[col_inst_code].nunique(dropna=False)
n_inst_names = dset[col_inst_name].nunique(dropna=False)
n_rg_codes = dset[col_rg_code].nunique(dropna=False)
n_rg_names = dset[col_rg_name].nunique(dropna=False)

# Simple validation: each code column must have as many distinct values as
# its companion name column; a mismatch points at inconsistent labelling.
errors = []
if n_inst_codes != n_inst_names:
    # The message must interpolate the n_inst_* counters defined in this
    # cell; any other name would raise NameError precisely when this check
    # fails, hiding the validation error it is meant to report.
    errors.append(
        f"number of institution codes ({n_inst_codes}) "
        f"does not match number of institution names ({n_inst_names})"
    )
if n_uoa_numbers != n_uoa_names:
    errors.append(
        f"number of UOA codes ({n_uoa_numbers}) "
        f"does not match number of UOA names ({n_uoa_names})"
    )

# The data is considered valid only when no check recorded an error.
validated = not errors

print(f"Data validated : {validated}")
if not validated:
    print("ERRORS:")
    for error in errors:
        print(f"- {error}")


# Headline counts for the sheet, emitted as one block: a leading blank line
# followed by one aligned "label : value" row per counter.
print(
    "\n".join(
        [
            "",
            f"Records      : {n_records}",
            f"Panels       : {n_panels}",
            f"UOA          : {n_uoa_numbers}",
            f"RG types     : {n_rg_codes}",
            f"Institutions : {n_inst_codes}",
            f"RG groups    : {n_rg_names}",
        ]
    )
)

# Add a panel-name column by looking each panel code up in `panel_names`.
dset[col_panel_name] = dset[col_panel_code].map(panel_names)

# Section header: blank line, title, underline.
print(
    "",
    "Submissions by panel split by UOA",
    "---------------------------------",
    sep="\n",
)

# One table per panel listed in `panel_names`: count that panel's records
# per unit-of-assessment name.
for panel_code, panel_label in panel_names.items():
    print(f"\nPanel {panel_code} : {panel_label}")
    is_in_panel = dset[col_panel_code] == panel_code
    dset_stats = dset[is_in_panel].value_counts(col_uoa_name)
    dset_stats = dset_stats.to_frame(name='Submissions')
    print(dset_stats.to_string())


Processed the data to
- drop ['Multiple submission letter', 'Multiple submission name', 'Joint submission']

Data validated : True

Records      : 2036
Panels       : 4
UOA          : 34
RG types     : 32
Institutions : 83
RG groups    : 1788

Submissions by panel split by UOA
---------------------------------

Panel A : Medicine, health and life sciences
                                                            Submissions
Unit of assessment name                                                
Allied Health Professions, Dentistry, Nursing and Pharmacy          214
Psychology, Psychiatry and Neuroscience                             119
Biological Sciences                                                 105
Clinical Medicine                                                    95
Public Health, Health Services and Primary Care                      57
Agriculture, Food and Veterinary Sciences                            41

Panel B : Physical sciences, engineering and mathematics
        