## Preliminary study of the REF 2021 submissions data

In [1]:
import sys
import pandas as pd
import importlib

# set the path to the source directory
# NOTE: '../src/' is a relative entry, so the local imports below resolve
# against the current working directory -- presumably the notebook is meant
# to be launched from its own folder (TODO confirm).
sys.path.append('../src/')
# project-local helpers, as used by the cells in this notebook:
#   rw  - path constants for the raw / processed data files
#   vis - count tables (calculate_and_visualise_counts)
#   cb  - column names, code -> name lookups and the missing-value placeholder
import read_write as rw
import visualisations as vis
import codebook as cb

# set to True to print details of the processing
# (only the "Read ... / Pre-processed ... / Saved ..." summaries are gated by
# this flag; the overall statistics are printed unconditionally)
verbose = True

## Explore the `Outputs` table

In [7]:
# Load the raw `Outputs` sheet, pre-process it, report a few headline
# numbers and export the subset of records typed as software.
sheet_name = "Outputs"
infname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"

# read + pre-process in one go
# ----------------------------
# dtype={0: str} asks pandas to keep the first column as text; every missing
# value is then replaced with the codebook placeholder.
dset = pd.read_csv(
    infname,
    index_col=None,
    dtype={0: str},
).fillna(cb.VALUE_ADDED_NOT_SPECIFIED)

# assign names where we only have codes
# -------------------------------------
# Each entry is (new name column, existing code column, code -> name lookup).
# NOTE(review): if the lookups are plain dicts, codes without an entry come
# out of .map() as NaN -- confirm that is acceptable.
for name_col, code_col, lookup in (
    (cb.COL_PANEL_NAME, cb.COL_PANEL_CODE, cb.PANEL_NAMES),
    (cb.COL_OUTPUT_TYPE_NAME, cb.COL_OUTPUT_TYPE_CODE, cb.OUTPUT_TYPE_NAMES),
):
    dset[name_col] = dset[code_col].map(lookup)

# save pre-processed data
# -----------------------
# to_csv is called with its defaults, so the row index is written as well.
outfname = f"{rw.PROCESSSED_EXTRACTED_PATH}{sheet_name}_pprocessed.csv"
dset.to_csv(outfname)

if verbose:
    for message in (
        f"Read {infname}",
        "Pre-processed dataset to ",
        f"- replace missing values with '{cb.VALUE_ADDED_NOT_SPECIFIED}'",
        "- add names to panels and output types",
        f"Saved pre-processed dataset to {outfname}",
    ):
        print(message)

# print overall statistics
# ------------------------
# record count first, then the number of distinct values per column
print()
print(f"Records      : {len(dset)}")
for label, column in (
    ("Institutions : ", cb.COL_INST_NAME),
    ("Output types : ", cb.COL_OUTPUT_TYPE_NAME),
    ("Panels       : ", cb.COL_PANEL_NAME),
    ("UOA          : ", cb.COL_UOA_NAME),
):
    print(f"{label}{dset[column].nunique()}")

# per-output-type record counts (table only; no plot is requested)
vis.calculate_and_visualise_counts(
    dset,
    cb.COL_OUTPUT_TYPE_NAME,
    do_print=True,
    do_plot=None,
)

# select and save software outputs
target_output_type = "Software"
fname_suffix = target_output_type.lower()
outfname = f"{rw.PROCESSSED_SUBSETS_PATH}{sheet_name}_{fname_suffix}.csv"
is_target_type = dset[cb.COL_OUTPUT_TYPE_NAME] == target_output_type
dset[is_target_type].to_csv(outfname)

if verbose:
    print()
    print(f"Saved '{fname_suffix}' subset to {outfname}")

Read ../data/raw/extracted/Outputs.csv
Pre-processed dataset to 
- add names to panels and output types
- replace missing values with 'Not specified - PP ADDED'
Saved pre-processed dataset to ../data/processed/extracted/Outputs_pprocessed.csv

Records      : 185354
Institutions : 158
Output types : 22
Panels       : 5
UOA          : 35

                                      Records  Records (%)
Output type name                                          
Journal article                        154826    83.529894
Authored book                           11801     6.366736
Chapter in book                          9475     5.111840
Conference contribution                  2272     1.225763
Edited book                              2135     1.151850
Other                                    1146     0.618276
Exhibition                                751     0.405171
Research report for external body         431     0.232528
Composition                               430     0.231989
Working pape

## Explore the `ImpactCaseStudies` table

In [21]:
# Load the raw `ImpactCaseStudies` sheet, pre-process it and report a few
# headline numbers.

# Development aid: re-import the codebook so edits to it are picked up
# without restarting the kernel. Harmless on a fresh kernel.
importlib.reload(cb)

sheet_name = "ImpactCaseStudies"

infname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"
# dtype={0: str} asks pandas to keep the first column as text
dset = pd.read_csv(infname, index_col=None, dtype={0: str})

# pre-processing
# --------------
dset = dset.fillna(cb.VALUE_ADDED_NOT_SPECIFIED)
# shift columns from title to the left:
# the column currently labelled as the title is dropped and the remaining
# columns are relabelled with the original header list minus its last entry,
# so every column that followed the title takes the header that used to sit
# one position to its left.
# NOTE(review): presumably the raw export has its headers misaligned from the
# title column onwards -- confirm against the raw file.
columns = dset.columns.tolist()
dset = dset.drop(cb.COL_IMPACT_TITLE, axis=1)
dset.columns = columns[:-1]

# assign names where we only have codes
# -------------------------------------
dset[cb.COL_PANEL_NAME] = dset[cb.COL_PANEL_CODE].map(cb.PANEL_NAMES)

# save pre-processed data
# -----------------------
# to_csv is called with its defaults, so the row index is written as well.
outfname = f"{rw.PROCESSSED_EXTRACTED_PATH}{sheet_name}_pprocessed.csv"
dset.to_csv(outfname)

if verbose:
    print(f"Read {infname}")
    print("Pre-processed dataset to ")
    print(f"- replace missing values with '{cb.VALUE_ADDED_NOT_SPECIFIED}'")
    print("- shift columns from title to the left")
    # only panel names are added in this cell (there is no output-type
    # mapping for this table), so the summary says exactly that
    print("- add names to panels")
    print(f"Saved pre-processed dataset to {outfname}")

# print overall statistics
# ------------------------
print()
print(f"Records      : {dset.shape[0]}")
print(f"Institutions : {dset[cb.COL_INST_NAME].nunique()}")
print(f"Panels       : {dset[cb.COL_PANEL_NAME].nunique()}")
print(f"UOA          : {dset[cb.COL_UOA_NAME].nunique()}")

Read ../data/raw/extracted/ImpactCaseStudies.csv
Pre-processed dataset to 
- replace missing values with 'Not specified - PP ADDED'
- shift columns from title to the left
- add names to panels and output types
Saved pre-processed dataset to ../data/processed/extracted/ImpactCaseStudies_pprocessed.csv

Records      : 6362
Institutions : 156
Panels       : 4
UOA          : 35


## Explore the `ResearchGroups` table

In [8]:
# Load the raw `ResearchGroups` sheet, pre-process it and report a few
# headline numbers.
sheet_name = "ResearchGroups"

infname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"

# read + pre-process in one go
# ----------------------------
# dtype={0: str} asks pandas to keep the first column as text; every missing
# value is then replaced with the codebook placeholder.
dset = pd.read_csv(
    infname,
    index_col=None,
    dtype={0: str},
).fillna(cb.VALUE_ADDED_NOT_SPECIFIED)

# assign names where we only have codes
# -------------------------------------
panel_codes = dset[cb.COL_PANEL_CODE]
dset[cb.COL_PANEL_NAME] = panel_codes.map(cb.PANEL_NAMES)

# save pre-processed data
# -----------------------
# to_csv is called with its defaults, so the row index is written as well.
outfname = f"{rw.PROCESSSED_EXTRACTED_PATH}{sheet_name}_pprocessed.csv"
dset.to_csv(outfname)

if verbose:
    for message in (
        f"Read {infname}",
        "Pre-processed dataset to ",
        f"- replace missing values with '{cb.VALUE_ADDED_NOT_SPECIFIED}'",
        "- add names to panels",
        f"Saved pre-processed dataset to {outfname}",
    ):
        print(message)

# some info about the data:
# record count first, then the number of distinct values per column
print()
print(f"Records      : {len(dset)}")
for label, column in (
    ("Institutions : ", cb.COL_INST_NAME),
    ("RGs          : ", cb.COL_RG_NAME),
    ("RG types     : ", cb.COL_RG_CODE),
    ("Panels       : ", cb.COL_PANEL_NAME),
    ("UOA          : ", cb.COL_UOA_NAME),
):
    print(f"{label}{dset[column].nunique()}")


Read ../data/raw/extracted/ResearchGroups.csv
Pre-processed dataset to 
- add names to panels
- replace missing values with 'Not specified - PP ADDED'
Saved pre-processed dataset to ../data/processed/extracted/ResearchGroups_pprocessed.csv

Records      : 2036
Institutions : 83
RGs          : 1788
RG types     : 32
Panels       : 4
UOA          : 34
