## Preliminary study of the REF 2021 submissions data

In [26]:
import sys
import pandas as pd
import importlib

# set the path to the source directory
sys.path.append('../src/')
import read_write as rw
import visualisations as vis
import codebook as cb
import preprocess as pp

# Verbosity switch: when True, the data-loading cells report the file they
# read, the pre-processing steps they applied and the file(s) they saved.
# Set to False to keep only the summary statistics.
verbose = True

## Explore the `Outputs` table

In [7]:
# Load the raw `Outputs` sheet, pre-process it, save it, summarise it and
# extract the subset of software outputs.
sheet_name = "Outputs"
infname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"

# read + pre-process in a single chain
# ------------------------------------
dset = (
    pd.read_csv(infname, index_col=None, dtype={0: str})
    # make missing values explicit
    .fillna(cb.VALUE_ADDED_NOT_SPECIFIED)
    # derive the name columns from the corresponding code columns
    .assign(**{
        cb.COL_PANEL_NAME:
            lambda df: df[cb.COL_PANEL_CODE].map(cb.PANEL_NAMES),
        cb.COL_OUTPUT_TYPE_NAME:
            lambda df: df[cb.COL_OUTPUT_TYPE_CODE].map(cb.OUTPUT_TYPE_NAMES),
    })
)

# persist the pre-processed table
# -------------------------------
outfname = f"{rw.PROCESSSED_EXTRACTED_PATH}{sheet_name}_pprocessed.csv"
dset.to_csv(outfname)

if verbose:
    print(
        f"Read {infname}",
        f"Pre-processed dataset to ",
        f"- replace missing values with '{cb.VALUE_ADDED_NOT_SPECIFIED}'",
        f"- add names to panels and output types",
        f"Saved pre-processed dataset to {outfname}",
        sep="\n",
    )

# headline numbers: record count, then distinct values per column
# ---------------------------------------------------------------
print()
print(f"{'Records':<12} : {dset.shape[0]}")
for label, column in (
    ("Institutions", cb.COL_INST_NAME),
    ("Output types", cb.COL_OUTPUT_TYPE_NAME),
    ("Panels", cb.COL_PANEL_NAME),
    ("UOA", cb.COL_UOA_NAME),
):
    print(f"{label:<12} : {dset[column].nunique()}")

# record counts per output type (printed, plotting not requested)
vis.calculate_and_visualise_counts(
    dset, cb.COL_OUTPUT_TYPE_NAME, do_print=True, do_plot=None
)

# extract the software outputs and store them as their own subset file
# --------------------------------------------------------------------
target_output_type = "Software"
fname_suffix = target_output_type.lower()
outfname = f"{rw.PROCESSSED_SUBSETS_PATH}{sheet_name}_{fname_suffix}.csv"
is_target = dset[cb.COL_OUTPUT_TYPE_NAME] == target_output_type
dset.loc[is_target].to_csv(outfname)

if verbose:
    print("", f"Saved '{fname_suffix}' subset to {outfname}", sep="\n")

Read ../data/raw/extracted/Outputs.csv
Pre-processed dataset to 
- add names to panels and output types
- replace missing values with 'Not specified - PP ADDED'
Saved pre-processed dataset to ../data/processed/extracted/Outputs_pprocessed.csv

Records      : 185354
Institutions : 158
Output types : 22
Panels       : 5
UOA          : 35

                                      Records  Records (%)
Output type name                                          
Journal article                        154826    83.529894
Authored book                           11801     6.366736
Chapter in book                          9475     5.111840
Conference contribution                  2272     1.225763
Edited book                              2135     1.151850
Other                                    1146     0.618276
Exhibition                                751     0.405171
Research report for external body         431     0.232528
Composition                               430     0.231989
Working pape

## Explore the `ImpactCaseStudies` table

In [27]:
# Load the raw `ImpactCaseStudies` sheet, repair and pre-process it, save the
# result and print summary statistics.

# re-import the project modules so that edits made to them after the first
# import are picked up without restarting the kernel
importlib.reload(cb)
importlib.reload(pp)

sheet_name = "ImpactCaseStudies"

infname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"
dset = pd.read_csv(infname, index_col=None, dtype={0: str})

# pre-processing
# --------------
dset = dset.fillna(cb.VALUE_ADDED_NOT_SPECIFIED)
# shift columns from title to the left: drop the column labelled
# cb.COL_IMPACT_TITLE, then relabel what remains with the original header
# minus its last entry, so that every column to the right of the dropped one
# takes over the name of its left neighbour
columns = dset.columns.tolist()
dset = dset.drop(cb.COL_IMPACT_TITLE, axis=1)
dset.columns = columns[:-1]
# clean the markdown in each of the narrative text columns
# (pp.clean_markdown returns the updated dataset; presumably it strips the
# markdown markup - see the `preprocess` module for the exact behaviour)
for column in [cb.COL_IMPACT_SUMMARY, 
               cb.COL_IMPACT_UNDERPIN_RESEARCH,
               cb.COL_IMPACT_REFERENCES_RESEARCH,
               cb.COL_IMPACT_DETAILS,
               cb.COL_IMPACT_CORROBORATE
               ]:
    dset = pp.clean_markdown(dset, column)

# assign names where we only have codes
# -------------------------------------
dset[cb.COL_PANEL_NAME] = dset[cb.COL_PANEL_CODE].map(cb.PANEL_NAMES)

# save pre-processed data
# -----------------------
outfname = f"{rw.PROCESSSED_EXTRACTED_PATH}{sheet_name}_pprocessed.csv"
dset.to_csv(outfname)

# report the steps actually applied in this cell, in the order applied
# (this table has no output types, so only panel names are added)
if verbose:
    print(f"Read {infname}")
    print(f"Pre-processed dataset to ")
    print(f"- replace missing values with '{cb.VALUE_ADDED_NOT_SPECIFIED}'")
    print(f"- shift columns from title to the left to fix data issue")
    print(f"- clean markdown in the text columns")
    print(f"- add names to panels")
    print(f"Saved pre-processed dataset to {outfname}")

# print overall statistics
# ------------------------
print()
print(f"Records      : {dset.shape[0]}")
print(f"Institutions : {dset[cb.COL_INST_NAME].nunique()}")
print(f"Panels       : {dset[cb.COL_PANEL_NAME].nunique()}")
print(f"UOA          : {dset[cb.COL_UOA_NAME].nunique()}")

# preview the first rows of the pre-processed table
display(dset.head())

Read ../data/raw/extracted/ImpactCaseStudies.csv
Pre-processed dataset to 
- replace missing values with 'Not specified - PP ADDED'
- shift columns from title to the left to fix data issue
- add names to panels and output types
Saved pre-processed dataset to ../data/processed/extracted/ImpactCaseStudies_pprocessed.csv

Records      : 6362
Institutions : 156
Panels       : 4
UOA          : 35


Unnamed: 0,Institution UKPRN code,Institution name,Main panel,Unit of assessment number,Unit of assessment name,Multiple submission letter,Multiple submission name,Joint submission,Title,Is continued from 2014,...,Global research identifiers,Name of funders,Researcher ORCIDs,Grant funding,1. Summary of the impact,2. Underpinning research,3. References to the research,4. Details of the impact,5. Sources to corroborate the impact,Main panel name
0,10007764,Heriot-Watt University,D,25.0,Area Studies,Not specified - PP ADDED,Not specified - PP ADDED,Not specified - PP ADDED,"""Community Placemaking"" through Heritage Work",0.0,...,Not specified - PP ADDED,[European Commission];[Scottish Graduate Schoo...,[0000-0003-1727-000X];[0000-0001-7094-8268];[0...,[693289: 160738];[not found: 28013];[not found...,Connections between people and place ca...,### 2. Underpinning research \n\n Research fr...,### 3. References to the research \n\n \[3.1\...,### 4. Details of the impact \n\n IRC researc...,### 5. Sources to corroborate the impact \n\n...,Arts and humanities
1,10007789,The University of East Anglia,A,5.0,Biological Sciences,Not specified - PP ADDED,Not specified - PP ADDED,Not specified - PP ADDED,"""Superdosing"" with phytase: improving animal f...",0.0,...,[grid.418100.c],[BBSRC],[0000-0001-6179-9109],[BB/N002024/1: 188506];[BB/M022978/1: 459551],Phytate in animal feed impairs the growth...,"2. Underpinning research \n\n \nPhytate, a com...",3. References to the research \n\n <ins> *Unde...,4. Details of the impact \n\n **Brearley**'s r...,5. Sources to corroborate the impact \n\n1. L...,"Medicine, health and life sciences"
2,10006842,The University of Liverpool,D,34.0,"Communication, Cultural and Media Studies, Lib...",Not specified - PP ADDED,Not specified - PP ADDED,Not specified - PP ADDED,#Speakout: tackling online harassment in stude...,0.0,...,[grid.496775.e],[HIGHER EDUCATION FUND COUNCIL FOR ENGLAND (UK)],[0000-0003-1195-8882],[N/A: 34597],Our research and evidence\-based interv...,# 2. Underpinning research \n\n The \#Speakou...,# 3. References to the research \n\n **Southe...,# 4. Details of the impact \n\n Our research ...,# 5. Sources to corroborate the impact \n\n# ...,Arts and humanities
3,10007855,Swansea University / Prifysgol Abertawe,D,26.0,Modern Languages and Linguistics,Not specified - PP ADDED,Not specified - PP ADDED,Not specified - PP ADDED,(Re)Discovering Europeans’ Visions of Wales: E...,0.0,...,[426413.6],[Arts and Humanities Research Council],[0000-0002-3077-6673],[AH/K001817/1: 419686],Previous perceptions of Wales as ‘unkno...,### 2. Underpinning research \n\n **European ...,### 3. References to the research \n\n The un...,### 4. Details of the impact \n\n **Enhancing...,### 5. Sources to corroborate the impact \n\n...,Arts and humanities
4,10003645,King's College London,D,28.0,History,Not specified - PP ADDED,Not specified - PP ADDED,Not specified - PP ADDED,(Towards) Informed Intelligence: Embedding Kno...,0.0,...,[grid.426413.6];[grid.450921.b];[grid.434257.3],[AHRC];[British Academy];[ESRC],[0000-0002-6093-7343];[0000-0002-6093-7343];[0...,[n/a: 2000];[AH/M504208/1: 60000];[SG112525: 9...,The last 15 years have witnessed signif...,### 2. Underpinning research \n\n Goodman’s r...,### 3. References to the research \n\n 1. **G...,### 4. Details of the impact \n\n Goodman’s r...,### 5. Sources to corroborate the impact \n\n...,Arts and humanities


## Explore the `ResearchGroups` table

In [8]:
# Load the raw `ResearchGroups` sheet, pre-process it, save it and print
# summary statistics.
sheet_name = "ResearchGroups"
infname = f"{rw.RAW_EXTRACTED_PATH}{sheet_name}.csv"

# read + pre-process in a single chain
# ------------------------------------
dset = (
    pd.read_csv(infname, index_col=None, dtype={0: str})
    # make missing values explicit
    .fillna(cb.VALUE_ADDED_NOT_SPECIFIED)
    # derive the panel name column from the panel code column
    .assign(**{
        cb.COL_PANEL_NAME:
            lambda df: df[cb.COL_PANEL_CODE].map(cb.PANEL_NAMES),
    })
)

# persist the pre-processed table
# -------------------------------
outfname = f"{rw.PROCESSSED_EXTRACTED_PATH}{sheet_name}_pprocessed.csv"
dset.to_csv(outfname)

if verbose:
    print(
        f"Read {infname}",
        f"Pre-processed dataset to ",
        f"- replace missing values with '{cb.VALUE_ADDED_NOT_SPECIFIED}'",
        f"- add names to panels",
        f"Saved pre-processed dataset to {outfname}",
        sep="\n",
    )

# headline numbers: record count, then distinct values per column
# ---------------------------------------------------------------
print()
print(f"{'Records':<12} : {dset.shape[0]}")
for label, column in (
    ("Institutions", cb.COL_INST_NAME),
    ("RGs", cb.COL_RG_NAME),
    ("RG types", cb.COL_RG_CODE),
    ("Panels", cb.COL_PANEL_NAME),
    ("UOA", cb.COL_UOA_NAME),
):
    print(f"{label:<12} : {dset[column].nunique()}")


Read ../data/raw/extracted/ResearchGroups.csv
Pre-processed dataset to 
- add names to panels
- replace missing values with 'Not specified - PP ADDED'
Saved pre-processed dataset to ../data/processed/extracted/ResearchGroups_pprocessed.csv

Records      : 2036
Institutions : 83
RGs          : 1788
RG types     : 32
Panels       : 4
UOA          : 34
