### Import packages and load data:

In [None]:
import pandas as pd
import re
from itertools import accumulate
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.io as pio
pio.renderers.default = 'iframe'

import plotly.express as px

%matplotlib inline


In [None]:

# Load one outcomes CSV per allocation round (2022-2026) and tag every row
# with the year of the file it came from.
dfs = [
    pd.read_csv(f"NCMAS {yr} outcomes.csv").assign(year=yr)
    for yr in range(2022, 2027)
]

raw_data = pd.concat(dfs, ignore_index=True)

# Short project ID: the text after the last '-' in 'Project ID'.
raw_data['shortID'] = raw_data['Project ID'].map(lambda x: x.split('-')[-1])

# Short institution names, paired positionally with the institutions in
# order of first appearance in the concatenated data.
# NOTE(review): because the pairing is positional, a change in CSV row order
# or a new institution silently shifts the names -- re-check this list
# against pd.unique(raw_data['Institution']) whenever the data changes.
# The lookup table is built once here instead of once per row.
inst_short_names = dict(zip(
    pd.unique(raw_data['Institution']),
    ['Curtin', 'USyd', 'UNSW', 'ANU', 'UWA', 'JCU', 'UQ', 'QUT',
     'UniSA', 'UMelb', 'Deakin', 'Newc', 'Monash', 'UTas', 'UWol',
     'GU', 'UAde', 'UNSW-C', 'Swin', 'Macq', 'RMIT', 'Garvan',
     'Flind', 'AusGov', 'Antarc', 'CSIRO', 'UTS', 'UNE', 'LaTrobe',
     'CCI', 'UniSQ', 'ECU', 'UWS', 'VicU', 'PMCC', 'AusU']))
# Indexing (rather than Series.map(dict)) keeps the KeyError for an
# institution that has no short name.
raw_data['shortInst'] = raw_data['Institution'].map(lambda x: inst_short_names[x])

# Name of the allocation column, reused throughout the notebook.
totall = "Total Allocation kSU"

# Treemap label: "<shortID>:<allocation / 1000>k".
raw_data['ID_kSU_label'] = raw_data.apply(lambda row: row['shortID']+f":{str(row[totall]/1000)}k", axis=1)

# One row per Lead CI, one column per year; NaN where the CI has no
# allocation that year.
# NOTE: pd.pivot raises ValueError if a Lead CI has more than one row in
# the same year.
ci_by_year = pd.pivot(raw_data, index='Lead CI', columns='year', values=totall)

def year_offset_isnan(row, offset=1):
    """Report whether the row's Lead CI lacks an allocation in a nearby year.

    Looks up ``ci_by_year`` (Lead CI x year table) at the row's 'Lead CI'
    and at ``row['year'] + offset``.

    Parameters:
        row: mapping-like record providing 'Lead CI' and 'year'.
        offset: number of years to shift from the row's year (default 1).

    Returns:
        True when the looked-up value is NaN, or when the lookup raises
        KeyError (e.g. the shifted year is not a column of the table);
        otherwise the falsy result of ``np.isnan``.
    """
    try:
        lead_ci = row['Lead CI']
        target_year = row['year'] + offset
        allocation = ci_by_year.loc[lead_ci][target_year]
    except KeyError:
        # No such CI or no such year in the table: treat as "no allocation".
        return True
    return np.isnan(allocation)

# Flag each row by whether its Lead CI has no allocation in the previous
# year ("new") or in the following year ("dropped").  Rows in the first /
# last loaded year are always flagged, because the neighbouring year is
# not a column of ci_by_year.
raw_data['isNewCI'] = raw_data.apply(year_offset_isnan, axis=1, offset=-1)
raw_data['isDroppedCI'] = raw_data.apply(year_offset_isnan, axis=1, offset=1)

# Per-year summary statistics, shared by the all-institution and Go8 tables.
yearly_stats = {
    'proj_count': (totall, "count"),
    'alloc_sum_kSU': (totall, "sum"),
    'alloc_mean_kSU': (totall, "mean"),
    'alloc_median_kSU': (totall, "median"),
    'alloc_sd_kSU': (totall, "std"),
    'alloc_min_kSU': (totall, "min"),
    'alloc_max_kSU': (totall, "max"),
    'new_CIs': ('isNewCI', "sum"),
    'dropped_CIs': ('isDroppedCI', "sum"),
}

alloc_by_year = raw_data.groupby('year').agg(**yearly_stats)

# Institution names (as they appear in the 'Institution' column) that make
# up the Go8 subset.
Go8 = [
    'Australian National University',
    'University of Melbourne',
    'University of Sydney',
    'University of NSW',
    'University of Queensland',
    'Monash University',
    'University of Western Australia',
    'University of Adelaide',
]

# Same statistics, restricted to rows from Go8 institutions.
is_go8 = raw_data['Institution'].isin(Go8)
alloc_Go8_by_year = raw_data.loc[is_go8].groupby('year').agg(**yearly_stats)

def for_seo_list(text, return_seo=False):
    """Split a classification string into FoR (and optionally SEO) entries.

    Every 6-digit number in ``text`` is taken as a code and every
    ``<1-3 digits>%`` as a percentage.  Entries up to and including the
    first point where the running percentage total reaches exactly 100%
    are the FoR entries; everything after that is the SEO entries.  If the
    running total never equals 100%, all entries are returned as FoR and
    the SEO lists are empty.

    Codes and percentages are paired by position -- this assumes each code
    has exactly one percentage, listed in the same order (TODO confirm
    against the CSV format).

    Parameters:
        text: value to parse; converted with ``str()`` first, so non-string
            input such as NaN simply yields empty lists.
        return_seo: when True, also return the SEO codes and fractions.

    Returns:
        ``(forcodes, forpercs)`` or, with ``return_seo=True``,
        ``(forcodes, forpercs, seocodes, seopercs)``.  Codes are strings,
        percentages are fractions (e.g. 0.6 for "60%").
    """
    text = str(text)
    codes = re.findall(r'\d{6}', text)
    # Keep whole-number percentages for the running total so that the
    # "reached 100%" test is exact.  Summing the fractions instead misses
    # the boundary for inputs such as 70% + 20% + 10%, because
    # 0.7 + 0.2 + 0.1 != 1.0 in floating point.
    whole_percs = [int(digits) for digits in re.findall(r'(\d{1,3})%', text)]
    percs = [p / 100 for p in whole_percs]

    # Number of leading entries that belong to FoR, or None if the running
    # total never hits exactly 100.
    split = next(
        (pos + 1
         for pos, total in enumerate(accumulate(whole_percs))
         if total == 100),
        None,
    )

    if split is None:
        forcodes, forpercs = codes, percs
        seocodes, seopercs = [], []
    else:
        forcodes, forpercs = codes[:split], percs[:split]
        seocodes, seopercs = codes[split:], percs[split:]

    if return_seo:
        return forcodes, forpercs, seocodes, seopercs
    return list(forcodes), list(forpercs)

# Parse every 'FoR-08' entry into a (codes, fractions) pair and store the
# two lists as per-row list columns.
for_pairs = raw_data['FoR-08'].map(for_seo_list)
raw_data['FoR-codes'] = [codes for codes, _ in for_pairs]
raw_data['FoR-%'] = [fractions for _, fractions in for_pairs]

# One row per (project, FoR code): explode both list columns in lockstep.
for_data = raw_data.explode(['FoR-codes', 'FoR-%'])

# Leading 1 and 2 characters of each code.
for prefix_len, column in ((1, 'FoR-X'), (2, 'FoR-XX')):
    for_data[column] = for_data['FoR-codes'].map(
        lambda code, n=prefix_len: str(code)[:n])

# Share of the project's total allocation attributed to this FoR code.
for_data['FoR-kSU'] = for_data[totall].mul(for_data['FoR-%'])

### Resulting DataFrames:

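- `raw_data`: All `outcomes.csv` entries with an additional "year" column, plus the derived columns `shortID`, `shortInst`, `ID_kSU_label`, `isNewCI`, `isDroppedCI`, `FoR-codes` and `FoR-%`.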

- `alloc_by_year`: Allocation statistics aggregated by year.

- `alloc_Go8_by_year`: Allocation statistics aggregated by year for only Go8 institutions.

- `for_data`: As per `raw_data` but each FoR code is listed with an "FoR-kSU" column breaking up the Total Allocation kSU by FoR%.

For example, a `raw_data` allocation of 1,000 kSU with FoR codes 60% 123456 and 40% 654321 will have two `for_data` rows:

```
... Lead CI           Total Allocation kSU FoR-codes FoR-kSU
... Ash Fairenough    1000                 123456    600
... Ash Fairenough    1000                 654321    400
```

### Example viz: Histogram of Allocation Frequency by Size for different years, and Allocation Statistics by Year and Institution (Go8 / not)

In [None]:
# Allocation-size frequency curves, one line per year, drawn through the
# centres of 32 equal-width histogram bins spanning -5000..55000.
fig, ax = plt.subplots()

for year in range(2023, 2027):
    allocations = raw_data.loc[raw_data['year'] == year, 'Total Allocation kSU']
    counts, edges = np.histogram(allocations, bins=32, range=(-5000, 55000))
    centres = (edges[:-1] + edges[1:]) / 2
    # x axis is shown in thousands of the column's unit.
    ax.plot(centres / 1000, counts, label=year)

ax.legend()
ax.set_xlabel('Alloc size (,000 kSU)')
ax.set_ylabel('Alloc frequency')
ax.set_title('NCMAS allocation frequency by size, 2023-2026')
plt.show()

In [None]:
# 3x2 grid of yearly statistics, all institutions vs Go8.
fig, axs = plt.subplots(3, 2, figsize=(6, 10))
ax = axs.flatten()

# (column, scale factor, y-axis upper limit, y-axis label) for each panel.
# The project count is plotted as-is; allocations are scaled by 0.001.
panels = [
    ('proj_count', 1, 170, 'No. projects'),
    ('alloc_sum_kSU', 0.001, 950, 'Total alloc. (,000 kSU)'),
    ('alloc_mean_kSU', 0.001, 11, 'Mean alloc. (,000 kSU)'),
    ('alloc_median_kSU', 0.001, 10, 'Median alloc. (,000 kSU)'),
    ('alloc_min_kSU', 0.001, 5, 'Min alloc. (,000 kSU)'),
    ('alloc_max_kSU', 0.001, 60, 'Max alloc. (,000 kSU)'),
]

for panel, (col, mult, ymax, ylabel) in zip(ax, panels):
    panel.plot(alloc_by_year.index, mult*alloc_by_year[col], 'o-', label="All inst.")
    panel.plot(alloc_Go8_by_year.index, mult*alloc_Go8_by_year[col], 'o-', label="Go8")
    panel.set_ylim(0, ymax)
    panel.set_ylabel(ylabel)
    panel.set_xlabel('Year')
    panel.legend()

fig.suptitle('NCMAS Allocation Statistics by Year and Institutions')
fig.tight_layout()

In [None]:
# Left panel: new Lead CIs per year; right panel: dropped CIs per year.
fig, ax = plt.subplots(1, 2)
new_ax, dropped_ax = ax

for label, df in (("All inst.", alloc_by_year), ("Go8", alloc_Go8_by_year)):
    # Skip the first year for "new" and the last year for "dropped".
    new_ax.plot(df.index[1:], df['new_CIs'].iloc[1:], 'o-', label=label)
    dropped_ax.plot(df.index[:-1], df['dropped_CIs'].iloc[:-1], 'o-', label=label)

for a, ymax, ylabel in zip(ax, (60, 90), ("New Lead CIs", "Dropped CIs")):
    a.set_ylim(0, ymax)
    a.set_xlabel("Year")
    a.set_ylabel(ylabel)
    a.legend()

fig.suptitle("New Lead CIs and 'Dropped' CIs (no alloc in following year) by Year")
fig.tight_layout()
plt.show()

### Plotly Treemap Comparing 2026 and 2025 Allocations

Box labels are "short IDs" (2-3 digit project IDs each year) followed by the allocation in thousands of kSU (the `Total Allocation kSU` value divided by 1000, with a "k" suffix).

In [None]:
# Treemap of individual allocations, grouped by year (2025 rows first, then
# 2026); box area and colour both encode the allocation size.
treemap_years = (2025, 2026)
df = pd.concat([raw_data.loc[raw_data["year"] == yr] for yr in treemap_years])

fig = px.treemap(
    df,
    path=[px.Constant("all"), 'year', 'ID_kSU_label'],
    values=totall,
    color=totall,
)
fig.update_layout(margin={'t': 30, 'l': 25, 'r': 25, 'b': 25})
fig.update_traces(marker={'cornerradius': 2})
fig.show()

In [None]:
# Cumulative allocation per year, largest allocations first.
plot_years = range(2023, 2027)

# Allocations per year, scaled by 0.001.
totall_lists = [
    (raw_data.loc[raw_data["year"] == yr, totall] * .001).to_numpy()
    for yr in plot_years
]
# Sort descending, then take the running total.
t_acc = [np.cumsum(np.sort(t)[::-1]) for t in totall_lists]

fig, ax = plt.subplots()
for yr, t in zip(plot_years, t_acc):
    # Only the 200 largest allocations are drawn; a marker every 10 points.
    ax.plot(t[:200], "o-", label=f"{yr}", fillstyle="none", markevery=10)
ax.legend()
ax.set_ylabel("cumulative million SU")
ax.set_xlabel("Allocation no.")

plt.show()