We have classified a subset of 80 software mentions from a random sample of 100 software mention candidates from the CORD-19 dataset by mention type.
The mention types are those used in Howison & Bullard 2015 (doi:10.1002/asi.23538).
We did not use the following annotations:
- *Cite to users manual*
The mention types we found are listed in a table extracted from the annotated dataset used in the access study.

In [None]:
import pandas as pd

# Read the annotated access-study sample; missing cells are replaced by 0
df = pd.read_csv(r'../data/access_study/CSM_sampled_mention_access.csv', encoding='unicode_escape', engine='python', index_col=False).fillna(0)
# Get the raw annotations for mention type (one cell per mention, possibly
# holding several comma-separated types)
# NOTE(review): because of .fillna(0) a missing 'Mention Type' would be the
# int 0 and .split() below would fail - assumes the column has no gaps; confirm.
raw_types = df['Mention Type']
mentions_total = len(raw_types)
print(mentions_total)
# Build a flat list of single types, i.e., split and strip comma-separated values
types = []
for separated in raw_types:
    types.extend(part.strip() for part in separated.split(','))
# Create a dataframe for just the single types, sorted alphabetically
type_df = pd.DataFrame(data=types, columns=['Type']).sort_values(by='Type')
# Count the occurrences of every single type
type_counts = type_df['Type'].value_counts()
# Build the counts table with explicit column names. Relying on the column
# name produced by wrapping value_counts() in a DataFrame is fragile: pandas
# >= 2.0 names it 'count' (and names the index 'Type'), so renaming a 'Type'
# column would silently do nothing there.
counts_df = pd.DataFrame(
    {
        'Type': type_counts.index.to_numpy(),
        'Mentions in our dataset sample': type_counts.to_numpy(),
    },
    # Keep the types as an unnamed index so it cannot clash with the 'Type' column
    index=type_counts.index.rename(None),
)
# The values from the Howison & Bullard paper are added to this frame further below

counts_df

Add the data from Howison & Bullard 2015, Table 1.

In [None]:
# Mention counts per type code, as entered for the Howison & Bullard data
hb_mentions = dict(
    PUB=105,
    MAN=6,  # Citing user manual
    PRO=15,
    INS=53,
    URL=13,
    NAM=90,
    NEN=4,  # Not even name mentioned
)

# We have no evidence for NEN, and no occurrences of MAN, in our sample,
# so remove both categories from the H&B data
for unused_code in ('MAN', 'NEN'):
    hb_mentions.pop(unused_code)

hb_mentions

Now add the H&B data to the existing dataframe, and then normalize both datasets to percentages.

In [None]:
# Report the size of our own sample ...
print(f'No. of mentions in our sample: {mentions_total}')

# ... and the number of mentions across the retained H&B categories
hb_mentions_total = sum(hb_mentions.values())
print(f'No. of mentions in Howison & Bullard 2005 data: {hb_mentions_total}')

# Look up the H&B count for every type; types without an entry in
# hb_mentions end up as NaN in the new column
hb_column = counts_df['Type'].map(hb_mentions)
counts_df['Howison & Bullard 2005'] = hb_column
counts_df

In [None]:
# Express each type's count as a share (in %, one decimal) of its dataset's
# column total, and place each percentage column right after its count column
our_counts = counts_df['Mentions in our dataset sample']
hb_counts = counts_df['Howison & Bullard 2005']

our_share = (our_counts / our_counts.sum() * 100).round(decimals=1)
counts_df.insert(2, '% of our dataset', our_share)

hb_share = (hb_counts / hb_counts.sum() * 100).round(decimals=1)
counts_df.insert(4, '% of Howison & Bullard dataset', hb_share)

counts_df

In [None]:
# Flip the table so that the mention types become columns
df_transposed = counts_df.T

In [None]:
# Emit the transposed table as LaTeX source, then display the table itself
latex_table = df_transposed.to_latex()
print(latex_table)
df_transposed

In [None]:
# Render the percentage data as a stacked bar chart (one bar per dataset,
# one stacked segment per mention type)
import matplotlib.pyplot as plt

# Show the full transposed table for reference
print(counts_df.transpose())

# Rows to render: only the two percentage series
plot_cols = ['% of our dataset', '% of Howison & Bullard dataset']

# Select the percentage columns *before* transposing. Transposing the whole
# table (including the string 'Type' column) and dropping rows afterwards
# leaves an object-dtype frame; this way the plotted data stays numeric.
# The rounded frame is assigned back, since round() does not work in place.
types_df = counts_df[plot_cols].transpose().round(1)
print(types_df)

# Colourblind/-friendly colours adapted from https://gist.github.com/thriveth/8560036
my_colors = ['#4daf4a', '#f781bf', '#e41a1c', '#984ea3', '#999999', '#a65628', '#dede00']

# Create the plot, with the legend placed above the axes
ax = types_df.plot(kind='bar', stacked=True, figsize=(5,8), color=my_colors)
ax.legend(ncol=5, bbox_to_anchor=(0, 1),
              loc='lower left', fontsize='small')

# Rotate the x-axis labels by 45 degrees (the title is currently disabled)
# plt.title('Comparison of mention types')
plt.xticks(rotation=45, ha='center')

# Write each segment's value into the middle of the segment
for c in ax.containers:
    ax.bar_label(c, label_type='center')