# CORD-19 Software Mentions

A notebook that explores the CORD-19 software mentions dataset and writes out lightly processed data (a mention count per software title).

## Setup imports and files

In [None]:
import numpy as np
import pandas as pd
import csv
import ast
import collections
import matplotlib.pyplot as plt

In [None]:
# Input CSV of software mentions; read later with both pandas and csv.DictReader.
CORD19_CSVFILE = '../data/cord-19/CORD19_software_mentions.csv'
# Output CSV that receives the per-title mention counts at the end of the notebook.
POPULARITY_CSVFILE = '../data/output/CORD19_software_popularity.csv'

## Have a quick look at the data using pandas

In [None]:
# Load the whole CSV into a DataFrame and show its first rows as a sanity check.
# NOTE(review): this relies on pandas' default encoding, while the csv.DictReader
# cell further down opens the same file as Latin1 — confirm which one is correct.
df = pd.read_csv(CORD19_CSVFILE)
df.head()

## Load the data and extract the software mentions

In [None]:
# Flat list of software titles, one entry per (row, distinct title) pair.
software = []

# NOTE(review): the file is decoded as Latin1 here, whereas the pandas cell
# reads it with pandas' default encoding — confirm which one is intended.
with open(CORD19_CSVFILE, newline='', encoding='Latin1') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # The 'software' column holds a Python literal that is parsed with
        # ast.literal_eval; wrapping it in set() collapses repeats so a title
        # is counted at most once per row.
        software.extend(set(ast.literal_eval(row['software'])))

## Take a quick look at the statistics

In [None]:
# Tally how many times each software title appears in the flattened list.
occurrences = collections.Counter()
occurrences.update(software)

# Dump the full tally (title -> count) for a quick eyeball check.
print(occurrences)

In [None]:
# Keep the 20 most frequently mentioned titles as (title, count) pairs,
# ordered from most to least common.
common_software = occurrences.most_common(20)

In [None]:
# Bare expression: the notebook renders the top-20 list as this cell's output.
common_software

In [None]:
# Split the (title, count) pairs into two parallel tuples for plotting.
labels, ys = zip(*common_software)
# One integer x position per title.
xs = np.arange(len(labels))
width = 1

# Describe the figure first, then draw the bars onto the same axes.
plt.title('Frequency of software mentions in CORD-19 data')
plt.xlabel('Software Title')
plt.ylabel('Number of Mentions')
plt.bar(xs, ys, width=width, align='center')

# Label each bar with its title; vertical text keeps long names from overlapping.
plt.xticks(xs, labels, rotation='vertical')
plt.show()

## Output the lightly processed data

In [None]:
# Write one (title, count) row per software title, most-mentioned first.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default, which may be unable to encode some characters
# (for example cp1252 on Windows) and would then raise UnicodeEncodeError.
with open(POPULARITY_CSVFILE, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # most_common() with no argument returns every (title, count) pair,
    # sorted by descending count.
    writer.writerows(occurrences.most_common())
        