# CFF analysis


## Naming convention

Are all files called `CITATION.cff`?


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import display

# Base font size passed to the plots below through `textprops`.
fontsize = 18

In [None]:
# Pie chart: how many files follow the CITATION.cff naming convention.
# The counts are hard-coded in this cell.
data_name = 'Named CITATION.cff'
df = pd.DataFrame(
    {
        data_name: ['Yes', 'No'],
        'Number of files': [700, 300],
    }
)

# Show the raw table first, then turn the label column into the index so the
# labels become the pie wedge names.
display(df)
df = df.set_index(data_name)

plot = df.plot.pie(
    subplots=True,
    title=data_name,
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    textprops={'fontsize': fontsize},
)

# The file name is the title with spaces replaced by underscores.
plt.savefig(f"{data_name.replace(' ', '_')}.png")

## File validity

What is the ratio of valid to invalid CFF files?


In [None]:
# Pie chart: split of files into not parsable / valid / not valid.
# The counts are hard-coded in this cell.
data_name = 'Validity'
df = pd.DataFrame(
    {
        data_name: ['Not parsable', 'Valid', 'Not Valid'],
        'Number of files': [200, 400, 400],
    }
)

# Display the table before the category column is moved into the index.
display(df)
df = df.set_index(data_name)

plot = df.plot.pie(
    subplots=True,
    title=data_name,
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    textprops={'fontsize': fontsize},
)

# The file name is the title with spaces replaced by underscores.
plt.savefig(f"{data_name.replace(' ', '_')}.png")

## Use of `cffinit`

How many files have been created using [cffinit](https://bit.ly/cffinit) (judging by the comment in the file)?

In [None]:
# Pie chart: files created with cffinit versus files that were not.
# The counts are hard-coded in this cell.
data_name = 'cffinit used'
df = pd.DataFrame(
    {
        data_name: ['Yes', 'No'],
        'Number of files': [600, 400],
    }
)

# Display the table before the Yes/No column is moved into the index.
display(df)
df = df.set_index(data_name)

plot = df.plot.pie(
    subplots=True,
    title=data_name,
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    textprops={'fontsize': fontsize},
)

# The file name is the title with spaces replaced by underscores.
plt.savefig(f"{data_name.replace(' ', '_')}.png")

## Adherence to the [software citation principles](https://peerj.com/articles/cs-86/): What are the ratios for:
- providing `version`
- providing `repository-code` only
- providing `doi` or `identifiers/doi` and others
- providing `preferred-citation`

In [None]:
# Pie charts for adherence to the software citation principles: one pie per
# criterion, drawn side by side in a single figure.
# The counts are hard-coded in this cell.

# Smaller labels than the single-pie cells, because several pies share one row.
fontsize_1 = fontsize - 8

# This cell needs its own title / file name; without it the value left over
# from the previous cell would be reused and that cell's PNG overwritten.
data_name = 'Software citation principles'

data_name1 = 'provide version'
data_name2 = 'provide repository-code only'
data_name3 = 'provide identifiers/doi'
data_name4 = 'provide preferred-citation'

# Each criterion must use its own key: a repeated key in a dict literal
# silently keeps only the last value, which would drop one column.
df = pd.DataFrame({'criteria': ['#yes', '#No'],
                   data_name1: [600, 400],
                   data_name2: [300, 700],
                   data_name3: [500, 500],
                   data_name4: [900, 100],
                  })

# Display the table before the criteria column is moved into the index.
display(df)
df.set_index('criteria', inplace=True)

# subplots=True draws one pie per remaining column.
plot = df.plot.pie(subplots=True, title=data_name, legend=False,
                   autopct='%1.1f%%',
                   shadow=True, startangle=0,
                   textprops={'fontsize': fontsize_1}, figsize=(12, 4))

# The file name is the title with spaces replaced by underscores.
plt.savefig(data_name.replace(" ", "_") + '.png')

## `type`
Usage of `type: dataset` vs. (`type: software` || `None`)

In [None]:
# Pie chart: distribution of the `type` field (dataset / software / None).
# The counts are hard-coded in this cell.
data_name = 'type'
df = pd.DataFrame(
    {
        data_name: ['dataset', 'software', 'None'],
        'Number of files': [600, 300, 100],
    }
)

# Display the table before the type column is moved into the index.
display(df)
df = df.set_index(data_name)

plot = df.plot.pie(
    subplots=True,
    title=data_name,
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    textprops={'fontsize': fontsize},
)

# The file name is the title with spaces replaced by underscores.
plt.savefig(f"{data_name.replace(' ', '_')}.png")

## Usage of references 

- Uses references

In [None]:
# Pie chart: files that use references versus files that do not.
# The counts are hard-coded in this cell.
data_name = 'Uses references'
df = pd.DataFrame(
    {
        data_name: ['Yes', 'No'],
        'Number of files': [600, 400],
    }
)

# Display the table before the Yes/No column is moved into the index.
display(df)
df = df.set_index(data_name)

plot = df.plot.pie(
    subplots=True,
    title=data_name,
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    textprops={'fontsize': fontsize},
)

# The file name is the title with spaces replaced by underscores.
plt.savefig(f"{data_name.replace(' ', '_')}.png")

- Average number of references


In [None]:
# Histogram of the number of references per file.
data_name = 'Number of references'

# Randomly generated data: 1000 uniform draws between 1 and 10, rounded to
# whole numbers.
num_references = np.random.uniform(1, 10, 1000).round()

df = pd.DataFrame(num_references, columns=[data_name])

ax = df.plot.hist(bins=12, alpha=0.5)

- Reference type distribution

In [None]:
# Pie chart: how references are identified (URL versus doi).
# The counts are hard-coded in this cell.
data_name = 'References type'
# The column label appears in the displayed table and on the plot, so it must
# be spelled correctly ("references", not "refernces").
df = pd.DataFrame({data_name: ['URL', 'doi'],
                   'Number of references': [5000, 1000]})

# Display the table before the type column is moved into the index.
display(df)
df.set_index(data_name, inplace=True)

plot = df.plot.pie(subplots=True, title=data_name, legend=False,
                   autopct='%1.1f%%',
                   shadow=True, startangle=0, textprops={'fontsize': fontsize})

# The file name is the title with spaces replaced by underscores.
plt.savefig(data_name.replace(" ", "_") + '.png')

## Non-standard fields

Usage of non-standard fields. Such fields might reveal a gap in the information stored by the default schema and might inform future development of the CFF schema.

In [None]:
# Pie chart: files that use non-standard fields versus files that do not.
# The counts are hard-coded in this cell.
data_name = 'Uses Non-standard fields'
df = pd.DataFrame(
    {
        data_name: ['Yes', 'No'],
        'Number of files': [600, 400],
    }
)

# Display the table before the Yes/No column is moved into the index.
display(df)
df = df.set_index(data_name)

plot = df.plot.pie(
    subplots=True,
    title=data_name,
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    textprops={'fontsize': fontsize},
)

# The file name is the title with spaces replaced by underscores.
plt.savefig(f"{data_name.replace(' ', '_')}.png")