# Classification correction
Our classification algorithm isn't perfect, so we manually corrected the classifications. Here, we'll merge the corrections with the graph, and quantify the mistakes.

In [3]:
import pandas as pd
import networkx as nx
from sklearn.metrics import f1_score
from collections import defaultdict
import jsonlines

## Read in the data

In [4]:
# Load the citation network (GraphML) that the study-system labels are attached to below.
debugged_graph = nx.read_graphml(
    '../data/citation_network/core_collection_destol_or_anhydro_FINAL_CLASSIFIED_GRAPH_AUTOMATED_RESULT_mains_only_debugged_kings_no_isolates_NO_CLASSIFICATION_11Mar2024.graphml'
)

In [5]:
# Automatic classifications; the label of interest is the `study_system` column.
original = pd.read_csv(
    '../data/citation_network/DEBUGGED_automatic_classifications_core_collection_destol_or_anhydro.csv'
)
original.head(n=5)

Unnamed: 0,UID,title,abstract,study_system
0,WOS:A1990ET59600010,RESPONSE OF 4 SORGHUM LINES TO MID-SEASON DROU...,Four sorghum (Sorghum bicolor L. Moench) lines...,Plant
1,WOS:000244317000009,Effects of abscisic acid on growth and dehydra...,Cynanchum komarovii is well adapted to hot and...,Plant
2,WOS:000186691200001,LEAping to conclusions: A computational reanal...,Background: The late embryogenesis abundant (L...,Plant
3,WOS:000178654700046,Early salt stress effects on the changes in ch...,A technique based on Fourier transform infrare...,Plant
4,WOS:A1997XH01700004,Approaches to elucidate the basis of desiccati...,"Plants undergo a series of physiological, bioc...",Plant


In [6]:
# Manually corrected version of the automatic classifications (same columns).
manual = pd.read_csv(
    '../data/citation_network/DEBUGGED_automatic_classifications_core_collection_destol_or_anhydro_RV_manually_corrected.csv'
)
manual.head(n=5)

Unnamed: 0,UID,title,abstract,study_system
0,WOS:000249421700004,Phenotypic plasticity mediates climate change ...,Synergies between global change and biological...,Animal
1,WOS:000189080600003,"The importance of cuticular permeability, osmo...",Euedaphic collembolans have recently been show...,Animal
2,WOS:A1993LX94500007,GEOGRAPHICAL VARIATION IN THE ACCLIMATION RESP...,Populations may adapt to climatic stresses by ...,Animal
3,WOS:000182189500050,Transition from natively unfolded to folded st...,Late embryogenesis abundant (LEA) proteins are...,Animal
4,WOS:000170973900012,Mechanisms of plant desiccation tolerance,Anhydrobiosis ('life without water') is the re...,Animal


In [7]:
# Put the automatic and manual label side by side for every UID.
# merge() defaults to an inner join, so a UID missing from either file is dropped.
# NOTE(review): UIDs are assumed unique in both files; duplicates would multiply
# rows here. Consider validate='one_to_one' once that has been confirmed.
comparison = original[['UID', 'study_system', 'title', 'abstract']].merge(
    manual[['UID', 'study_system']],
    on='UID',
    suffixes=('_original', '_manual'),
)

In [8]:
# Preview the first rows of the merged table.
comparison.head(n=5)

Unnamed: 0,UID,study_system_original,title,abstract,study_system_manual
0,WOS:A1990ET59600010,Plant,RESPONSE OF 4 SORGHUM LINES TO MID-SEASON DROU...,Four sorghum (Sorghum bicolor L. Moench) lines...,Plant
1,WOS:000244317000009,Plant,Effects of abscisic acid on growth and dehydra...,Cynanchum komarovii is well adapted to hot and...,Plant
2,WOS:000186691200001,Plant,LEAping to conclusions: A computational reanal...,Background: The late embryogenesis abundant (L...,Plant
3,WOS:000178654700046,Plant,Early salt stress effects on the changes in ch...,A technique based on Fourier transform infrare...,Plant
4,WOS:A1997XH01700004,Plant,Approaches to elucidate the basis of desiccati...,"Plants undergo a series of physiological, bioc...",Plant


In [10]:
# Rows where the manual label differs from the automatic one.
comparison.loc[
    comparison['study_system_manual'].ne(comparison['study_system_original'])
]

Unnamed: 0,UID,study_system_original,title,abstract,study_system_manual
34,WOS:000169703200006,Animal,Changes in oligosaccharide content and antioxi...,Seeds of bean (Phaseolos vulgaris cv. Vernel) ...,Plant
39,WOS:000087110100001,NOCLASS,Dehydration in dormant insects,Many of the mechanisms used by active insects ...,Animal
46,WOS:000171149800011,Fungi,Levels of variation in stress resistance in Dr...,Stress resistance traits in Drosophila often s...,Animal
63,WOS:000226518000010,Animal,Emergence stress and morphological constraints...,The effects of emergence stress and morphologi...,Plant
65,WOS:A1996VM10600001,NOCLASS,Stress tolerance in intertidal seaweeds,Intertidal seaweeds are periodically exposed t...,Plant
...,...,...,...,...,...
5513,WOS:000425958000005,Microbe,A Grapevine-Inducible Gene Vv-alpha-gal/SIP Co...,Grapevine is an important fruit crop cultivate...,Plant
5516,WOS:000427208800001,NOCLASS,Desiccation resistance determines distribution...,Forest edges show strong abiotic and biotic gr...,Plant
5588,WOS:000373125900005,Animal,Biofilm assembly becomes crystal clear - filam...,Pseudomonas aeruginosa is an opportunistic bac...,Microbe
5615,WOS:000455747900024,Fungi,Plasticity of a holobiont: desiccation induces...,The role of host-associated microbiota in endu...,Plant


In [41]:
# Spot-check a single record by its UID.
comparison.loc[comparison['UID'].eq('WOS:000244031100003')]

Unnamed: 0,UID,study_system_original,title,abstract,study_system_manual,correct
33,WOS:000244031100003,Animal,Modelling the effects of microclimate on bean ...,Bean seed storage ability is of major interest...,Animal,True


## Quantify misclassifications
### Amount of each class
How many belong to each class in the automatic versus manually verified annotations?

In [27]:
# Class sizes according to the automatic classifier.
comparison['study_system_original'].value_counts()

study_system_original
Plant      3380
Animal     1273
Microbe     570
Fungi       237
NOCLASS     166
Name: count, dtype: int64

In [28]:
# Class sizes after manual correction.
comparison['study_system_manual'].value_counts()

study_system_manual
Plant      3548
Animal     1335
Microbe     556
Fungi       143
NOCLASS      44
Name: count, dtype: int64

### Calculating performance
To get a general idea of how we performed, we'll ignore specific classes and just check correct versus incorrect.

In [29]:
# True where the automatic label agrees with the manual one.
comparison['correct'] = comparison['study_system_original'].eq(
    comparison['study_system_manual']
)

In [30]:
# Accuracy is the fraction of rows where the automatic and manual labels agree.
# The mean of the boolean column gives that fraction directly and, unlike
# indexing value_counts() by True/False, does not raise KeyError when one of
# the two outcomes never occurs (e.g. every label was already correct).
accuracy = comparison['correct'].mean()
print(f'Overall classification accuracy was {accuracy*100:.2f}%')

Overall classification accuracy was 93.23%


We can look at an overall F1 score:

In [31]:
# Manual labels are the ground truth, automatic labels are the predictions.
f1_overall = f1_score(
    comparison['study_system_manual'],
    comparison['study_system_original'],
    average='weighted',  # weighted accounts for class imbalance
)
print(f'Overall F1 score is {f1_overall:.2f}')

Overall F1 score is 0.94


As well as F1 for each class:

In [32]:
# Per-class F1, with manual labels as ground truth.
# The label order is passed to f1_score explicitly so that each score is paired
# with the right class name even when a class occurs in only one of the two
# label columns (without `labels=`, the scores follow the union of both columns,
# which need not match the classes present in the manual column alone).
class_labels = sorted(
    set(comparison['study_system_manual']) | set(comparison['study_system_original'])
)
per_class_scores = f1_score(
    comparison['study_system_manual'],
    comparison['study_system_original'],
    labels=class_labels,
    average=None,
)
f1_by_class = dict(zip(class_labels, per_class_scores))
for n, f in f1_by_class.items():
    print(f'F1 score for class {n} is {f:.2f}')

F1 score for class Animal is 0.92
F1 score for class Fungi is 0.70
F1 score for class Microbe is 0.95
F1 score for class NOCLASS is 0.42
F1 score for class Plant is 0.96


## Updating graph
We want to both update the classifications and remove the nodes that are true NOCLASS.

In [33]:
# Map each UID to its manually verified study system.
to_update = dict(zip(comparison['UID'], comparison['study_system_manual']))

In [34]:
# Write the manual label onto each node as its 'study_system' attribute.
# NOTE(review): UIDs in to_update that are not nodes of the graph appear to be
# skipped silently by networkx; confirm if exact coverage matters.
nx.set_node_attributes(
    G=debugged_graph,
    values=to_update,
    name='study_system',
)

In [35]:
# UIDs whose manually verified label is NOCLASS; these are removed from the graph.
to_drop = [
    uid
    for uid, label in to_update.items()
    if label == 'NOCLASS'
]
len(to_drop)

44

In [36]:
# Drop the NOCLASS nodes from the graph in place.
debugged_graph.remove_nodes_from(nodes=to_drop)

Double check that we updated and dropped correctly by verifying numbers:

In [37]:
# Count the remaining nodes per study system.
verification = defaultdict(int)
for node, attrs in debugged_graph.nodes(data=True):
    # Plain indexing on purpose: a node without a 'study_system' attribute
    # raises KeyError instead of being silently left out of the tally.
    label = attrs['study_system']
    verification[label] += 1
verification

defaultdict(int, {'Plant': 3318, 'Animal': 1222, 'Microbe': 514, 'Fungi': 130})

Looks good! Now write out:

In [38]:
# Save the relabelled graph (NOCLASS nodes removed) as GraphML.
nx.write_graphml(
    debugged_graph,
    '../data/citation_network/FINAL_DEBUGGED_MANUALLY_VERIFIED_core_collection_destol_or_anhydro_classified_network_25Mar2024.graphml',
)