<a href="https://colab.research.google.com/github/sinairusinek/JRoL/blob/main/notebooks/listrelation_gephi_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting CSV for Gephi based on listRelation annotation

This notebook was written by Dror Guldin, Avigail Friedland and Anne K in the framework of the Potsdam DH-Jewish Hackathon 2022.

To use it, you need:

**Input**:

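1. XML that includes a listRelation annotation
2. A tab-separated file without a header row, with two columns (node ID and label), used in the "Add labels" section to label the nodes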

**Output**:

1. CSV titled "edges", to be used as the edges input in Gephi
2. CSV titled "labled_nodes", to be used as the nodes input in Gephi


In [None]:
from google.colab import files
import xml.etree.ElementTree as ET
import csv
import pandas as pd

In [None]:
uploaded = files.upload()

# Report the name and byte size of every uploaded file.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))

Saving profiledesc_big.xml to profiledesc_big.xml
User uploaded file "profiledesc_big.xml" with length 271715 bytes


In [None]:
# Parse the content of the first uploaded file into an ElementTree root element.
first_uploaded_name = list(uploaded)[0]
abstract_xml_tree = ET.fromstring(uploaded[first_uploaded_name])

In [None]:
# Map each correspDesc `ref` value to the attributes of the first child of the
# listRelation found inside the following <abstract> of the same parent element
# (presumably a <relation> element -- confirm against the XML schema).
# References without a listRelation keep an empty dict.
relations_dict = {}

for profileDesc in abstract_xml_tree:
  # Reset for every parent element so a relation is never attached to a
  # reference left over from an earlier one, and so the name is always bound.
  corresp_ref = None
  for children_element in profileDesc:
    if children_element.tag == 'correspDesc':
      # .get() tolerates a correspDesc that has no `ref` attribute at all.
      ref = children_element.attrib.get('ref', '')
      if len(ref) > 1:
        corresp_ref = ref
        relations_dict[corresp_ref] = {}
    elif children_element.tag == 'abstract' and corresp_ref is not None:
      for abstract_element in children_element:
        # An empty listRelation has no first child; skip it instead of
        # raising IndexError.
        if abstract_element.tag == 'listRelation' and len(abstract_element) > 0:
          listrelation_dict = dict(abstract_element[0].attrib)
          relations_dict[corresp_ref] = listrelation_dict

In [None]:
# The dict keys become columns first; transposing turns every reference into a
# row, and reset_index() moves the reference into a regular 'index' column.
relations_by_attribute = pd.DataFrame(relations_dict)
relations_df = relations_by_attribute.transpose().reset_index()

In [None]:
# Keep only rows whose number of missing values is below (column count - 1),
# i.e. drop rows where every column except one is empty.
max_missing = len(relations_df.columns) - 1
missing_per_row = relations_df.isna().sum(axis=1)
relations_df = relations_df[missing_per_row < max_missing]

In [None]:
# Count the rows for each (active, passive, name) combination, then rename the
# columns to source/target/label/weight for the edges table.
edge_counts = (
    relations_df
    .groupby(['active', 'passive', 'name'])['index']
    .count()
    .reset_index()
)
edges_df = edge_counts.rename(columns={
    'active': 'source',
    'passive': 'target',
    'name': 'label',
    'index': 'weight',
})

In [None]:
# Lookup table from a one-letter code to a readable node type name.
node_dict = dict(
    E='Event',
    K='Organization',
    O='Place',
    P='Person',
    W='Work',
)

In [None]:
# Build the node table from every ID that occurs as an edge source or target.
nodes_set = set(edges_df['source']) | set(edges_df['target'])
# Set iteration order is arbitrary; sorting makes the row order reproducible.
node_ids = sorted(nodes_set)
nodes_df = pd.DataFrame({'Id': node_ids, 'Label': node_ids})
# The first character of an ID is looked up in node_dict; unknown codes are
# kept as-is. This must read from nodes_df itself: labled_nodes_df is only
# created in a later cell, so referencing it here fails on a fresh kernel.
nodes_df['node_type'] = nodes_df['Id'].map(
    lambda node_id: node_dict.get(node_id[0], node_id[0]))

In [None]:
# Write both tables as CSV files, leaving out the DataFrame index column.
for frame, csv_name in ((edges_df, 'edges.csv'), (nodes_df, 'nodes.csv')):
  frame.to_csv(csv_name, index=False)

## Add labels

In [None]:
from io import StringIO, BytesIO

In [None]:
# Ask for a second upload; the next cell reads its first file as a
# headerless table with the columns id and label.
uploaded2 = files.upload()

Saving key_label-1663342729524.tsv to key_label-1663342729524.tsv


In [None]:
# Read the first uploaded file as a headerless table, name its two columns
# and index it by id so it can be joined onto the nodes.
labels_file_name = list(uploaded2)[0]
labels_raw = pd.read_table(BytesIO(uploaded2[labels_file_name]), header=None)
labels_raw.columns = ['id', 'label']
labels_df = labels_raw.set_index('id')

In [None]:
# Left-join the labels onto the nodes by ID, so nodes without a label are kept
# (with a missing label), then keep only the label and node type columns.
nodes_by_id = nodes_df.set_index('Id')
nodes_with_labels = nodes_by_id.join(labels_df, how='left')
labled_nodes_df = nodes_with_labels[['label', 'node_type']].reset_index()

In [None]:
# Translate node type codes through node_dict. Values that are not keys of
# node_dict are returned unchanged, so re-running this cell is harmless.
# Series.map replaces the row-wise DataFrame.apply(axis=1): it only touches the
# one column that is needed and also works when the frame has no rows.
labled_nodes_df['node_type'] = labled_nodes_df['node_type'].map(
    lambda node_type: node_dict.get(node_type, node_type))

In [None]:
# Show the labelled node table (a bare last expression is rendered as the cell output).
labled_nodes_df

Unnamed: 0,Id,label,node_type
0,E.0000031,,Event
1,P.0004014,"Pomeranz, Jochanan Hans",Person
2,K.0000001,Zionistische Organisation,Organization
3,E.0000047,,Event
4,O.0000007,Wien,Place
...,...,...,...
59,W.0000005.01,Zwei Geschichten von der Cholera,Work
60,W.0000028.01,,Work
61,K.0000017,,Organization
62,W.0000071.01.01,"Dichter, Denker, Helfer",Work


In [None]:
# Save the labelled node table as CSV, leaving out the DataFrame index column.
labled_nodes_df.to_csv('labled_nodes.csv',index=False)

### Output

In [None]:
# Send the two result files to the browser, nodes first and edges second.
for output_name in ('labled_nodes.csv', 'edges.csv'):
  files.download(output_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>