In [1]:
"""
Code to create the expt-id for the shared data of JCIM.

Raw data was obtained from PoLyInfo, and the data was processed to create a DataFrame with expt-id (i.e. to assign an expt-id to each rid).

[Columns of the DataFrame]
Sample ID: following the format of PoLyInfo
Reference: reference information (e.g. 'Kraybill, Richard R. , Polymer Engineering and Science , 21 , 3 , 124-128 (1981)')
rid: id of reference
expt: id of experimental group (expt-id)

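expt-id is created from the author lists of the references: references that share an author (directly, or indirectly through a chain of co-authors) are grouped together, and the largest group is afterwards split into smaller expt-ids to mitigate the imbalance in group size.
Note that rid and expt-id are different because different references can have the same author. (Each expt-id has a unique set of rids.)
Note that the data size of this product (df_save) is different from that of the final data used for prediction (data_[ETC].csv) because the list of polymers and fillers will be limited after the data processing.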
"""
# %%
import pandas as pd
from collections import defaultdict

# select which dataset to process (index 0 -> 'EC', index 1 -> 'TC')
ETC = ('EC', 'TC')[1]
df_raw = pd.read_csv(f'sample_detail_{ETC}.csv')

# rid (id of reference) is the part of 'Sample ID' before the first '-';
# it is placed as the second column of the raw table
df_raw.insert(
  loc=1,
  column='rid',
  value=df_raw['Sample ID'].str.split('-').str[0],
)

# one row per rid, keeping the reference text of its first occurrence
df_rid = (
  df_raw[['Reference', 'rid']]
  .drop_duplicates(subset='rid', keep='first')
  .reset_index(drop=True)
)

# independent copy of the raw data that will receive the expt-id and be saved
df_save = df_raw.copy()

In [2]:
def _authors_from_reference(ref):
  """Return the list of author keys parsed from one reference string.

  A reference that starts with a space or contains 'cond-mat' is treated as
  having no usable author information: the text before the first '(' is used
  as a single pseudo-author ('unknown' if that text is empty).
  Otherwise the reference is split on ';' and every piece is reduced to
  'Surname, X' (surname, comma, space and the first letter of the first name).
  A non-string reference (e.g. a missing value) yields ['unknown'].
  """
  # missing reference: nothing to parse
  if not isinstance(ref, str):
    return ['unknown']
  # without author information or unknown
  # (startswith is used instead of ref[0] so that an empty string does not raise)
  if ref.startswith(' ') or 'cond-mat' in ref:
    a = ref.split('(')[0]
    return [a if len(a)>0 else 'unknown']
  # with author information
  la = []
  # make author list by split reference information with ';'
  for a in ref.split(';'):
    # remove the part after the authors' name if it exists
    a_ = a.split(' ,')[0].lstrip()
    # keep only the surname and the first letter of the first name ('Surname, X').
    # If the piece has no comma, keep it whole: str.find would return -1 and
    # slicing with -1+3 would silently cut the name down to its first 2 characters.
    i_comma = a_.find(',')
    a__ = a_[:i_comma+3] if i_comma>=0 else a_
    # if the name is not empty, add it to the author list
    if len(a__)>0: la.append(a__)
  return la

set_author = set()
# dictionary of rid and list of author
d_rid2author = {}
# the loop variable is named rid (not id) so that the builtin id() is not shadowed
for ref,rid in df_rid[['Reference','rid']].values:
  la = _authors_from_reference(ref)
  d_rid2author[rid] = la
  set_author|=set(la)

In [3]:
# dictionary of author and rid
# The author key is matched as a literal substring of the reference text
# (regex=False), so characters such as '.', '[' or '+' inside a key are not
# interpreted as regular-expression syntax. na=False makes a missing reference
# count as "no match" instead of producing an NA entry in the boolean mask.
# NOTE(review): this is substring matching, so a short key can also match
# inside a longer surname of another reference - confirm this is intended.
d_author2rid = {author:set(df_rid.loc[df_rid['Reference'].str.contains(author, regex=False, na=False),'rid']) for author in set_author}
# dictionary of author and number of rid
# (df_rid holds one row per rid, so the size of the rid set equals the number
# of matching rows and the references do not need to be scanned a second time)
d_author2Nrid = {author:len(rids) for author,rids in d_author2rid.items()}
# sort the dictionary by the number of rid
d_author2Nrid = dict(sorted(d_author2Nrid.items(), key=lambda item: item[1], reverse=True))

# dictionary of author and coauthor (author who has the same rid)
# An author whose rid set is empty gets no entry here.
d_author2coauthor = defaultdict(set)
for author,rids in d_author2rid.items():
  # the loop variable is named rid (not id) so that the builtin id() is not shadowed
  for rid in rids:
    d_author2coauthor[author]|=set(d_rid2author[rid])
d_author2coauthor = dict(d_author2coauthor)

# initial state for the iterative grouping of authors
d_author2group = d_author2coauthor.copy()
set_group = set()
set_group_raw = list(d_author2coauthor.values())

In [4]:
# group of authors who have the same rid
# Each pass replaces every author's group by the union of the groups of all
# its members, and the passes are repeated until no group changes any more.
# The stop test compares the groups themselves, not the number of distinct
# groups: that number can stay constant while the groups are still growing
# (e.g. a long chain of co-authors), which would end the merging too early.
while True:
  d_author2group_raw = d_author2group
  set_group_raw = set_group
  d_author2group = {}
  for author,group in d_author2group_raw.items():
    # start from the current group so that a group can only grow,
    # which guarantees that the loop terminates
    set_ = set(group)
    for a in group: set_|=d_author2group_raw[a]
    d_author2group[author] = set_
  # distinct non-empty groups, each stored as a sorted tuple of authors
  set_group = {tuple(sorted(v)) for v in d_author2group.values() if len(v)>0}
  if d_author2group==d_author2group_raw: break

# set of rid for each group:
# for every author group, merge the rid sets of all its authors and keep the
# result as a sorted tuple; groups without any rid are left out
set_rid_group = {
  tuple(sorted(rids))
  for rids in (
    set().union(*(d_author2rid[member] for member in authors))
    for authors in set_group
  )
  if rids
}

# dictionary of expt-id and rid
# Groups are numbered from the largest to the smallest. Groups of equal size
# are ordered by their rid tuple so that the numbering does not depend on the
# iteration order of the set (which can differ between Python sessions).
d_expt2rid_raw = {i+1:group for i,group in enumerate(sorted(set_rid_group, key=lambda item: (-len(item), item)))}

# In order to mitigate the imbalance in data size in expt-group partitioning, the expt-id with the highest number of rids is partitioned to match the count of the second highest number of rids.
n_expt = len(d_expt2rid_raw)
n_no1 = len(d_expt2rid_raw[1]) if n_expt>=1 else 0
# with a single group there is no second size to match: use the group's own
# size, so the group is re-added below as one chunk
n_no2 = len(d_expt2rid_raw[2]) if n_expt>=2 else n_no1
# number of chunks, rounded up so that the last (possibly shorter) chunk keeps
# the rids left over after the full-size chunks instead of dropping them
n_sep = -(-n_no1//n_no2) if n_no2>0 else 0
for i in range(n_sep):
  # the chunks are appended after the existing groups; entry 1 itself is left in place
  d_expt2rid_raw[n_expt+i+1] = d_expt2rid_raw[1][i*n_no2:(i+1)*n_no2]

In [5]:
# dictionary of expt-id and rid
# The first entry of d_expt2rid_raw is skipped; the remaining entries are
# renumbered from 1 in order of decreasing number of rids.
d_expt2rid = {i+1:rid for i,rid in enumerate(sorted(list(d_expt2rid_raw.values())[1:], key=len, reverse=True))}
# dictionary of expt-id and author group
d_expt2group = {expt:{author for rid in set_rid for author in d_rid2author[rid]} for expt,set_rid in d_expt2rid.items()}
# dictionary of rid and expt-id (final product)
d_rid2expt = {rid:expt for expt,set_rid in d_expt2rid.items() for rid in set_rid}

# assign expt-id for each rid
# DataFrame.insert raises ValueError if the column already exists, so a column
# left over from a previous run of this cell is removed first; this makes the
# cell safe to re-run.
if 'expt' in df_save.columns:
  df_save = df_save.drop(columns='expt')
# a rid without an entry in d_rid2expt gets NaN in the 'expt' column
df_save.insert(2,'expt',df_save['rid'].map(d_rid2expt))