In [1]:
import numpy as np
import pandas as pd
import rispy as rs
import os
from fuzzywuzzy import fuzz
from pandas_dedupe import dedupe_dataframe
import ast

In [2]:
def find_partitions(df, match_func, max_size=None, block_by=None):
    '''
    Group the rows of a DataFrame into partitions of matching records.

    Starting from the first remaining record, every record that matches it
    (directly, or transitively through a record already collected) is pulled
    into the same partition. The partition's records are then removed and the
    search restarts on what is left, until no records remain.

    Currently not used: the pairwise comparison took incredibly long.

    Parameters
    ----------
    df : pd.DataFrame
        Records to partition. The index values identify the records.
    match_func : callable
        ``match_func(r1, r2)`` receives two rows of ``df.to_records()`` and
        returns a truthy value when they should share a partition.
    max_size : int, optional
        Maximum number of records per partition. ``None`` means unbounded.
    block_by : column label(s), optional
        If given, the search is run separately inside each group of
        ``df.groupby(block_by)`` and the per-group results are stitched
        together with non-overlapping partition ids.

    Returns
    -------
    pd.Series
        Partition id for each index value of ``df``.
    '''

    # If block_by is provided, then we apply the algorithm to each block and
    # stitch the results back together
    if block_by is not None:
        blocks = df.groupby(block_by).apply(lambda g: find_partitions(
            df=g,
            match_func=match_func,
            max_size=max_size
        ))

        # Ids restart at 0 inside every block; shift each block by the last
        # id of the previous (already shifted) block so ids stay unique.
        keys = blocks.index.unique(block_by)
        for a, b in zip(keys[:-1], keys[1:]):
            blocks.loc[b, :] += blocks.loc[a].iloc[-1] + 1

        return blocks.reset_index(block_by, drop=True)

    # to_records() stores the index in a field named after the index,
    # falling back to 'index' when the index is unnamed.
    index_field = df.index.name or 'index'

    # Records are easier to work with than a DataFrame
    records = df.to_records()

    # This is where we store each partition
    partitions = []

    def is_full(partition):
        return max_size is not None and len(partition) >= max_size

    def find_partition(at=0, partition=None, indexes=None):
        # NOTE: recursion depth grows with the length of a chain of matches.
        r1 = records[at]

        if partition is None:
            partition = {r1[index_field]}
            indexes = [at]

        for i, r2 in enumerate(records):

            # Checked before every addition (not only on entry) so that a
            # caller's loop cannot keep growing the partition after a nested
            # call has already filled it up to max_size.
            if is_full(partition):
                break

            if i == at or r2[index_field] in partition:
                continue

            if match_func(r1, r2):
                partition.add(r2[index_field])
                indexes.append(i)
                find_partition(at=i, partition=partition, indexes=indexes)

        return partition, indexes

    while len(records) > 0:
        partition, indexes = find_partition()
        partitions.append(partition)
        # Drop the records just assigned so they are not visited again
        records = np.delete(records, indexes)

    return pd.Series({
        idx: partition_id
        for partition_id, idxs in enumerate(partitions)
        for idx in idxs
    })

In [3]:
def default_csv_read(filepath, file):
    '''
    Load ``<filepath>/<file>`` as a two-column table split on ``'  - '``.

    The first line of the file is consumed as a header and replaced by the
    column names ``label`` and ``content``.
    '''
    target = f'{filepath}/{file}'
    read_options = dict(
        header=0,
        delimiter='  - ',
        names=['label','content'],
        engine='python',
    )
    return pd.read_csv(target, **read_options)

def read_ris(filepath):
    '''Open ``filepath`` for reading and return the entries parsed by ``rs.load``.'''
    with open(filepath, 'r') as handle:
        entries = rs.load(handle)
    return entries

def write_ris(entries, filepath):
    '''Serialise ``entries`` with ``rs.dump`` into ``filepath`` (overwritten). Returns None.'''
    with open(filepath, 'w') as handle:
        rs.dump(entries, handle)

def get_titles(risfiles):
    '''
    Collect the ``'title'`` of every entry, one list per input file.

    ``risfiles`` is an iterable of entry collections; each entry must support
    ``entry['title']``. The result keeps the order of files and entries.
    '''
    return [
        [record['title'] for record in entries]
        for entries in risfiles
    ]

def compare_lists(list_of_lists):
    '''
    Return the set of items that occur in more than one of the given lists.

    Repeats inside a single list do not count: each list is reduced to its
    distinct items before it is compared with the lists seen so far.
    Matching is exact.

    TODO: integrate fuzzy matching ("fuzzy enough" instead of exact
        membership); this is possibly incompatible with the use of sets.
    '''
    already_seen = set()
    shared = set()

    for items in list_of_lists:
        distinct = set(items)
        shared |= distinct & already_seen
        already_seen |= distinct

    return shared

def matching(list1, list2, cutoff=60, scorer=None):
    '''
    Split ``list1`` into items that fuzzily match something in ``list2`` and
    items that do not.

    Comparison is case-insensitive. An item of ``list1`` counts as matched
    when its score against at least one item of ``list2`` is strictly greater
    than ``cutoff``.

    Parameters
    ----------
    list1, list2 : iterable of str
        Strings to compare; ``list1`` is the one being classified.
    cutoff : number, optional
        Score that must be exceeded for a match (default 60).
    scorer : callable, optional
        ``scorer(a, b)`` returning a similarity score. Defaults to
        ``fuzz.ratio``.

    Returns
    -------
    (list, list)
        ``(matching, no_matching)``. Every item of ``list1`` appears in
        exactly one of the two lists, once per occurrence in ``list1`` and in
        its original order.
    '''
    if scorer is None:
        scorer = fuzz.ratio

    # Lower-case the candidates once instead of once per comparison.
    candidates = [item2.lower() for item2 in list2]

    matched = []
    unmatched = []
    for item1 in list1:
        needle = item1.lower()
        # Decide on ALL candidates, not just the last one compared, and
        # record the item a single time however many candidates it matches.
        if any(scorer(needle, candidate) > cutoff for candidate in candidates):
            matched.append(item1)
        else:
            unmatched.append(item1)

    return matched, unmatched

def get_dupes(path):
    '''
    Find entries of the first RIS file in ``path`` whose title fuzzily
    matches a title in the second file.

    Files are processed in sorted filename order so the result does not
    depend on the arbitrary order of ``os.listdir``. Only the first two files
    are compared; an ``IndexError`` is raised when ``path`` holds fewer than
    two files.

    Returns
    -------
    (list, list)
        ``(entries, risfiles)``: one entry per distinct duplicate title (the
        first entry carrying that title, scanning the files in order), and
        the parsed content of every file in ``path``.
    '''
    filenames = sorted(os.listdir(path))
    risfiles = [read_ris(f'{path}/{fname}') for fname in filenames]

    # Find duplicate titles
    tls = get_titles(risfiles)

    # Exact-match alternative: dupes = list(compare_lists(tls))
    dupes = matching(tls[0], tls[1])[0]

    print(f'{len(dupes)} duplicates found in {filenames}: \n{dupes}')

    # Index the first entry seen for each title once, instead of rescanning
    # every entry of every file for every duplicate title.
    first_entry_by_title = {}
    for file in risfiles:
        for entry in file:
            first_entry_by_title.setdefault(entry['title'], entry)

    # Get the entries in the ris files corresponding to the duplicate titles
    fe = []
    seen = set()
    for title in dupes:
        if title not in seen:
            seen.add(title)
            fe.append(first_entry_by_title[title])

    return fe, risfiles

def dedupe_naive(file, keys=['doi','title']):
    '''
    Deduplicate a large ris file using naive string matching (non-fuzzy).

    For each column in ``keys``, in turn, rows whose lower-cased value repeats
    an earlier row's value are dropped. Rows where the column is null are
    always kept. Prints the shape before and after, and returns the reduced
    DataFrame.
    '''
    records = pd.DataFrame.from_dict(read_ris(file))

    deduped = records
    for column in keys:
        lowered = deduped[column].str.lower()
        is_first_occurrence = ~lowered.duplicated()
        is_missing = deduped[column].isnull()
        deduped = deduped[is_first_occurrence | is_missing]

    print(f'before:{records.shape}, after: {deduped.shape}')

    return deduped

def merge_ris(path, outfilename, overwrite=False):
    '''
    Concatenate every file in ``path`` into the single file ``outfilename``.

    Files are appended in sorted filename order. Byte-order marks (U+FEFF)
    are stripped because they mess up the dedupe step afterwards, and a
    newline is added after any file that does not end with one so that the
    last line of one file is never fused with the first line of the next.

    Parameters
    ----------
    path : str
        Directory whose files are merged.
    outfilename : str
        File to create. If it already exists it is only replaced when
        ``overwrite`` is true; otherwise a message is printed and nothing is
        written.
    overwrite : bool, optional
        Allow replacing an existing ``outfilename`` (default False).

    Returns
    -------
    None
    '''
    # Sorted: os.listdir returns names in arbitrary order.
    files = sorted(os.listdir(path))

    if os.path.exists(outfilename) and not overwrite:
        print('merge file already exists, change name or set overwrite function parameter to True')
        return None

    target = os.path.abspath(outfilename)
    with open(outfilename, 'w') as outfile:
        for fname in files:
            source = f'{path}/{fname}'
            # Never feed the output file back into itself when it lives
            # inside ``path`` and is being overwritten.
            if os.path.abspath(source) == target:
                continue
            with open(source) as infile:
                # Remove special characters that mess up dedupe after
                data = infile.read().replace('\ufeff', '')
            outfile.write(data)
            if data and not data.endswith('\n'):
                outfile.write('\n')
    return None

def dedupe_ml(file, keys=['doi','title','authors']):
    '''
    Machine-learning dedupe of a RIS file.

    The file is parsed with read_ris, loaded into a DataFrame and passed to
    dedupe_dataframe together with ``keys`` (the columns to compare). The
    returned frame carries a 'cluster id' column: entries judged equal are
    given the same cluster id. One row per cluster id (the first one met) is
    kept.

    Returns a 4-tuple ``(dl, a, df, risfile)``:
      dl      -- list of dicts, one per kept row, with null / 'nan' values
                 removed and the list-like fields parsed back from strings
      a       -- DataFrame of the kept rows
      df      -- DataFrame built from the parsed file
      risfile -- the entries returned by read_ris
    '''
    # read file
    risfile = read_ris(file)
    df = pd.DataFrame.from_dict(risfile)
    # find dupes and assign cluster ids to them
    # (in the recorded run of this notebook this call prompted for
    # interactive labeling of candidate pairs)
    dfd = dedupe_dataframe(df, keys)

    # dedupe using found cluster ids by uniquefying
    # NOTE: 1. The below commented doesn't work because dedupe_dataframe removes list structure from authors, so the subsequent write to ris fails using write-ris_dedupe
    # a = dfd[~dfd['cluster id'].duplicated()]
    # print(f'deduped from {df.shape} to {a.shape}')

    # Get original dataframe entries with good .ris format that correspond to a unique cluster id
    # NOTE: 1.1 Do manually instead
    temp = []
    seen = set()
    for index, row in df.iterrows():
        # df's index label is used as a *position* into dfd.
        # NOTE(review): this assumes df has the default 0..n-1 index and that
        # dfd keeps df's row order -- confirm against dedupe_dataframe.
        cluster_id = dfd.iloc[index]['cluster id']
        if cluster_id not in seen:
            temp.append(row)
            seen.add(cluster_id)
    a = pd.DataFrame(temp)
    print(f'deduped from {df.shape} to {a.shape}')

    # One dict per kept row, without null values and without the literal string 'nan'
    dl = [{k:v for k,v in m.items() if pd.notnull(v) and v != 'nan'} for m in a.to_dict('records')]

    def destring(dl):
        '''Parse the list-like fields of each dict, in place, with ast.literal_eval.'''
        # NOTE(review): literal_eval implies these values are string
        # representations at this point, presumably because dedupe_dataframe
        # stringified df's columns (see NOTE 1 above) -- confirm.
        s = set(['authors', 'keywords', 'notes', 'first_authors', 'secondary_authors'])

        for i in dl:
            for k,v in i.items():
                if k in s:
                    i[k] = ast.literal_eval(v)
                # else:
                    # i[k] = v

        return dl

    dl = destring(dl)

    return dl, a, df, risfile

In [5]:
'''
Right now only works for 2 ris files
Provides a list of duplicates from a collection of .ris files
Duplicates are identified by title only, based on a fuzzy string search 
Using the fuzzywuzzy package with an equality cutoff ratio of 60/100
'''
# Identifier of the data set being processed: it names the sub-directory of
# ../data and is reused in the .ris file names of the cells below.
# NOTE(review): '220526' looks like a YYMMDD date -- confirm.
name = '220526'
# Directory holding this data set's input files
datapath = f'../data/{name}'

In [6]:
# dupes, risfiles = get_dupes(path=f'{datapath}/core')
# merge_ris(path=f'{datapath}/core', outfilename=f'{datapath}/core.ris', overwrite=False)

In [7]:
# merge_ris(path=f'{datapath}/merge', outfilename=f'{datapath}/{name}.ris', overwrite=False)

In [8]:
# Deduplicate the merged file <datapath>/<name>.ris. The merge_ris call that
# writes a file with this name is commented out in the previous cell, so the
# file has to exist already when this cell runs.
dl, a, df, risfile = dedupe_ml(f'{datapath}/{name}.ris')

# Write the deduplicated entries to the asreview input directory.
# NOTE(review): write_ris has no return statement, so f is always None.
f = write_ris(dl, f'../asreview/input/{name}_dedupe.ris')

Importing data ...


  dfd = dedupe_dataframe(df, keys)
doi : 10.1159/000024641
title : uk charitable organisations concerned with the effects of carbon monoxide poisoning
authors : None

doi : None
title : uk charitable organisations concerned with the effects of carbon monoxide poisoning
authors : anonymous

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


Starting active labeling...


doi : 10.4324/9781315613789
title : the ashgate research companion to world methodism
authors : gibson, w., forsaith, p., wellings, m.

doi : None
title : the ashgate research companion to world methodism
authors : gibson, w., forsaith, p., wellings, m.

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious
doi : None
title : to make a difference: the founding of the crohns and colitis foundation of america.
authors : rosenthal, s.

doi : None
title : to make a difference: the founding of the crohns and colitis foundation of america
authors : rosenthal, s

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious
doi : None
title : the enterprise in the society: corporate social engagement by sanlam, 1918-1980
authors : verhoef, g.

doi : None
title : the enterprise in the society: corporate social engagement by sanlam, 1918-1980
authors : verhoef, g

3/10 p

Clustering...
# duplicate sets 2443
deduped from (3530, 31) to (2443, 31)


# Experiments

In [64]:
# naive dedupe
# filepath = '../asreview/input/220420.ris'
# df = dedupe(filepath)

# https://stackoverflow.com/questions/26033301/make-pandas-dataframe-to-a-dict-and-dropna
# a=[ v.dropna().to_dict() for k,v in dfd.iterrows() ]
# write_ris(a,'test.ris')

In [65]:
# files = ['1-20scopus.ris', '1-20savedrecs.ris']
# filepath = '../data/220420'

# dfs = []
# for file in files:
#     risfile = read_ris(f'{filepath}/{file}')
#     dfs += [pd.DataFrame.from_dict(risfile)]
# df = pd.concat(dfs,ignore_index=True)
# def same_title(t1, t2):
#     return fuzz.ratio(t1['title'].lower(), t2['title'].lower()) > 75
    
# df['real_id'] = find_partitions(
#     df=df,
#     match_func=same_title
# )

# df[df.duplicated(subset='real_id')]