# Compare titles with the help of journal, volume and year

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_rows', 200)

In [2]:
import json
from fuzzywuzzy import fuzz
import unidecode as ud
from translate import Translator #pip install translate
translator= Translator(from_lang="es",to_lang="en",email='rescolo@gmail.com')#10 000 with email 1000 without email

def partialdecode(s):
    s=str(s)
    return ud.unidecode(s.lower().strip())
def fulldecode(s):
    s=str(s)
    return partialdecode(s
                       ).replace('.',''
                       ).replace(' ',''
                       ).replace('(',''
                       ).replace(')',''
                       ).replace('[',''
                       ).replace(']',''
                       ).replace('{',''
                       ).replace('}',''
                       ).replace('-',''
                       ).replace('_',''
                       ).replace('#',''
                       ).replace('$',''
                       ).replace('\\',''
                       ).replace('%',''
                       ).replace('/',''
                       ).replace('&',''
                       ).replace('"',''
                       ).replace("'",''
                       ).replace("*",''
                       ).replace("^",''
                       ).replace("+",''
                       ).replace("~",''
                       ).replace("`",''
                       ).replace("´",''
                       ).replace(">",''
                       ).replace("<",''
                       ).replace(":",''
                       ).replace(";",''
                       ).replace(",",''
                       ).replace("!",''
                       ).replace("¡",''
                       ).replace("?",''
                       ).replace("¿",''
                       ).replace("|",''
                       )

def quality(row,min_ratio=95,check_ratio=90,min_translation=60,
                                            penalty_partial_ratio=-5,
                                            penalty_token_set_ratio=-5,
                                            penalty_partial_token_set_ratio=-10,
                                            DEBUG=False):
    '''
    Similarity check for article titles. 
    Assign a quality number to the comparision of two titles: 
      * `title` and `title_gs`
    If the similarity is less than min_ratio check also:
      * `journal` and `journal_gs`
      * `volume` and `volume_gs`
      * `year` and `year_gs`
    As a final resource compare the two titles after an english translation
      * Only if both journal are sufficiently similar and 
        either the volume or the year are the same
    '''
    if not isinstance( row['title_gs'],str):
        return 0
    if not row['title_gs']:
        return 0
    qr=fuzz.ratio(partialdecode(row['title']),partialdecode(row['title_gs']))
    #At least a subset of the journal name is sufficiently similar
    qjr=fuzz.partial_token_set_ratio(partialdecode(row['journal']),partialdecode(row['journal_gs']))
    if qjr<min_ratio:
        qjr=fuzz.partial_token_set_ratio(fulldecode(row['journal']),fulldecode(row['journal_gs']))

    # High similarity match
    if qr>min_ratio:
        if DEBUG: print('1:')                    
        return qr#,'1:'
    #Additional checks required from here on
    if qr>=check_ratio:
        # Subset of jourla or exact volume or year
        if qjr>=min_ratio:
            if DEBUG: print('2:')                        
            return qr#,'2:'
        if str(row['volume'])==str(row['volume_gs']):
            return qr
        if str(row['year'])==str(row['year']):
            if DEBUG: print('3:')                        
            return qr#,'3:'
        
    #Partial ratio: one title is included in the other    
    #with min_ratio
    qpr=fuzz.partial_ratio(partialdecode(row['title']),partialdecode(row['title_gs']))
    if qpr>=min_ratio:
        if qjr>=min_ratio or str(row['volume'])==str(row['volume_gs']) or str(row['year'])==str(row['year']):
            if DEBUG: print('4:')            
            return qpr+penalty_partial_ratio#,'4:',qpr
    
    #Stricter journal match    
    qjr=fuzz.partial_ratio(partialdecode(row['journal']),partialdecode(row['journal_gs']))
    if qjr<min_ratio:
        qjr=fuzz.partial_ratio(fulldecode(row['journal']),fulldecode(row['journal_gs']))        
    #Token set ratio
    qtser=fuzz.token_set_ratio(partialdecode(row['title']),partialdecode(row['title_gs']))
    #with min_ratio
    if qtser>=check_ratio:
        if qjr>=min_ratio and ( str(row['volume'])==str(row['volume_gs']) or str(row['year'])==str(row['year_gs']) ):
            if DEBUG: print('5:')            
            return qtser+penalty_token_set_ratio#,'5:',qtser
        
    qptser=fuzz.partial_token_set_ratio(partialdecode(row['title']),partialdecode(row['title_gs']))
    if (qr>=min_translation or qpr>=min_translation or qtser>=min_translation) and qptser>min_ratio:
        if (qjr>=min_ratio and 
            (str(row['volume'])==str(row['volume_gs']) or 
             str(row['year'])==str(row['year_gs'])
           )):
            if DEBUG: print('7:')            
            return max(qr,qpr)+penalty_partial_token_set_ratio#,'7:',max(qr,qpr)
    #with min_translation
    qtsor=fuzz.token_sort_ratio(partialdecode(row['title']),partialdecode(row['title_gs']))
    if (qpr>=min_translation or qtser>=min_translation) and qtsor> min_translation and qptser>min_translation:
        if (qjr>=min_ratio and 
            str(row['volume'])==str(row['volume_gs']) and
            str(row['year'])==str(row['year_gs'])
           ):
            if DEBUG: print('8:')            
            return qpr+penalty_partial_token_set_ratio#,'8:',qpr
                
    #Without Translation    
    if (qjr>=min_ratio and 
            str(row['volume'])==str(row['volume_gs']) and
            str(row['year'])==str(row['year_gs'])
           ):
        if qr>=min_translation:
            if DEBUG: print('9:')            
            return qr+penalty_partial_token_set_ratio#,'9:',qjr

    #With Translation!. Only use as last resource (very slow...)
    tpr=0
    if (  qptser>min_translation and 
          qjr>=min_ratio and 
        (str(row['volume'])==str(row['volume_gs']) and 
                 str(row['year'])==str(row['year_gs'])
        )  ):
        try:
            s1=translator.translate( str(row['title']) )
            s2=translator.translate( str(row['title_gs']) )
            s1=partialdecode(s1)
            s2=partialdecode(s2)
            if s1.find('mymemory warning:')==-1 and s2.find('mymemory warning:')==-1:
                print(s1,s2)
                tpr=fuzz.partial_ratio(s1,s2)
                if tpr>check_ratio:
                    if DEBUG: print('10:')
                    return tpr+penalty_partial_token_set_ratio#,'10:',tpr
        except:
            pass
        
    return 0#qr,qpr,qtser,qtsor,qptser,tpr,qjr

In [3]:
with open(r"data/consorcia_unique.json", "r") as read_file:
    data = json.load(read_file)

In [4]:
cs=pd.DataFrame(data)

In [5]:
with open(r"data/scholar-fixes.json", "r") as read_file:
    data = json.load(read_file)

In [6]:
gs=pd.DataFrame(data)
del(data)

In [7]:
gs.columns

Index(['_id', 'abstract', 'article_id', 'author', 'bibtex', 'cid', 'cites',
       'cites_link', 'country', 'doi', 'issue', 'journal', 'language',
       'old_title', 'pages', 'pdf', 'profiles', 'publisher', 'quality_author',
       'quality_title', 'ref', 'rp', 'timestamp', 'title', 'volume', 'year'],
      dtype='object')

In [8]:
cs.shape

(204536, 5)

In [9]:
cs.loc[0]

doi                                                                                        10.1007/s10856-006-0536-5
journal                                                          Journal of Materials Science: Materials in Medicine
title      cytotoxicity study of plasma-sprayed hydroxyapatite coating on high nitrogen austenitic stainless steels.
volume                                                                                                            17
year                                                                                                            2006
Name: 0, dtype: object

In [10]:
gs=gs.rename({'journal':'journal_gs','volume':'volume_gs','year':'year_gs','title':'title_gs'},axis='columns')

In [11]:
gs.columns

Index(['_id', 'abstract', 'article_id', 'author', 'bibtex', 'cid', 'cites',
       'cites_link', 'country', 'doi', 'issue', 'journal_gs', 'language',
       'old_title', 'pages', 'pdf', 'profiles', 'publisher', 'quality_author',
       'quality_title', 'ref', 'rp', 'timestamp', 'title_gs', 'volume_gs',
       'year_gs'],
      dtype='object')

In [12]:
cs.columns

Index(['doi', 'journal', 'title', 'volume', 'year'], dtype='object')

In [13]:
csgs=cs.merge(gs,on='doi',how='left')

## 1) split DataFrame into exact match

In [14]:
exact_match=( csgs['title'].apply(lambda s: fulldecode(s) if isinstance(s,str) else s) ==
               csgs['title_gs'].apply(lambda s: fulldecode(s) if isinstance(s,str) else s)
           )
csgsy=csgs[exact_match].reset_index(drop=True)
csgsp=csgs[~exact_match].reset_index(drop=True)

In [15]:
csgsy['Q']=100

In [16]:
csgsy.shape

(134794, 31)

In [17]:
csgsp.shape

(76248, 30)

In [18]:
filter_columns=['doi','title','journal','volume','year','title_gs','journal_gs','volume_gs','year_gs','Q']

## 2) apply `quality` function in each row

In [174]:
csgsp['Q']=csgsp.apply(quality,axis='columns')

In [None]:
csgsp

In [183]:
csgst=csgsy.append(csgsp).reset_index(drop=True)

In [185]:
csgst[csgst['Q']==0].shape

(43863, 31)

In [186]:
csgst[csgst['Q']==0][filter_columns]['doi'].dropna().shape

(11514,)

In [199]:
csgst[filter_columns].to_json('data/consortia_unique_quality.json',orient='records',force_ascii=False)

In [200]:
with open(r"data/consortia_unique_quality.json", "r") as read_file:
    data = json.load(read_file)

In [203]:
ls -lh data/consortia_unique_quality.json

-rw-r--r-- 1 restrepo restrepo 82M Jul 14 00:31 data/consortia_unique_quality.json


In [206]:
#cs=pd.DataFrame(data)
cs=pd.read_json('data/consortia_unique_quality.json')
del(data)

In [205]:
cs[:1]

Unnamed: 0,Q,doi,journal,journal_gs,title,title_gs,volume,volume_gs,year,year_gs
0,100,10.1007/s10856-006-0536-5,Journal of Materials Science: Materials in Medicine,Journal of Materials Science: Materials in Medicine,cytotoxicity study of plasma-sprayed hydroxyapatite coating on high nitrogen austenitic stainless steels.,Cytotoxicity study of plasma-sprayed hydroxyapatite coating on high nitrogen austenitic stainless steels,17,17,2006,2006


In [166]:
kk=csgsp[csgsp['doi']=='10.1016/0031-0182(78)90077-9']
kk.apply(quality,axis='columns',DEBUG=True)

94    0
dtype: int64

In [157]:
kk[filter_columns]

Unnamed: 0,doi,title,journal,volume,year,title_gs,journal_gs,volume_gs,year_gs,Q
94,10.1016/0031-0182(78)90077-9,prehistoric man of the sabana de bogotá: data for an ecological prehistory,"Palaeogeography, Palaeoclimatology, Palaeoecology",25,1978,Stratigraphy and environments of the Upper Quaternary of the El Abra corridor and rock shelters (Colombia),"Palaeogeography, Palaeoclimatology, Palaeoecology",25,1978,89.0


## Example translation

In [23]:
import unidecode as ud
from fuzzywuzzy import fuzz
from translate import Translator #pip install translate
translator= Translator(from_lang="es",to_lang="en",email='restrepo@udea.edu.co')

def partialdecode(s):
    s=str(s)
    return ud.unidecode(s.lower().strip())

s="primer registro del pez vaca cola amarilla hypoplectrus chlorurus (serranidae) en el caribe continental colombiano" 
s1= translator.translate(s)
s='First record of yellowtail hamlet Hypoplectrus chlorurus (Serranidae) in the Colombian continental Caribbean'
s2=translator.translate(s)
s1=partialdecode(s1)
s2=partialdecode(s2)
fuzz.partial_ratio(s1,s2)

100

In [27]:
from translate import providers

In [32]:
p=providers.MyMemoryProvider(from_lang="es",to_lang="en",email='restrepo@udea.edu.co')

In [33]:
p.get_translation('hola mundo')



In [35]:
import time

In [39]:
import goslate

text = "Hello World"

gs = goslate.Goslate()
s1="primer registro del pez vaca cola amarilla hypoplectrus chlorurus (serranidae) en el caribe continental colombiano" 
s2='First record of yellowtail hamlet Hypoplectrus chlorurus (Serranidae) in the Colombian continental Caribbean'
translatedText = gs.translate(s1,'en')
time.sleep(0.1)

print(translatedText)

HTTPError: HTTP Error 429: Too Many Requests

## Example `fuzz`

In [697]:
s1='Revista Integración'
s2='Revista Integración, temas de matemáticas'
s1=partialdecode(s1)
s2=partialdecode(s2)

In [698]:
fuzz.ratio(s2,s1)

63

In [699]:
fuzz.partial_ratio(s1,s2)

100

In [700]:
fuzz.token_sort_ratio(s1,s2)

64

In [701]:
fuzz.token_set_ratio(s1,s2)

100

In [702]:
fuzz.partial_token_set_ratio(s1,s2)

100

In [599]:
fuzz.partial_token_sort_ratio(s1,s2)

58