<a href="https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/WOS_SCI_SCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WOS+SCI+SCP+PTJ+CTR

Merge the bibliographic datasets for 
* Web of Science (WOS), 
* Scielo (SCI)
* Scopus  (SCP)
* Puntaje (UDEA)
* Center (CTR)
of the scientific articles of Universidad de Antioquia

For details see [merge.ipynb in Colaboratory](https://colab.research.google.com/github/restrepo/medicion/blob/master/cienciometria/merge.ipynb)

Implementation:
The input pure o partially processed database with WOS-SCI-SCP and may be some UDEA entries from PTJ and Center information with additional data about the Full Name UDEA authors.

Addtionaly UDEA entries can be captured from:
1. A previous WOS-SCI-SCP-UDEA
2. A Data Base with a column with full names (FULL LAST NAMES NAMES, e.g VALDEZ GÚZMAN JUAN ALBERTO) and a list of author Aliases in WOS format (Lastname, Name, e.g Valdez-Gúzman, J.A.) with a list of registered affiliations. TODO: Test
3. The database from Puntaje (UDEA). 

In [None]:
import os
VERSION='NEW'
if os.getcwd()=='/content':
    !pip install openpyxl xlrd wosplus > /dev/null

In [77]:
# Delete UDEA_columns and start from schratch
REBUILD=False
MERGE_WITH_TRAINED=False

## functions

In [78]:
import pandas as pd
import wosplus as wp
pd.set_option('display.max_colwidth',200)

In [79]:
# %load wos_sci_scp_ptj_ctr.py

In [80]:
from wos_sci_scp_ptj_ctr import *

##  Configure public links of  files in Google Drive
* If it is a Google Spreadsheet the corresponding file is downloaded as CSV
* If it is in excel or text file the file is downloaded  directly

To define your  own labeled IDs for public google drive files edit the next cell:

In [81]:
%%writefile drive.cfg
[FILES]
WOS_SCI_SCP_PTJ_CTR.json.gz=19E1C1kRk4I0V3uXojqko8-NEicWaPp1j
WOS_SCP_UDEA_SJR_SIU.xlsx=0BxoOXsn2EUNIQ3R4WDhvSzVLQ2s
Base_de_datos_investigadores_Definitiva.csv=12oalgUeKhpvzkTPBP8pXCeHTrF-KO223dy9ov9w9QKs
UDEA_authors_with_WOS_info.json=1o1eVT4JD0FMMICq_oxrTJOzWh47veBMw
produccion_fecha_vig_2003_2018.xlsx=1WbtX4K__TTLxXRjuLvqUYz9tuHCIlS5v
UDEA_WOS_SCI_SCP_PTJ.json=1OkVytKbxJwGvXZDkynkSoUDtkUOTaT4A

Overwriting drive.cfg


##  Load data bases

In [82]:
affil='Univ Antioquia'
drive_files=wp.wosplus('drive.cfg')

#### DEBUG: if False stop in UDEA_PTJ!!!!

if os.path.exists(UDEAjsonfile):
    UDEA=               pd.read_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)
else:    
    UDEA=drive_files.read_drive_json(UDEAjsonfile,compression='gzip').reset_index(drop=True)

In [83]:
RECOVER=True #False for test purposes
UDEAjsonfile='WOS_SCI_SCP_PTJ_CTR.json.gz'
#Test purposes
#UDEAjsonfile='UDEA_WOS_SCI_SCP_PTJ.json'
if RECOVER:
    #Requieres latest wosplus!
    tmp=drive_files.load_biblio(UDEAjsonfile,compression='gzip')# TODO CHANGE FOR LAST VERSION IN GOOGLE DRIVE
else:
    tmp=drive_files.load_biblio('UDEAtmp.json')
    #drive_files.load_biblio(
    #  'https://raw.githubusercontent.com/restrepo/medicion/master/cienciometria/data/UDEAtmp300.json'
    #    )#Test: 199+1=200 found
    
UDEA=drive_files.biblio['WOS'].reset_index(drop=True)
#DEBUG
#UDEA=UDEA.sample(300,replace=True).reset_index(drop=True) #Test: 77 found
#tmp=drive_files.load_biblio('Sample_WOS.xlsx')



In [84]:
if REBUILD:
    UDEA=clean_institutional_columns(UDEA,prefix='UDEA',Tipo='Tipo')
    UDEA['UDEA_authors']=None


In [85]:
for t in UDEA.Tipo.unique():
    print( '{}:{}'.format( t, UDEA[ UDEA.Tipo==t].shape[0] ) )

WOS_SCP_UDEA:1043
SCI:2258
WOS:1541
SCP:2112
WOS_SCP:4777
WOS_SCI_SCP:557
SCI_SCP:1116
WOS_UDEA:343
SCI_SCP_UDEA:500
SCI_UDEA:634
SCP_UDEA:461
WOS_SCI_SCP_UDEA:211
WOS_SCI_UDEA:47
WOS_SCI:100


In [86]:
UDEA.shape

(15700, 181)

## Load trained old data 

### Merge WOS_SCP_SCI with trained data set PTJ_CTR

Merge requires split in DI and TI


15700 (15700, 152)
(7072, 169) (8628, 169)

In [87]:
if MERGE_WITH_TRAINED:
    if os.path.exists('WOS_SCP_UDEA_SJR_SIU.xlsx'):
        SIU=pd.read_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')
    else:    
        SIU=drive_files.read_drive_excel('WOS_SCP_UDEA_SJR_SIU.xlsx')
        
    UDEA,SIU=fill_trained_data(UDEA,SIU)#TODO: Remnove SIU

In [88]:
if MERGE_WITH_TRAINED:
    UDEA.to_json('UDEAtmp.json')
    RECOVER=False
    if RECOVER:
        UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [89]:
if 'UDEA_autores' in UDEA.columns and UDEA[UDEA['UDEA_autores']==''].shape[0]:
    UDEA['UDEA_autores']=UDEA['UDEA_autores'].apply(lambda s: pd.np.nan if type(s)==str and s=='' else s)

In [90]:
if 'UDEA_autores' in UDEA.columns:
    print(UDEA[UDEA['UDEA_autores']==''].shape[0],UDEA['UDEA_autores'].dropna().shape[0])

0 10311


# Puntaje

UDEA

In [91]:
qq=UDEA.copy()

In [92]:
drive_files.biblio['WOS']=qq
drive_files.biblio['WOS'].shape

(15700, 181)

In [93]:
tmp=drive_files.load_biblio('produccion_fecha_vig_2003_2018.xlsx',prefix='UDEA')

In [94]:
pp= drive_files.biblio['UDEA'].copy()

In [95]:
drive_files.biblio['UDEA']=pp

In [96]:
df=merge_puntaje(drive_files)

(32581, 24)
va1 0 0
......................................................

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


7258 : 5388 + 1 = 5389
va2 0 5388
......................................................

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


7258 : 5388 + 0 = 5388
va3 0 5388
......................................................

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


va4 0 5388
7258 : 5388 + 0 = 5388
(1, 174) + (5388, 152) = 5389


In [97]:
#TODO: Check why not zero
if 'UDEA_autores' in df.columns:
    print(0,'=',df[df['UDEA_autores']==''].shape[0],'; found:',df['UDEA_autores'].dropna().shape[0])

0 = 0 ; found: 10312


In [98]:
#df['UDEA_autores'].apply(lambda s: pd.np.nan if type(s)==str and s=='' else s).dropna().shape

In [99]:
UDEA=df.copy()

In [100]:
UDEA.shape

(15700, 181)

## Fill C1 for not WOS entries in WOS format and extract  affiliation from C1

In [101]:
#Fill from SCI_C1
UDEA['C1']=SCI_C1_to_C1(UDEA)

In [102]:
#Fill from SCP_C1='SCP_Authors with affiliations
UDEA['C1']=SCP_Authors_with_affiliations_to_C1(UDEA)

In [103]:
UDEA[UDEA['C1'].isnull()].shape

(0, 181)

In [104]:
UDEA[UDEA.Tipo=='WOS'].reset_index(drop=True).C1.loc[0]

'[Bedoya Hernandez, Mauricio Hernando] Univ Antioquia, Educ, Medellin, Colombia.\n[Bedoya Hernandez, Mauricio Hernando] Univ Antioquia, Psicol, Medellin, Colombia.\n[Bedoya Hernandez, Mauricio Hernando] Univ Antioquia, Ciencias Sociales, Medellin, Colombia.\n[Bedoya Hernandez, Mauricio Hernando] Univ Antioquia, Medellin, Colombia.\n'

In [105]:
UDEA['authors_WOS']=UDEA.C1.apply(lambda x: x.split('\n') if x else x).apply(
    lambda x:   [y.replace('[','').replace('] ','; ') for y in x if y.find(affil)>-1 ] if x else x ).apply(
     lambda x: get_author_info(x) if x else x)

# Improve normalization: remove C1s with only affiliation (from Scielo)
UDEA['authors_WOS']=UDEA['authors_WOS'].apply( 
    lambda x: [d for d in x if d.get('WOS_author').find(affil)==-1] if type(x)==list else x )

In [106]:
UDEA[UDEA.Tipo=='SCP'].reset_index(drop=True).loc[0].authors_WOS

[{'WOS_author': 'Mesa-Vanegas, A. M.',
  'affiliation': ['Grupo de Investigación Malaria, Sede de Investigación Universitaria(SIU), Facultad de Medicina, Univ Antioquia, Medellín, Colombia'],
  'i': 0},
 {'WOS_author': 'Cardona, F.',
  'affiliation': ['Grupo de Estudios Botánicos, Herbario Univ Antioquia, Medellín, Colombia'],
  'i': 1},
 {'WOS_author': 'Sáez-Vega, J. A.',
  'affiliation': ['Grupo Química de Plantas Colombianas, Instituto de Química, Univ Antioquia, Medellín, Colombia'],
  'i': 2},
 {'WOS_author': 'Trujillo, S. B.',
  'affiliation': ['Grupo de Investigación Malaria, Sede de Investigación Universitaria(SIU), Facultad de Medicina, Univ Antioquia, Medellín, Colombia'],
  'i': 3}]

## Prepare UDEA columns

In [107]:
#TODO: Remove from fill_trained_data(..)
if 'UDEA_autores' in UDEA.columns:
    UDEA['UDEA_autores']=UDEA['UDEA_autores'].apply(lambda s: re.sub('\s+',' ',s) if type(s)==str else s)
    UDEA['UDEA_authors']=UDEA['UDEA_autores'].apply(lambda s: s.split(';') if type(s)==str else s).apply(
                           lambda l: [{'full_name':y} for y in l ] if type(l)==list else l)

## Merge with official researcher list: PTJ

In [108]:
AU=drive_files.read_drive_excel('Base_de_datos_investigadores_Definitiva.csv')

In [109]:
UPDATE_UDEA_authors_with_AU=True
if (UDEA['UDEA_authors'].dropna().shape[0] and 
    UPDATE_UDEA_authors_with_AU):
    kkn=UDEA.copy()
    kkn=update_institutional_authors(kkn,AU)
    print(kkn.shape,UDEA.shape)
    UDEA=kkn.copy()

0
1
2
3
4
5
6
7
8
9
10
11
(15700, 181) (15700, 181)


Quality check

In [110]:
key_contains_in_list_of_dictionaries(UDEA,'Restrepo, D',column='authors_WOS',key='WOS_author').loc[1:2]

1    [{'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'i': 0, 'WOS_author': 'Restrepo, Diego'}]
2       [{'affiliation': ['Univ Antioquia, Inst Fis, Medellin 1226, Colombia.'], 'i': 0, 'WOS_author': 'Restrepo, D.'}]
Name: authors_WOS, dtype: object

In [111]:
if UPDATE_UDEA_authors_with_AU:
    UDEA.to_json('UDEAtmp.json')
    RECOVER=False
    if RECOVER:
        UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Add `UDEA.authors_WOS` info* within `UDEA.UDEA_authors` data**
(\*) obtained from `UDEA.C1`

(\*\*) Obtained from [puntaje trained old UDEA data](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-trained-data-set) and the [official researcher list](./WOS_SCI_SCP_PTJ_GS_LNS.ipynb#Merge-with-official-researcher-list)

Obtain name parts and initials from full name in `UDEA_authors` dictionary and update `UDEA_authors` with them

In [112]:
import sys
if 'UDEA_authors' not in UDEA.columns and REBUILD==False:
    sys.exit('Make MERGE_WITH_TRAINED True and run again')

In [113]:
# Obtain spanish name parts from full name
dictupdatetmp=UDEA['UDEA_authors'].apply(lambda x: [y.update( 
                split_full_names(y,full_name='full_name')  ) if not pd.isnull(
                y.get('full_name')) else y for y in x] 
                                   if type(x)==list 
                                   else x)

In [114]:
kk=UDEA['authors_WOS'].combine( UDEA['UDEA_authors'], func=combinewos )

In [115]:
UDEA['UDEA_authors'].loc[0]

[{'INICIALES': 'D. S.',
  'NOMBRES': 'Del Socorro',
  'PRIMER APELLIDO': 'Lopez',
  'SEGUNDO APELLIDO': 'Gomez',
  'WOS_affiliation': ['Univ Antioquia, Fac Ciencias Econ, Bogota, Colombia.'],
  'WOS_author': ['Lopez Gomez, Maria del Socorro'],
  'full_name': 'LOPEZ GOMEZ MARIA DEL SOCORRO'}]

In [116]:
UDEA.to_json('UDEAtmp.json')

### Load output restuls of previous Cell runs

In [117]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

## Build a single profile for all

### Fill UDEA_authors with WOS_author info

Obtain UDEA_authors DataFrame: `aunly`

In [118]:
aunly=DataFrame_authors(UDEA)

TABORDA AGUDELO FARLAN


In [119]:
if not aunly.empty:
    aunly.to_json('UDEA_authors_with_WOS_info.json')

In [120]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [121]:
UDEA.shape

(15700, 181)

In [122]:
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [123]:
aunly.shape

(1273, 2)

(800, 2)

## Merge UDEA with authors

In [124]:
UDEA['UDEA_authors']=UDEA['UDEA_authors'].apply(lambda l:fill_full_wos_author_info(l,aunly) )

In [125]:
if UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA.to_json('UDEAtmp.json')

In [126]:
RECOVER=False
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [127]:
UDEA.shape

(15700, 181)

In [128]:
kk=UDEA.authors_WOS.combine(UDEA.UDEA_authors,func=lambda x,y: get_UDEA_authors(x,y,aunly))

In [129]:
UDEA.UDEA_authors.dropna().shape

(10312,)

(7072,)

(10960,)

In [130]:
UDEA['UDEA_authors']=kk

In [131]:
UDEA.UDEA_authors.dropna().shape,UDEA.shape

((10900,), (15700, 181))

((8446,), (15700, 169))

((10963,), (15704, 181))

In [132]:
aunly.shape

(1273, 2)

(1461, 2)

In [133]:
if not aunly.empty:
    print(aunly.drop_duplicates('tmp_author').shape)

(1273, 2)


In [134]:
if not aunly.empty:
    aunly.to_json('UDEA_authors_with_WOS_info.json')

In [135]:
RECOVER=False
if RECOVER:
    if os.path.exists('UDEA_authors_with_WOS_info.json' ):
        aunly=pd.read_json('UDEA_authors_with_WOS_info.json')
    else:
        aunly=drive_files.read_drive_json('UDEA_authors_with_WOS_info.json')

In [136]:
if UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA.to_json('UDEAtmp.json')

In [137]:
if RECOVER:
    UDEA=pd.read_json('UDEAtmp.json').reset_index(drop=True)

In [138]:
UDEA.to_json('WOS_SCI_SCP_PTJ_CTR.json.gz',compression='gzip')

In [139]:
if 'UDEA_autores' in UDEA.columns:
    print(UDEA[UDEA['UDEA_autores']==''].shape[0],UDEA['UDEA_autores'].dropna().shape[0])

0 10312


In [140]:
if 'UDEA_authors' in UDEA.columns:
    print(UDEA[UDEA['UDEA_authors']==''].shape[0],UDEA['UDEA_authors'].dropna().shape[0])

0 10900


In [141]:
print 1

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-141-c94594b6b28f>, line 1)

## Add PTJ directly from `UDEA_authors` with `WOS_info` DataFrame

In [445]:
UDEA=pd.read_json('WOS_SCI_SCP_PTJ_CTR.json.gz',compression='gzip').reset_index(drop=True)

In [144]:
def build_institutional_authors(x,author_df,x_author_key='WOS_author',x_affiliation_key='affiliation',
                                        author_key='WOS_author',
                                        affiliation_key='WOS_affiliation'):
    if type(x)!=list:
        return None
    ll=[]
    for j in range(len(x)):
        
                                #author_WOS→affiliation always have single affiliation
        kk=find_author_affiliation(x[j].get(x_author_key),x[j].get(x_affiliation_key)[0],
                                        author_df=author_df,
                                        author_key=author_key,
                                        affiliation_key=affiliation_key,
                                        ratio=0.9 )
        if kk:
            ll.append(kk)
    if not ll:
        ll=None
    return ll

In [145]:
if not UDEA['UDEA_authors'].dropna().shape[0]:
    UDEA['UDEA_authors']=UDEA.authors_WOS.apply(lambda l: build_institutional_authors(l,aunly) )

## Experimental: Change similirity by merge search

In [466]:
UDEA_YES=UDEA[~UDEA['UDEA_authors'].isna()].reset_index(drop=True)

In [467]:
UDEA_YES.shape

(10902, 181)

In [468]:
import fuzzywuzzy.process as fwp
from fuzzywuzzy import fuzz
UDEA_NOT=UDEA[UDEA['UDEA_authors'].isna()].reset_index(drop=True)
df2=pd.DataFrame( list( aunly['UDEA_authors'].values ) )
df2['UDEA_authors']=aunly['UDEA_authors']
contents=df2[['WOS_author','WOS_affiliation','UDEA_authors']].reset_index(drop=True)
contents['WOS_author']=contents['WOS_author'].astype(str)
contents['WOS_affiliation']=contents['WOS_affiliation'].astype(str)

In [469]:
print( UDEA_NOT['authors_WOS'].loc[0][0].get('WOS_author'),
      fwp.extractOne(  UDEA_NOT['authors_WOS'].loc[0][0].get('WOS_author'),
                     contents['WOS_author'],scorer=fuzz.partial_ratio  ) )

Gutierrez, Carlos A. ("['Velez, Carlos A.']", 87, 71)


In [470]:
pr_author_min=95
pr_affiliation_min=60
scorer=fuzz.partial_ratio
df1['TI']=UDEA_NOT[['TI']].copy()
df1['UDEA_authors_raw']='[]'
df1['UDEA_authors_raw']=df1['UDEA_authors_raw'].apply(lambda s: eval(s))
import time

s=time.time()
#intialize result dataframe
for i in range(UDEA_NOT['authors_WOS'].apply(len).max()):
#if True:
    kk=pd.DataFrame( list(  UDEA_NOT['authors_WOS'].str[i].apply(
                lambda d: d if type(d)==dict else {} ).values )).reset_index(drop=True)
    df1['WOS_author' ]=kk['WOS_author' ]
    df1['affiliation']=kk['affiliation']    
    df1=df1.reset_index(drop=True)
    
    df1['mtch']=df1['WOS_author'].apply(lambda s: 
                    list(fwp.extractOne(s,contents['WOS_author'],scorer=scorer ))  
                           if type(s)==str else s)


    # Prepare merge with affiliations
    df1['mtch_0']=df1['mtch'].str[0]

    kk=df1[['mtch_0']].merge( contents,left_on='mtch_0',right_on='WOS_author',how='left')
    nmtch=kk['UDEA_authors'].dropna().shape[0]
    print(nmtch)
    if not nmtch:
        break
    df1['aff']=kk['WOS_affiliation']
    df1['UDEA_authors_j']=kk['UDEA_authors']

    df1['aff_pr']=df1['affiliation'].astype(str).combine( 
                  df1['aff'],func=lambda s,r: scorer(s,r) 
                  if type(s)==str and type(r)==str else 0)
    
    tmp=df1['UDEA_authors_j'].combine(
          df1['mtch'].str[1],func=lambda d,x: d.update({'pr_author':x}) 
          if type(d)==dict else d)
    tmp=df1['UDEA_authors_j'].combine(
          df1['aff_pr'],func=lambda d,x: d.update({'pr_affiliation':x})
          if type(d)==dict else d)
    
    tmp=df1['UDEA_authors_raw'].combine(df1['UDEA_authors_j'],func=
                            lambda l,d: l.append(d) if type(d)==dict else l )
    
df1['UDEA_authors']=df1['UDEA_authors_raw'].apply(lambda l: 
                          [d for d in l if d.get('pr_author')>pr_author_min and 
                              d.get('pr_affiliation')>pr_affiliation_min ] 
                          if type(l)==list else l).apply(
                          lambda l: l if l else None )
print(time.time()-s)

3912
1544
876
426
203
102
54
34
14
5
3
1
1
1
1
1
1
1
1
1
1
310.33805799484253


In [471]:
UDEA_NOT['UDEA_authors']=df1['UDEA_authors']

In [472]:
UDEA_NOT['UDEA_authors'].dropna().shape

(923,)

In [436]:
UDEA=UDEA_YES.append(UDEA_NOT).reset_index(drop=True)

In [438]:
UDEA['UDEA_authors'].dropna().shape

(11825,)

In [292]:
mtch=df1[ pd.np.logical_and( df1['mtch'].str[1]>95,df1['aff_pr']>70 ) 
       ][['TI','UDEA_authors_j','WOS_author','mtch','affiliation','aff','aff_pr']].reset_index(drop=True)

In [None]:
df1['UDEA_authors_fin'].dropna().shape

## Try other approachs

In [474]:
wp.merge_with_close_matches??

In [477]:
%%writefile test.cfg
[FILES]
Sample_WOS.xlsx = 1--LJZ4mYyQcaJ93xBdbnYj-ZzdjO2Wq2
Sample_SCI.xlsx = 1-3a-hguQTk5ko8JRLCx--EKaslxGVscf
Sample_SCP.xlsx = 1-IAWlMdp2U-9L2jvZUio04ub1Ym3PX-H

Writing test.cfg


In [479]:
cib=wp.wosplus('test.cfg')
#cib.Debug=True
cib.load_biblio('Sample_WOS.xlsx')
cib.load_biblio('Sample_SCI.xlsx',prefix='SCI')
cib.load_biblio('Sample_SCP.xlsx',prefix='SCP')

In [558]:
def get_close_matches_Levenshtein(
        word,
        possibilities,
        n=3,
        cutoff=0.6,
        full=False):
    '''Replaces difflib.get_close_matches with faster algortihm based on
       Levenshtein.ratio.
       HINT: Similarity increase significatively after lower() and unidecode()

       Refs: https://en.wikipedia.org/wiki/Levenshtein_distance
    '''
    import pandas as pd
    import Levenshtein
    if isinstance(possibilities, str):
        possibilities = [possibilities]
    rs = pd.DataFrame()
    MATCH = False
    for p in possibilities:
        similarity = Levenshtein.ratio(word, p)
        # print(word,'::',p,similarity)
        # sys.exit()
        if similarity >= cutoff:
            MATCH = True
            rs = rs.append({'similarity': similarity,
                            'match': p}, ignore_index=True)

    if MATCH:
        rs = rs.sort_values(
            'similarity', ascending=False).reset_index(drop=True)
        if full:
            return list(rs['match'][:n].values), list(
                rs['similarity'][:n].values)
        else:
            return list(rs['match'][:n].values)
    else:
        if full:
            return ([], 0)
        else:
            return []

In [592]:
get_close_matches_Levenshtein(
            words[i], possibilities, n=1, cutoff=0, full=True)

(['treatment of an aedes aegypti colony with the cry11aa toxin for 54 generations results in the development of resistance'],
 [1.0])

In [595]:
def get_close_matches_Levenshtein_new(
        word,
        possibilities,
        n=1,
        cutoff=0.6,
        full=False):
    '''cuttof used for backwards compatibility'''
    r=fwp.extract(words[i],possibilities,scorer=scorer,limit=n)
    if full:
        return [t[0] for t in r],[t[1]/100. for t in r]
    else:
        return [t[0] for t in r]

In [597]:
get_close_matches_Levenshtein_new(
            words[i], possibilities, n=2, cutoff=0, full=True)

(['treatment of an aedes aegypti colony with the cry11aa toxin for 54 generations results in the development of resistance',
  'interaction between paracoccidioides brasiliensis conidia and the coagulation system: involvement of fibrinogen'],
 [1.0, 0.47])

In [536]:
def merge_with_close_matches_new(
        left,
        right,
        left_on='ST',
        right_on='UDEA_simple_título',
        left_extra_on='SO',
        right_extra_on='UDEA_nombre revista o premio',
        how='inner',
        n=1,
        cutoff=0.6,
        full=True,
        cutoff_extra=0.6):
    '''For each entry of the column: left_on of DataFrame left (cannot have empty fields),
       try to find the close match inside each row of right DataFrame, by comparing with
       the right_on entry of the row. When a row match is found, the full right row is appended
       to the matched row in the left DataFrame.
       If the similarity between the entries at left_on and right_on is less than 0.8,
       an additional check is performed between the entries left_extra_on and right_extra_on
       of the matched row.

       how implemented: inner and left (Default: inner)
    '''
    import numpy as np
    from unidecode import unidecode
    import pandas as pd
    # import sys #import globally
    # print(left[left_on][0])
    # sys.exit()
    words = left[left_on].str.lower().map(unidecode)
    possibilities = right[right_on].str.lower().map(unidecode)

    joined = pd.DataFrame()
    mi = np.array([])
    for i in left.index:
        if i % 100 == 0:
            print('.', end="")
        joined_series = left.loc[i]
        #joined_series=joined_series.append(pd.Series( {similarity_column:0} ))
        title, similarity = get_close_matches_Levenshtein(
            words[i], possibilities, n=n, cutoff=cutoff, full=full)
        # print(i,words[i],title,similarity) #cutuff 0.6 0.7 0.8 0.85 0.91 0.95
        # sys.exit()
        if title:
            mtch = right[possibilities == title[0]]
            # >=cutoff, e.g 0.65 0.95 0.81 0.86 0.9 0.96
            chk_cutoff = similarity[0]
            crosscheck = cutoff + 0.2  # 0.8 # e.g. 0.8 0.9 0.9 0.9 0.9 0.9
            if crosscheck >= 1:
                # force check if match worst than this (by experience)
                crosscheck = 0.95
            if chk_cutoff < crosscheck:  # e.g 0.65<0.8 0.95~<0.9 0.81~<0.0 0.86<0.9 0.91<~0.9 0.96~<0.9
                if get_close_matches_Levenshtein(unidecode(left[left_extra_on][i].lower()), [unidecode(
                        mtch[right_extra_on][mtch.index[0]].lower())], cutoff=cutoff_extra):  # cutoff=0.6
                    chk_cutoff = crosscheck + 0.1

            if chk_cutoff >= crosscheck:
                joined_series = joined_series.append(mtch.loc[mtch.index[0]])
                if how == 'outer':
                    mi = np.concatenate((mi, mtch.index.values))
                # joined_series[similarity_column]=similarity[0]

            #return joined_series
            if how == 'inner':
                joined = joined.append(joined_series, ignore_index=True)

        if (how == 'left' or 'outer'):
            joined = joined.append(joined_series, ignore_index=True)
    if how == 'outer':
        joined = joined.append(right.drop(
            right.index[list(mi.astype(int))]).reset_index(drop=True))
    return joined

In [537]:
kk=merge_with_close_matches_new(cib.biblio['WOS'],cib.biblio['SCI'].drop('Tipo',axis='columns'),
                            left_on='TI',right_on='SCI_TI',right_extra_on='SCI_SO',how='left')

.