In [2]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz
import jellyfish as jf 
from cleanco import basename
import requests



In [3]:
def data_scrap(url, timeout=60):
    """Download ``url`` into the current working directory.

    The local file name is the last ``/``-separated segment of ``url``; it is
    printed before the download starts.  The response body is streamed in
    1024-byte chunks, so the whole file is never held in memory at once.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds ``requests`` may wait on the server before giving up.

    Returns:
        None.

    Raises:
        requests.HTTPError: The server answered with an error status.  The
            local file is not opened in that case.
        requests.RequestException: Any other transport failure (connection
            error, timeout, ...).
    """
    filename = url.split('/')[-1]
    print(filename)
    # Context managers release both the connection and the file handle even
    # when the transfer fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly rather than saving an HTML error page as the data file.
        response.raise_for_status()
        with open(filename, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip empty keep-alive chunks
                    out_file.write(chunk)
    return

# Fetch the UK sanctions list; data_scrap saves it under the URL's last path
# segment ('UK_Sanctions_List.ods') in the working directory.
data_scrap('https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1059386/UK_Sanctions_List.ods')

UK_Sanctions_List.ods


In [54]:
# Supplier list. Later cells read its 'Name' column (filtered on the 'RU'
# prefix) and its 'Display Name' column (the text that gets matched).
df1 = pd.read_csv('supplier_list.csv')

In [5]:
#supplier_list = df1.loc[df1['Name'].str.startswith('RU')]

In [6]:
# Translated names; the Russia-only matching step reads the
# 'Translated value using Google Translate' column of this frame.
translated_names = pd.read_csv('translate.csv')

In [7]:
# Load the downloaded sanctions spreadsheet with the OpenDocument reader.
# skiprows=1 drops the first sheet row; header=1 then takes column names from
# the second of the remaining rows.
# NOTE(review): presumably these skip banner/title rows above the real header -
# confirm against the current file layout.
df2 = pd.read_excel('UK_Sanctions_List.ods', engine='odf',skiprows=1,header=1)

In [8]:
# Keep only the rows whose 'Unique ID' starts with 'RUS'.
# na=False makes a missing ID count as "no match"; without it the mask would
# contain NaN and .loc refuses to index with a non-boolean mask.
uk_sanctions = df2.loc[df2['Unique ID'].str.startswith('RUS', na=False)]

In [9]:
# From the 'RUS' rows keep only those flagged 'Entity' in the
# 'Individual, Entity, Ship' column; this frame is what both matching
# functions are called with below.
uk_sanction = uk_sanctions.loc[uk_sanctions['Individual, Entity, Ship']=='Entity']

In [159]:
def f_get_modified_str(input_str):
    """Normalise a name for fuzzy comparison.

    The text is trimmed and lower-cased, a space-delimited '&' is spelled out
    as 'and', and every run of ASCII letters/digits is kept as a token.  The
    tokens are returned joined by single spaces, so punctuation and repeated
    whitespace disappear.
    """
    lowered = input_str.strip().lower().replace(' & ', ' and ')
    # Joining the alphanumeric runs already yields single-spaced text with no
    # leading or trailing blanks.
    tokens = re.findall("[a-zA-Z0-9]+", lowered)
    return " ".join(tokens)

# Return text unchanged; decode anything else to text
def to_unicode(obj, encoding='utf-8'):
    """Return ``obj`` as ``str``.

    A ``str`` is handed back untouched.  Any other object is decoded with
    ``obj.decode(encoding, errors='ignore')``, so undecodable bytes are
    dropped instead of raising.
    """
    if isinstance(obj, str):
        return obj
    return obj.decode(encoding, errors='ignore')


def name_match(name1, name2):
    """Token-level fuzzy overlap between two names, as an integer percentage.

    Both names are split on whitespace.  For each direction the share of
    tokens that find a partner (``fuzz.ratio`` above 90) in the other name is
    computed; a token at position ``i`` is only compared with the other
    name's tokens at positions ``i`` and later.  The larger of the two shares
    is returned, truncated to ``int``.  If either name has no tokens the
    result is 0.
    """
    tokens1 = name1.split()
    tokens2 = name2.split()
    if not (tokens1 and tokens2):
        return 0

    def hit_percentage(left, right):
        # Percentage of `left` tokens with a close partner at the same or a
        # later position in `right`; the search stops at the first partner.
        hits = 0.0
        for pos, token in enumerate(left):
            if any(fuzz.ratio(token, other) > 90 for other in right[pos:]):
                hits += 1.0
        return 100 * (hits / len(left))

    forward = hit_percentage(tokens1, tokens2)
    backward = hit_percentage(tokens2, tokens1)
    return int(max(forward, backward))

# Function to calculate fuzzy match score for two given strings
def f_name_match_score(str1, str2):
    """Return the best of three similarity scores for two names.

    Both names are normalised with ``f_get_modified_str`` and then compared
    with:

    * ``fuzz.ratio``
    * ``fuzz.token_sort_ratio``
    * a Levenshtein similarity, ``(1 - distance / longer_length) * 100``

    The maximum of the three is returned.  0 is returned when either raw
    input has at most one character, or when either name is empty after
    normalisation (e.g. it consisted only of punctuation).
    """
    # Names of at most one raw character are treated as unmatchable.
    if len(str1) <= 1 or len(str2) <= 1:
        return 0

    # Transform name, then convert to unicode to avoid error
    str1 = to_unicode(f_get_modified_str(str1))
    str2 = to_unicode(f_get_modified_str(str2))

    # A name that normalises to nothing cannot match anything.  Bailing out
    # here also avoids dividing by a zero length below.
    if not str1 or not str2:
        return 0

    # Compute match scores
    score1 = fuzz.ratio(str1, str2)
    score2 = fuzz.token_sort_ratio(str1, str2)
    distance = jf.levenshtein_distance(str1, str2)
    score3 = (1 - (distance / max(len(str1), len(str2)))) * 100
    return max(score1, score2, score3)

In [160]:
# Legal-form designators stripped from names before matching.  The \b anchors
# restrict removal to whole words, so 'AO' no longer eats the end of 'MACAO'
# and 'IP' no longer eats the middle of 'SHIPPING'.  Matching stays
# case-sensitive.
LEGAL_FORM_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(term) for term in [
        'LLC', 'PJSC', 'CJSC', 'IP', 'GUP', 'OJSC', 'JSC', 'OOO', 'OO',
        'Limited Liability Company', 'Open Joint Stock Company',
        'Joint-Stock Company', 'Public Joint Stock Company',
        'Joint Stock Company', 'Joint-stock company', 'AO',
    ]) + r')\b'
)


def clean_company_legal_entities(entity_name):
    """Strip legal-form designators and punctuation from a company name.

    Steps:
      1. Remove the designators listed in ``LEGAL_FORM_RE`` where they occur
         as whole words.
      2. Pass the result through ``cleanco.basename`` twice.
      3. Keep only runs of ASCII letters/digits, joined by single spaces.

    Args:
        entity_name: The raw name.  ``None`` and float NaN yield ``''``; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned name (original letter case preserved), possibly ``''``.
    """
    if not isinstance(entity_name, str):
        # `x != x` is true only for NaN, which is what an empty cell looks
        # like when a pandas column is iterated.
        if entity_name is None or entity_name != entity_name:
            return ''
        entity_name = str(entity_name)
    if not entity_name:
        return ''

    entity_name = LEGAL_FORM_RE.sub('', entity_name)
    # Using basename twice for better clean
    entity_name = basename(entity_name)
    entity_name = basename(entity_name)
    # Remove all other special chars; the join also collapses extra spaces
    return " ".join(re.findall("[a-zA-Z0-9]+", entity_name))

In [161]:
# Entity matching for Russia only
def entity_matching_for_rus(translated_list, uk_sanction_list,
                            output_path='top_Match_rus2.csv', top_n=5):
    """Write the best sanction-list matches for each translated name to CSV.

    Every value of ``translated_list['Translated value using Google Translate']``
    is cleaned with ``clean_company_legal_entities`` and scored with
    ``f_name_match_score`` against every distinct cleaned value of
    ``uk_sanction_list['Name 6']``.  The ``top_n`` highest-scoring candidates
    are stored as ``Match1/Score1 ... Match<top_n>/Score<top_n>``.  Rows are
    indexed by the cleaned name ('Supplier Name'); when two inputs clean to
    the same name only the first is kept.

    Args:
        translated_list: Table with a 'Translated value using Google
            Translate' column.
        uk_sanction_list: Table with a 'Name 6' column.
        output_path: Destination CSV file.
        top_n: Number of match/score pairs per row.  If fewer distinct
            candidates exist, the remaining cells are left empty.

    Returns:
        None.  The result is written to ``output_path``.
    """
    columns = []
    for rank in range(1, top_n + 1):
        columns += ['Match%d' % rank, 'Score%d' % rank]

    # Cleaning a sanction name does not depend on the supplier, so do it once.
    # dict.fromkeys de-duplicates while keeping first-seen order, which is
    # also the tie-break order of the stable sort below.
    candidates = list(dict.fromkeys(
        clean_company_legal_entities(name)
        for name in uk_sanction_list['Name 6']
        if pd.notna(name)
    ))

    rows = {}
    for raw_name in translated_list['Translated value using Google Translate']:
        if pd.isna(raw_name):
            continue  # empty cell: nothing to match
        supplier = clean_company_legal_entities(raw_name)
        if supplier in rows:
            continue  # keep the first occurrence only
        ranked = sorted(
            ((cand, f_name_match_score(supplier, cand)) for cand in candidates),
            key=lambda kv: kv[1],
            reverse=True,
        )[:top_n]
        row = {}
        for rank, (match, score) in enumerate(ranked, start=1):
            row['Match%d' % rank] = match
            row['Score%d' % rank] = score
        rows[supplier] = row

    # Build the frame in one go; dtype=object keeps every score exactly as it
    # was computed (int or float) when written out.
    df = pd.DataFrame(
        list(rows.values()),
        index=pd.Index(list(rows), name='Supplier Name'),
        columns=columns,
        dtype=object,
    )
    df.to_csv(output_path)

In [162]:
# Match the translated names against the 'Entity' rows of the sanctions list;
# the result is written to a CSV file by the function.
entity_matching_for_rus(translated_names, uk_sanction)

In [163]:
# Matching for rest of the countries: drop suppliers whose 'Name' starts
# with 'RU'.  na=False counts a missing 'Name' as "not RU" so that the row is
# kept; without it the mask would contain NaN and the `~` negation would fail.
df1 = df1.loc[~df1['Name'].str.startswith('RU', na=False)]

In [164]:
# Entity matching for Rest countries including Macao
def entity_matching_for_all(supplier_list, uk_sanction_list,
                            output_path='top_match_for_all_countries2.csv',
                            top_n=5):
    """Write the best sanction-list matches for each supplier to CSV.

    Each ``supplier_list['Display Name']`` value is converted with ``str()``,
    cleaned with ``clean_company_legal_entities`` and scored with
    ``f_name_match_score`` against every distinct cleaned value of
    ``uk_sanction_list['Name 6']``.  The ``top_n`` highest-scoring candidates
    are stored as ``Match1/Score1 ... Match<top_n>/Score<top_n>`` next to the
    supplier's ``supplier_list['Name']`` value ('Country Code').  Rows are
    indexed by the cleaned name ('Supplier Name'); when two suppliers clean
    to the same name only the first is kept.

    Args:
        supplier_list: Table with 'Display Name' and 'Name' columns.
        uk_sanction_list: Table with a 'Name 6' column.
        output_path: Destination CSV file.
        top_n: Number of match/score pairs per row.  If fewer distinct
            candidates exist, the remaining cells are left empty.

    Returns:
        None.  The result is written to ``output_path``.
    """
    columns = ['Country Code']
    for rank in range(1, top_n + 1):
        columns += ['Match%d' % rank, 'Score%d' % rank]

    # Cleaning a sanction name does not depend on the supplier, so do it once.
    # dict.fromkeys de-duplicates while keeping first-seen order, which is
    # also the tie-break order of the stable sort below.
    candidates = list(dict.fromkeys(
        clean_company_legal_entities(name)
        for name in uk_sanction_list['Name 6']
        if pd.notna(name)
    ))

    rows = {}
    for display_name, cc in zip(supplier_list['Display Name'],
                                supplier_list['Name']):
        supplier = clean_company_legal_entities(str(display_name))
        if supplier in rows:
            continue  # keep the first occurrence only
        ranked = sorted(
            ((cand, f_name_match_score(supplier, cand)) for cand in candidates),
            key=lambda kv: kv[1],
            reverse=True,
        )[:top_n]
        row = {'Country Code': cc}
        for rank, (match, score) in enumerate(ranked, start=1):
            row['Match%d' % rank] = match
            row['Score%d' % rank] = score
        rows[supplier] = row

    # Build the frame in one go; dtype=object keeps every score exactly as it
    # was computed (int or float) when written out.
    df = pd.DataFrame(
        list(rows.values()),
        index=pd.Index(list(rows), name='Supplier Name'),
        columns=columns,
        dtype=object,
    )
    df.to_csv(output_path)

In [165]:
# Match the remaining (non-'RU') suppliers against the 'Entity' rows of the
# sanctions list; the result is written to a CSV file by the function.
entity_matching_for_all(df1,uk_sanction)