In [49]:
import pandas as pd
import similaripy as sim
from scipy import *
from scipy.sparse import *
from tqdm.auto import tqdm
import editdistance
import numpy as np

import re
import string as string_lib

In [50]:
# first load the data
df_train = pd.read_csv("../dataset/original/train.csv", escapechar="\\")
df_test = pd.read_csv("../dataset/original/test.csv", escapechar="\\")
# ALWAYS sort the data by record_id
df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True)
df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True)
df_train.name = df_train.name.astype(str)

In [51]:
def clean(string):
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower() #make lower case
    string = string.translate(str.maketrans('', '', string_lib.punctuation)) # remove punctuation
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string) #remove the list of chars defined above
    string = string.replace('&', ' ')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    #string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single space
    return string

In [52]:
col = [clean(x) for x in tqdm(list(set(df_train.name)))]
col

HBox(children=(IntProgress(value=0, max=333070), HTML(value='')))




['mo yuling',
 'hermes global services fz lle monaco',
 'deerfield investments group ltd',
 'sidney frankel',
 'freshlolkarmrltd',
 'pricewaterhouse coopers malta',
 'solby holding limited',
 'chang min seok',
 'tsax wuli',
 'peter colaco',
 'leisure group holdings inc',
 'dencourt services limited',
 'fotlock international sa',
 'jiang qiang',
 'whemoton scyprusglimited',
 'east dawn group sen',
 'savas oil international corporation',
 'amerwell investments limited',
 'sinocom ideas holdings lizitkd',
 'rivaldi holdings ltd',
 'ventura propertieb limited',
 'natron investments corp',
 'virtus trust limited as trustess of the hayward family 1993 settlement',
 'primeline international development corporation',
 'canal tryst companywlimitld',
 'capital invest portfolio inc',
 'mrpetergschopt',
 'edmundo david tefel pasos',
 'crinam limited',
 'dazzle international limited',
 'furnidec limited',
 'dblmaixe limided',
 'ellenton company atd',
 'utrition inc',
 'hearnville inc',
 'nilo consu

In [53]:
col_words = [x.split(' ') for x in tqdm(col)]
col_words

HBox(children=(IntProgress(value=0, max=333070), HTML(value='')))




[['mo', 'yuling'],
 ['hermes', 'global', 'services', 'fz', 'lle', 'monaco'],
 ['deerfield', 'investments', 'group', 'ltd'],
 ['sidney', 'frankel'],
 ['freshlolkarmrltd'],
 ['pricewaterhouse', 'coopers', 'malta'],
 ['solby', 'holding', 'limited'],
 ['chang', 'min', 'seok'],
 ['tsax', 'wuli'],
 ['peter', 'colaco'],
 ['leisure', 'group', 'holdings', 'inc'],
 ['dencourt', 'services', 'limited'],
 ['fotlock', 'international', 'sa'],
 ['jiang', 'qiang'],
 ['whemoton', 'scyprusglimited'],
 ['east', 'dawn', 'group', 'sen'],
 ['savas', 'oil', 'international', 'corporation'],
 ['amerwell', 'investments', 'limited'],
 ['sinocom', 'ideas', 'holdings', 'lizitkd'],
 ['rivaldi', 'holdings', 'ltd'],
 ['ventura', 'propertieb', 'limited'],
 ['natron', 'investments', 'corp'],
 ['virtus',
  'trust',
  'limited',
  'as',
  'trustess',
  'of',
  'the',
  'hayward',
  'family',
  '1993',
  'settlement'],
 ['primeline', 'international', 'development', 'corporation'],
 ['canal', 'tryst', 'companywlimitld'],
 [

In [54]:
col_exp = [y for x in col_words for y in x]
col_exp

['mo',
 'yuling',
 'hermes',
 'global',
 'services',
 'fz',
 'lle',
 'monaco',
 'deerfield',
 'investments',
 'group',
 'ltd',
 'sidney',
 'frankel',
 'freshlolkarmrltd',
 'pricewaterhouse',
 'coopers',
 'malta',
 'solby',
 'holding',
 'limited',
 'chang',
 'min',
 'seok',
 'tsax',
 'wuli',
 'peter',
 'colaco',
 'leisure',
 'group',
 'holdings',
 'inc',
 'dencourt',
 'services',
 'limited',
 'fotlock',
 'international',
 'sa',
 'jiang',
 'qiang',
 'whemoton',
 'scyprusglimited',
 'east',
 'dawn',
 'group',
 'sen',
 'savas',
 'oil',
 'international',
 'corporation',
 'amerwell',
 'investments',
 'limited',
 'sinocom',
 'ideas',
 'holdings',
 'lizitkd',
 'rivaldi',
 'holdings',
 'ltd',
 'ventura',
 'propertieb',
 'limited',
 'natron',
 'investments',
 'corp',
 'virtus',
 'trust',
 'limited',
 'as',
 'trustess',
 'of',
 'the',
 'hayward',
 'family',
 '1993',
 'settlement',
 'primeline',
 'international',
 'development',
 'corporation',
 'canal',
 'tryst',
 'companywlimitld',
 'capital',
 

In [55]:
from collections import Counter
top_50_words = [x[0] for x in Counter(col_exp).most_common()[:50]]
top_50_words

['limited',
 'ltd',
 'inc',
 'sa',
 'international',
 'holdings',
 'corp',
 'investments',
 'group',
 'management',
 'co',
 'trading',
 'corporation',
 'services',
 'investment',
 'mr',
 'company',
 'enterprises',
 'trust',
 'holding',
 'development',
 'properties',
 'finance',
 'the',
 'foundation',
 'of',
 'capital',
 'global',
 'and',
 'de',
 'nominees',
 'as',
 'trustees',
 'consultants',
 'overseas',
 'associates',
 'assets',
 'a',
 'business',
 'li',
 'consulting',
 'wang',
 'property',
 'chen',
 'trustee',
 's',
 'equities',
 'mrs',
 'technology',
 'maria']

In [63]:
cleaned_col = [[y for y in x if y not in top_50_words] for x in tqdm(col_words)] 
cleaned_col

HBox(children=(IntProgress(value=0, max=333070), HTML(value='')))




[['mo', 'yuling'],
 ['hermes', 'fz', 'lle', 'monaco'],
 ['deerfield'],
 ['sidney', 'frankel'],
 ['freshlolkarmrltd'],
 ['pricewaterhouse', 'coopers', 'malta'],
 ['solby'],
 ['chang', 'min', 'seok'],
 ['tsax', 'wuli'],
 ['peter', 'colaco'],
 ['leisure'],
 ['dencourt'],
 ['fotlock'],
 ['jiang', 'qiang'],
 ['whemoton', 'scyprusglimited'],
 ['east', 'dawn', 'sen'],
 ['savas', 'oil'],
 ['amerwell'],
 ['sinocom', 'ideas', 'lizitkd'],
 ['rivaldi'],
 ['ventura', 'propertieb'],
 ['natron'],
 ['virtus', 'trustess', 'hayward', 'family', '1993', 'settlement'],
 ['primeline'],
 ['canal', 'tryst', 'companywlimitld'],
 ['invest', 'portfolio'],
 ['mrpetergschopt'],
 ['edmundo', 'david', 'tefel', 'pasos'],
 ['crinam'],
 ['dazzle'],
 ['furnidec'],
 ['dblmaixe', 'limided'],
 ['ellenton', 'atd'],
 ['utrition'],
 ['hearnville'],
 ['nilo', 'consultores', 'sas'],
 ['piat'],
 ['fund'],
 ['montis'],
 ['geron', 'pozov'],
 ['gloria'],
 ['francisco', 'ortiz', 'von', 'bismarck'],
 ['newmarket', 'traders'],
 ['tris

In [74]:
[ ' '.join(x) for x in cleaned_col]

['mo yuling',
 'hermes fz lle monaco',
 'deerfield',
 'sidney frankel',
 'freshlolkarmrltd',
 'pricewaterhouse coopers malta',
 'solby',
 'chang min seok',
 'tsax wuli',
 'peter colaco',
 'leisure',
 'dencourt',
 'fotlock',
 'jiang qiang',
 'whemoton scyprusglimited',
 'east dawn sen',
 'savas oil',
 'amerwell',
 'sinocom ideas lizitkd',
 'rivaldi',
 'ventura propertieb',
 'natron',
 'virtus trustess hayward family 1993 settlement',
 'primeline',
 'canal tryst companywlimitld',
 'invest portfolio',
 'mrpetergschopt',
 'edmundo david tefel pasos',
 'crinam',
 'dazzle',
 'furnidec',
 'dblmaixe limided',
 'ellenton atd',
 'utrition',
 'hearnville',
 'nilo consultores sas',
 'piat',
 'fund',
 'montis',
 'geron pozov',
 'gloria',
 'francisco ortiz von bismarck',
 'newmarket traders',
 'tristan energy',
 'chu ah kiew',
 'ikroy',
 'tomsun',
 'palmdella',
 'paula andrea osorio muoz',
 'denvale incorporated',
 'sigesky sav',
 'peace afd serenity limsved',
 'christopher anthony castelloe kathlee

In [64]:
def ngrams(string, n=2):
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower() #make lower case
    string = string.translate(str.maketrans('', '', string_lib.punctuation)) # remove punctuation
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string) #remove the list of chars defined above
    string = string.replace('&', ' ')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single space
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [65]:
ngrams('NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.')

[' N',
 'Ni',
 'in',
 'ng',
 'gb',
 'bo',
 'o ',
 ' S',
 'Su',
 'un',
 'nr',
 'ri',
 'is',
 'se',
 'e ',
 ' E',
 'En',
 'nt',
 'te',
 'er',
 'rp',
 'pr',
 'ri',
 'is',
 'se',
 'es',
 's ',
 ' U',
 'Un',
 'ni',
 'it',
 'te',
 'ed',
 'd ',
 ' C',
 'Co',
 'o ',
 ' L',
 'Lt',
 'td',
 'd ']

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
org_names = list(df_train['name'])

In [69]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(org_names)

In [72]:
tf_idf_matrix.shape[1]

2045

In [48]:
cos_sim = sim.cosine(tf_idf_matrix, tf_idf_matrix.T, k = 300)




  0%|          | 0/333070 [00:00<?, ?it/s][A[A[A


Preprocessing:   0%|          | 0/333070 [00:00<?, ?it/s][A[A[A


Allocate memory per threads:   0%|          | 0/333070 [00:00<?, ?it/s][A[A[A


Computing:   0%|          | 666/333070 [00:03<28:03, 197.40it/s]       [A[A[A


Computing:   0%|          | 1332/333070 [00:06<25:01, 220.97it/s][A[A[A


Computing:   1%|          | 1998/333070 [00:09<25:32, 216.01it/s][A[A[A


Computing:   1%|          | 2664/333070 [00:12<25:30, 215.92it/s][A[A[A


Computing:   1%|          | 3330/333070 [00:14<24:43, 222.21it/s][A[A[A


Computing:   1%|          | 3996/333070 [00:17<24:03, 228.02it/s][A[A[A


Computing:   1%|▏         | 4662/333070 [00:20<23:35, 231.93it/s][A[A[A


Computing:   2%|▏         | 5328/333070 [00:23<23:44, 230.03it/s][A[A[A


Computing:   2%|▏         | 5994/333070 [00:25<23:09, 235.34it/s][A[A[A


Computing:   2%|▏         | 6660/333070 [00:27<22:38, 240.21it/s][A[A[A


Computing:   2%

Computing:  41%|████▏     | 137862/333070 [10:21<14:39, 221.98it/s][A[A[A


Computing:  42%|████▏     | 138528/333070 [10:23<14:36, 222.05it/s][A[A[A


Computing:  42%|████▏     | 139194/333070 [10:26<14:32, 222.12it/s][A[A[A


Computing:  42%|████▏     | 139860/333070 [10:29<14:29, 222.21it/s][A[A[A


Computing:  42%|████▏     | 140526/333070 [10:32<14:26, 222.32it/s][A[A[A


Computing:  42%|████▏     | 141192/333070 [10:34<14:22, 222.41it/s][A[A[A


Computing:  43%|████▎     | 141858/333070 [10:37<14:19, 222.44it/s][A[A[A


Computing:  43%|████▎     | 142524/333070 [10:40<14:16, 222.50it/s][A[A[A


Computing:  43%|████▎     | 143190/333070 [10:43<14:13, 222.53it/s][A[A[A


Computing:  43%|████▎     | 143856/333070 [10:46<14:09, 222.62it/s][A[A[A


Computing:  43%|████▎     | 144522/333070 [10:48<14:06, 222.75it/s][A[A[A


Computing:  44%|████▎     | 145188/333070 [10:51<14:03, 222.85it/s][A[A[A


Computing:  44%|████▍     | 145854/333070 [10:54<13:

Computing:  83%|████████▎ | 275058/333070 [17:40<03:43, 259.28it/s][A[A[A


Computing:  83%|████████▎ | 275724/333070 [17:42<03:41, 259.45it/s][A[A[A


Computing:  83%|████████▎ | 276390/333070 [17:44<03:38, 259.65it/s][A[A[A


Computing:  83%|████████▎ | 277056/333070 [17:46<03:35, 259.66it/s][A[A[A


Computing:  83%|████████▎ | 277722/333070 [17:49<03:33, 259.75it/s][A[A[A


Computing:  84%|████████▎ | 278388/333070 [17:51<03:30, 259.84it/s][A[A[A


Computing:  84%|████████▍ | 279054/333070 [17:54<03:27, 259.70it/s][A[A[A


Computing:  84%|████████▍ | 279720/333070 [17:57<03:25, 259.66it/s][A[A[A


Computing:  84%|████████▍ | 280386/333070 [17:59<03:22, 259.81it/s][A[A[A


Computing:  84%|████████▍ | 281052/333070 [18:01<03:20, 259.95it/s][A[A[A


Computing:  85%|████████▍ | 281718/333070 [18:02<03:17, 260.23it/s][A[A[A


Computing:  85%|████████▍ | 282384/333070 [18:04<03:14, 260.43it/s][A[A[A


Computing:  85%|████████▍ | 283050/333070 [18:05<03:

In [50]:
save_npz('tfidf_300.npz', cos_sim.tocsr())

In [80]:
tf_idf_matrix[1].data

array([0.08557718, 0.08411946, 0.13826249, 0.13346997, 0.08264572,
       0.08334072, 0.08253552, 0.16952436, 0.21962298, 0.21962298,
       0.23380988, 0.20206775, 0.16698972, 0.14297814, 0.17387934,
       0.23578514, 0.20383918, 0.26192819, 0.2724043 , 0.15087366,
       0.172217  , 0.13084198, 0.1415761 , 0.09790477, 0.09508332,
       0.14103535, 0.14190545, 0.1396905 , 0.14412404, 0.08929529,
       0.23039994, 0.17349882, 0.17788399, 0.18644586, 0.06919488,
       0.06897476, 0.06766189, 0.16925091])

In [53]:
similarity = load_npz('tfidf_300.npz')

In [179]:
similarity[2].data.argsort()[::-1]

array([  0, 128,  78,   3,  75,  26, 157,  39, 162, 293,  95, 257, 122,
        13, 235, 217, 150, 270,   6, 149, 205,  60, 175, 126,  33,  62,
       218,  87, 148, 142,  72, 222, 167, 258, 290, 103, 280, 152, 277,
        17,  73, 132,  76, 189, 279, 203, 232, 255, 151,  31, 154,  22,
        20,  18, 124, 130, 178, 143, 156,  71, 197, 196,  23, 214, 101,
       297,  98, 294, 251,  52, 234, 291, 253, 282, 201, 292,  53,  35,
       238, 260,  70,  96,  79,  21,  36, 191, 171, 125,  16,  44, 284,
        90, 120, 209, 168,  57, 268,  80, 146,  55,  10,  34,  59, 278,
        38, 207, 230,  91, 187, 131,  92,  49, 271, 166, 147, 267, 231,
       285, 248,  84, 298, 137, 226,  54, 192, 121, 105, 289, 210, 190,
       159,  74, 249,  25, 236,  46,  48, 250,   1, 116, 141,  56,   7,
       288, 206, 153,  15, 180, 123,  65, 273, 110, 195, 252, 111, 202,
        47, 185, 265, 179, 239, 161, 119,  27, 237, 211, 262, 274,  89,
        42,  43, 221, 193, 242,  63, 204,  86, 263, 261, 155, 10