In [43]:
import re
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from catboost import CatBoostClassifier
from transliterate import translit
import networkx as nx
# from sklearn.metrics.pairwise import cosine_similarity
# from gensim.models.phrases import Phrases, Phraser

In [2]:
df = pd.read_csv("train.csv", index_col=0)
df.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0


## Feature statistics

In [3]:
len(set(df.name_1.to_list()))

17656

In [4]:
len(set(df.name_2.to_list()))

17684

In [5]:
len(set(df.name_1.to_list()) - set(df.name_2.to_list()))

338

## Target statistics

In [6]:
df['is_duplicate'].value_counts()

0    494161
1      3658
Name: is_duplicate, dtype: int64

## Preprocessing

In [7]:
# Приведение к нижнему регистру
def lower_str(data, column):
    return data[column].str.lower()

In [8]:
df['name_1'] = lower_str(df, 'name_1')
df['name_2'] = lower_str(df, 'name_2')

In [60]:
rus_letters = [
    "а",
    "б",
    "в",
    "г",
    "д",
    "е",
    "ё",
    "ж",
    "з",
    "и",
    "й",
    "к",
    "л",
    "м",
    "н",
    "о",
    "п",
    "р",
    "с",
    "т",
    "у",
    "ф",
    "х",
    "ц",
    "ч",
    "ш",
    "щ",
    "ъ",
    "ы",
    "ь",
    "э",
    "ю",
    "я",
]

In [63]:
def translit_to_eng(company_name, language_code="ru"):
        if any([i in rus_letters for i in company_name]):
                return translit(company_name, language_code=language_code, reversed=True)
        else:
                return company_name

In [65]:
df['name_1'] = df['name_1'].apply(lambda x: translit_to_eng(x))
df['name_2'] = df['name_2'].apply(lambda x: translit_to_eng(x))

In [66]:
y = df['is_duplicate'].values
X = df.drop('is_duplicate', axis=1)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=y)

In [68]:
name = (X_train['name_1'] + X_train['name_2']).unique()

In [69]:
tfidf_veczr = TfidfVectorizer(ngram_range=(1,3))    
tf_corpus = tfidf_veczr.fit_transform(name)

In [70]:
pickle.dump(tfidf_veczr, open("tfidf.pickle", "wb"))

In [71]:
train_name_1 = tfidf_veczr.transform(X_train['name_1'].apply(lambda x: x).tolist())
train_name_2 = tfidf_veczr.transform(X_train['name_2'].apply(lambda x: x).tolist())
X_train = train_name_1 + train_name_2

In [72]:
test_name_1 = tfidf_veczr.transform(X_test['name_1'].apply(lambda x: x).tolist())
test_name_2 = tfidf_veczr.transform(X_test['name_2'].apply(lambda x: x).tolist())
X_test = test_name_1 + test_name_2

In [73]:
X_train

<398255x951979 sparse matrix of type '<class 'numpy.float64'>'
	with 5710353 stored elements in Compressed Sparse Row format>

## Model

In [74]:
model = CatBoostClassifier(
    random_seed=43,
)
model.fit(X_train, y_train)

Learning rate set to 0.132801
0:	learn: 0.4129293	total: 536ms	remaining: 8m 55s
1:	learn: 0.2583494	total: 1.15s	remaining: 9m 34s
2:	learn: 0.1628200	total: 1.65s	remaining: 9m 8s
3:	learn: 0.1091222	total: 2.23s	remaining: 9m 14s
4:	learn: 0.0783559	total: 2.72s	remaining: 9m
5:	learn: 0.0603225	total: 3.15s	remaining: 8m 41s
6:	learn: 0.0484237	total: 3.56s	remaining: 8m 25s
7:	learn: 0.0416666	total: 3.96s	remaining: 8m 10s
8:	learn: 0.0365629	total: 4.35s	remaining: 7m 59s
9:	learn: 0.0331784	total: 4.75s	remaining: 7m 50s
10:	learn: 0.0304922	total: 5.15s	remaining: 7m 43s
11:	learn: 0.0284843	total: 5.54s	remaining: 7m 36s
12:	learn: 0.0269928	total: 5.93s	remaining: 7m 30s
13:	learn: 0.0259225	total: 6.32s	remaining: 7m 25s
14:	learn: 0.0248885	total: 6.72s	remaining: 7m 21s
15:	learn: 0.0240866	total: 7.11s	remaining: 7m 17s
16:	learn: 0.0233534	total: 7.5s	remaining: 7m 13s
17:	learn: 0.0227590	total: 7.88s	remaining: 7m 10s
18:	learn: 0.0222268	total: 8.27s	remaining: 7m 7s

<catboost.core.CatBoostClassifier at 0x151d820e640>

In [75]:
model.save_model('catboost_model') 

In [76]:
y_test_pred = model.predict(X_test)

In [77]:
def eval_metric(test, train):
    f1 = f1_score(test, train)
    precision = precision_score(test, train)
    recall = recall_score(test, train)
    print (f"f1: {f1}\nprecision: {precision}\nrecall: {recall}")

In [78]:
eval_metric(y_test, y_test_pred)

f1: 0.9152046783625731
precision: 0.9842767295597484
recall: 0.855191256830601


## NetworkX
Get all the connected components

In [79]:
df_connected = df[df.is_duplicate==1]

In [80]:
df_connected.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
162,jx nippon oil & gas exploration (brasil) ltda,jx nippon oil & gas exploration technical serv...,1
604,pirelli neumaticos s.a.i.c.,"pirelli tyre co., ltd.",1
836,brenntag australia (pty) ltd.,brenntag group,1
1329,"paul bauder gmbh & co kg, bochum plant",paul bauder ag,1
1563,total ceska republika s.r.o.,total france (arnay le duc),1


In [81]:
G = nx.Graph()
G.add_edges_from(df_connected[['name_1','name_2']].to_numpy().tolist())
conn_comp_lst = list(nx.connected_components(G))

In [82]:
with open("conn_comp", "wb") as fp:
    pickle.dump(conn_comp_lst, fp)

## Matching the input company name with the group

In [83]:
def match_company(model, veczr, connected_components_list, input_name):
    predict_lst = []
    for group in tqdm(connected_components_list):
        group_lst = [input_name] * len(list(group))
        input_vec = veczr.transform(group_lst)
        group_vec = veczr.transform(list(group))
        summ = input_vec + group_vec
        predict_lst.append(model.predict_proba(summ)[:,1].mean())
    max_idx = predict_lst.index(max(predict_lst))

    return connected_components_list[max_idx]

In [84]:
input_name = 'jx nippon oil & gas exploration'
match_company(model, tfidf_veczr, conn_comp_lst, input_name)

100%|██████████| 441/441 [13:11<00:00,  1.79s/it]


{'ooo"guchchi rus"'}

In [86]:
conn_comp_lst

[{'china southern petroleum exploration and development corporation',
  'jx nippon oil & gas exploration (brasil) ltda',
  'jx nippon oil & gas exploration (malaysia) limited',
  'jx nippon oil & gas exploration (myanmar) limited',
  'jx nippon oil & gas exploration (qatar) limited',
  'jx nippon oil & gas exploration (sarawak), ltd',
  'jx nippon oil & gas exploration australia pty ltd.',
  'jx nippon oil & gas exploration corporation',
  'jx nippon oil & gas exploration technical services corporation',
  'nippon oil exploration (sarawak) limited',
  'nippon oil exploration limited',
  'nippon oil exploration u.s.a. limited'},
 {'pirelli de venezuela c.a.',
  'pirelli neumaticos argentina sa',
  'pirelli neumaticos s.a. de c.v.',
  'pirelli neumaticos s.a. de cv',
  'pirelli neumaticos s.a.i.c.',
  'pirelli neumaticos sa',
  'pirelli neumaticos sa de',
  'pirelli neumaticos sa de cv boulev',
  'pirelli neumaticos sa de cv.',
  'pirelli pneus ltda',
  'pirelli tire llc',
  'pirelli tyr