In [1]:
import re
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from transliterate import translit
import networkx as nx

In [2]:
df = pd.read_csv("train.csv", index_col=0)
df.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0


## Feature statistics

In [3]:
len(set(df.name_1.to_list()))

17656

In [4]:
len(set(df.name_2.to_list()))

17684

In [5]:
len(set(df.name_1.to_list()) - set(df.name_2.to_list()))

338

## Target statistics

In [6]:
df['is_duplicate'].value_counts()

0    494161
1      3658
Name: is_duplicate, dtype: int64

## Preprocessing

In [7]:
# Приведение к нижнему регистру
def lower_str(data, column):
    return data[column].str.lower()

In [8]:
df['name_1'] = lower_str(df, 'name_1')
df['name_2'] = lower_str(df, 'name_2')

In [9]:
rus_letters = [
    "а",
    "б",
    "в",
    "г",
    "д",
    "е",
    "ё",
    "ж",
    "з",
    "и",
    "й",
    "к",
    "л",
    "м",
    "н",
    "о",
    "п",
    "р",
    "с",
    "т",
    "у",
    "ф",
    "х",
    "ц",
    "ч",
    "ш",
    "щ",
    "ъ",
    "ы",
    "ь",
    "э",
    "ю",
    "я",
]

In [10]:
def translit_to_eng(company_name, language_code="ru"):
        if any([i in rus_letters for i in company_name]):
                return translit(company_name, language_code=language_code, reversed=True)
        else:
                return company_name

In [11]:
df['name_1'] = df['name_1'].apply(lambda x: translit_to_eng(x))
df['name_2'] = df['name_2'].apply(lambda x: translit_to_eng(x))

In [12]:
y = df['is_duplicate'].values
X = df.drop('is_duplicate', axis=1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=y)

In [14]:
name = (X_train['name_1'] + X_train['name_2']).unique()

In [15]:
tfidf_veczr = TfidfVectorizer(ngram_range=(1,3))    
tf_corpus = tfidf_veczr.fit_transform(name)

In [16]:
pickle.dump(tfidf_veczr, open("tfidf.pickle", "wb"))

In [17]:
train_name_1 = tfidf_veczr.transform(X_train['name_1'].apply(lambda x: x).tolist())
train_name_2 = tfidf_veczr.transform(X_train['name_2'].apply(lambda x: x).tolist())
X_train = train_name_1 + train_name_2

In [18]:
test_name_1 = tfidf_veczr.transform(X_test['name_1'].apply(lambda x: x).tolist())
test_name_2 = tfidf_veczr.transform(X_test['name_2'].apply(lambda x: x).tolist())
X_test = test_name_1 + test_name_2

In [19]:
X_train

<398255x951979 sparse matrix of type '<class 'numpy.float64'>'
	with 5710353 stored elements in Compressed Sparse Row format>

## Model

In [20]:
model_logreg = LogisticRegression(random_state=43)
model_logreg.fit(X_train, y_train)

In [21]:
# model.save_model('catboost_model') 

In [22]:
y_test_pred = model_logreg.predict(X_test)

In [23]:
def eval_metric(test, train):
    f1 = f1_score(test, train)
    precision = precision_score(test, train)
    recall = recall_score(test, train)
    print (f"f1: {f1}\nprecision: {precision}\nrecall: {recall}")

In [24]:
eval_metric(y_test, y_test_pred)

f1: 0.7916666666666666
precision: 0.9573643410852714
recall: 0.674863387978142


## NetworkX
Get all the connected components

In [25]:
df_connected = df[df.is_duplicate==1]

In [26]:
df_connected.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
162,jx nippon oil & gas exploration (brasil) ltda,jx nippon oil & gas exploration technical serv...,1
604,pirelli neumaticos s.a.i.c.,"pirelli tyre co., ltd.",1
836,brenntag australia (pty) ltd.,brenntag group,1
1329,"paul bauder gmbh & co kg, bochum plant",paul bauder ag,1
1563,total ceska republika s.r.o.,total france (arnay le duc),1


In [27]:
G = nx.Graph()
G.add_edges_from(df_connected[['name_1','name_2']].to_numpy().tolist())
conn_comp_lst = list(nx.connected_components(G))

In [28]:
with open("conn_comp", "wb") as fp:
    pickle.dump(conn_comp_lst, fp)

## Matching the input company name with the group

In [47]:
def match_company(model, veczr, connected_components_list, input_name):
    predict_lst = []
    for group in tqdm(connected_components_list):
        group_lst = [input_name] * len(list(group))
        input_vec = veczr.transform(group_lst)
        group_vec = veczr.transform(list(group))
        summ = input_vec + group_vec
        predict_lst.append(model.predict_proba(summ)[:,1].max())
    max_idx = predict_lst.index(max(predict_lst))

    return connected_components_list[max_idx]

In [48]:
INPUT_NAME = 'kildair service ulc'
match_company(model_logreg, tfidf_veczr, conn_comp_lst, INPUT_NAME)