In [89]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.phrases import Phrases, Phraser

In [90]:
df = pd.read_csv("train.csv", index_col=0)
df.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0


## Статистика признаков

In [91]:
len(set(df.name_1.to_list()))

17656

In [92]:
len(set(df.name_2.to_list()))

17684

In [93]:
len(set(df.name_1.to_list()) - set(df.name_2.to_list()))

338

## Статистика таргета

In [94]:
df['is_duplicate'].value_counts()

0    494161
1      3658
Name: is_duplicate, dtype: int64

## Препроцессинг

In [95]:
# Приведение к нижнему регистру
def lower_str(data, column):
    return data[column].str.lower()

In [96]:
df['name_1'] = lower_str(df, 'name_1')
df['name_2'] = lower_str(df, 'name_2')

In [97]:
df = df.dropna()

In [98]:
y = df['is_duplicate'].values
X = df.drop('is_duplicate', axis=1)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=y)

In [100]:
name = (X_train['name_1'] + X_train['name_2']).unique()

In [101]:
tfidf_veczr = TfidfVectorizer(ngram_range=(1,3))    
tf_corpus = tfidf_veczr.fit_transform(name)

In [102]:
train_name_1 = tfidf_veczr.transform(X_train['name_1'].apply(lambda x: x).tolist())
train_name_2 = tfidf_veczr.transform(X_train['name_2'].apply(lambda x: x).tolist())
X_train = train_name_1 + train_name_2

In [106]:
test_name_1 = tfidf_veczr.transform(X_test['name_1'].apply(lambda x: x).tolist())
test_name_2 = tfidf_veczr.transform(X_test['name_2'].apply(lambda x: x).tolist())
X_test = test_name_1 + test_name_2

In [103]:
X_train

<398255x952168 sparse matrix of type '<class 'numpy.float64'>'
	with 5710068 stored elements in Compressed Sparse Row format>

# Модель

In [105]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    random_seed=43,
)
model.fit(X_train, y_train)

Learning rate set to 0.132801
0:	learn: 0.4136569	total: 1.16s	remaining: 19m 22s
1:	learn: 0.2616907	total: 2.07s	remaining: 17m 11s
2:	learn: 0.1663241	total: 3.27s	remaining: 18m 6s
3:	learn: 0.1103492	total: 4.09s	remaining: 16m 59s
4:	learn: 0.0800046	total: 4.69s	remaining: 15m 33s
5:	learn: 0.0607727	total: 5.21s	remaining: 14m 23s
6:	learn: 0.0491707	total: 5.72s	remaining: 13m 31s
7:	learn: 0.0419523	total: 6.22s	remaining: 12m 51s
8:	learn: 0.0368256	total: 6.75s	remaining: 12m 23s
9:	learn: 0.0335781	total: 7.29s	remaining: 12m 1s
10:	learn: 0.0308733	total: 7.91s	remaining: 11m 50s
11:	learn: 0.0288918	total: 8.57s	remaining: 11m 45s
12:	learn: 0.0272444	total: 9.1s	remaining: 11m 31s
13:	learn: 0.0260346	total: 9.63s	remaining: 11m 18s
14:	learn: 0.0250761	total: 10.2s	remaining: 11m 7s
15:	learn: 0.0242951	total: 10.8s	remaining: 11m 1s
16:	learn: 0.0236602	total: 11.4s	remaining: 10m 58s
17:	learn: 0.0230125	total: 11.9s	remaining: 10m 50s
18:	learn: 0.0224436	total: 12.

<catboost.core.CatBoostClassifier at 0x20ca157bb20>

In [107]:
y_test_pred = model.predict(X_test)

In [108]:
from sklearn.metrics import f1_score
print('f1 macro -',f1_score(y_test, y_test_pred, average='macro'))
print('f1 micro -',f1_score(y_test, y_test_pred, average='micro'))
print('f1 weighted -',f1_score(y_test, y_test_pred, average='weighted'))

f1 macro - 0.9582616294686667
f1 micro - 0.9988650516250853
f1 weighted - 0.9988232908931395
