In [None]:
import os
import random
import re
import numpy as np
import pandas as pd
import csv
from google.colab import drive
from nltk import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.utils.fixes import sklearn
from scipy.linalg.decomp import inf

In [None]:
# Single seed shared by every stochastic step of the notebook.
SEED = 1988

random.seed(SEED)      # Python's built-in generator
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes launched afterwards, not the hash seed of this kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)
# scikit-learn has no global generator of its own: estimators created with
# random_state=None draw from NumPy's global state, so seeding NumPy covers them.
# The former `sklearn.random.seed(SEED)` went through a non-public attribute of
# the sklearn package (at best the stdlib `random` module seeded above) and was
# dropped.
np.random.seed(SEED)
SEED

In [None]:
# Mount Google Drive inside the Colab VM; the CSV paths read below live under /content/gdrive.
drive.mount('/content/gdrive')

In [None]:
# Raw training pairs; later cells read the name_1, name_2 and is_duplicate columns.
train = pd.read_csv("/content/gdrive/MyDrive/Сompanies/train.csv")      # read train.csv

In [None]:
# locationtagger is used by cleanText to recognise country and city names.
!pip install locationtagger

In [None]:
import nltk

# NLTK resources: 'punkt' backs word_tokenize; the remaining ones are presumably
# required by locationtagger's tagging/NER pipeline (confirm against its docs).
# The former nltk.download('corpus') call was removed: 'corpus' is not a package
# id in the NLTK index, so it only produced a "not found" error message.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [None]:
import locationtagger

def cleanText(value):
  """Normalise a company name so that two spellings can be compared.

  Steps, in order: remove "(...)" groups, remove digits, lower-case, strip a
  fixed list of symbols and filler substrings, then drop every token that is a
  single character, contains a dot, or is reported by locationtagger as a
  country or city. A name consisting of exactly one token is kept untouched.

  Args:
    value: raw company name (str).

  Returns:
    The surviving tokens joined by single spaces; may be an empty string.
  """
  value = re.sub(r'\([^()]*\)', '', value)      # remove parenthesised groups (innermost only when nested)
  value = re.sub(r"\d+", "", value, flags=re.UNICODE)      # remove digits

  value = value.lower()      # lower-case

  # Plain substring removal: these pieces also vanish from inside longer words.
  for ch in ['&', 'corporation', 'group', '*', ',', 'ооо', '"', '/', "'"]:
    value = value.replace(ch, '')

  array = value.split()

  # A one-token name is returned as is, whatever the token looks like.
  if len(array) == 1:
    return array[0]

  result = []

  for word in array:
    # Cheap rejections first, so the location lookup (one call per token,
    # presumably the slow part) only runs for tokens that could still be kept.
    if len(word) <= 1 or '.' in word:
      continue

    entities = locationtagger.find_locations(text = word)      # look for country and city names

    if (len(entities.countries) == 0) and (len(entities.cities) == 0):
      result.append(word)

  return ' '.join(result)

In [None]:
# Clean every company name and write a CSV pairing each original name with its
# transformed form.
# A ready-made file with 39998 records is available at https://drive.google.com/drive/folders/18I9-B__uaw9SzEuU9x_1pLC74MFWYYjy?usp=sharing
# NOTE(review): the following cells read transformed_train.csv from Google Drive,
# not from the /content path written here — copy the file or align the paths.
length = len(train)

rows = []
header = ['index', 'original', 'transformed']

# Visit every row. The previous range(len(train[1:length])) ended one row early,
# so the last pair of the frame was never written.
for i, (name1, name2) in enumerate(zip(train['name_1'], train['name_2'])):
    # Both names of a pair are stored under the pair's positional index.
    rows.append([i, name1, cleanText(name1)])
    rows.append([i, name2, cleanText(name2)])

with open('/content/transformed_train.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

In [None]:
# Load the original/transformed name pairs from Google Drive and display the frame.
transformed_train = pd.read_csv("/content/gdrive/MyDrive/Сompanies/transformed_train.csv")
transformed_train

In [None]:
# fastwer provides the character error rate (CER) used below; pybind11 is installed
# first, presumably as a build requirement of fastwer (confirm).
!pip install pybind11
!pip install fastwer

Levenshtein Distance - CER 

In [None]:
import fastwer

# Baseline: a pair counts as duplicate unless the character error rate (CER)
# between the two cleaned names exceeds 10; accuracy is the share of rows where
# that verdict matches is_duplicate.
sumTrue = 0
length = len(train)

for i in range(length):
    row = train.iloc[i]

    name1 = cleanText(row.name_1)
    name2 = cleanText(row.name_2)

    cer = fastwer.score_sent(name1, name2, char_level=True)

    # An infinite score is mapped to 0, i.e. the pair is treated as identical
    # (presumably produced when a cleaned name is empty — confirm).
    # np.isinf replaces the comparison with `inf` taken from the deprecated
    # private scipy.linalg.decomp namespace.
    if np.isinf(cer):
      cer = 0

    isDuplicate = True

    if(cer > 10):
      isDuplicate = False

    if(int(isDuplicate) == row.is_duplicate):
      sumTrue += 1

print("Accuracy is: ", sumTrue/length)

TF-IDF & KMeans & Levenshtein Distance - CER

In [None]:
# Cast the cleaned names to str before vectorising
# (presumably to neutralise NaN from empty CSV cells — confirm).
transformed_train['transformed'] = transformed_train['transformed'].astype(str)

In [None]:
# Fit TF-IDF on the cleaned names with English stop words removed;
# X is the resulting matrix, one row per entry of transformed_train.
vectorizer = TfidfVectorizer(stop_words = 'english') 
X = vectorizer.fit_transform(transformed_train.transformed.to_list()) 

In [None]:
# Partition the TF-IDF vectors into true_k clusters.
true_k = 20
# random_state pins the single initialisation (n_init=1). Without it the result
# depended on how much of NumPy's global random stream earlier cells had used,
# i.e. on cell execution order.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=SEED)
model.fit(X)

In [None]:
# Name to look up; the loop below prints the original names judged similar to it.
search = cleanText("JX Nippon Oil & Gas Exploration (Brasil) Ltda")
Y = vectorizer.transform([search])
orgPrediction = model.predict(Y)
length = len(transformed_train)

# Assign a cluster to every stored name in one batch instead of vectorising and
# predicting each row separately inside the loop.
allTexts = transformed_train['transformed'].to_list()
allPredictions = model.predict(vectorizer.transform(allTexts))

for original, text, predictionText in zip(transformed_train['original'], allTexts, allPredictions):
    if predictionText == orgPrediction[0]:       # same cluster: apply the CER threshold of 50
        cer = fastwer.score_sent(search, text, char_level=True)

        if cer < 50:
            print(original)

Word2Vec & KMeans

In [None]:
# Columns of the transformed CSV that hold text.
text_columns = ["original", "transformed"]

In [None]:
# Re-read the transformed names and work on a copy so dfRaw stays unmodified.
dfRaw = pd.read_csv("/content/gdrive/MyDrive/Сompanies/transformed_train.csv")
df = dfRaw.copy()

In [None]:
# Cast every text column to str
# (presumably to neutralise NaN from empty CSV cells — confirm).
for col in text_columns:
    df[col] = df[col].astype(str)

In [None]:
# Split each cleaned name into word tokens; the tokenizer is passed directly as the mapper.
df["tokens"] = df["transformed"].map(word_tokenize)

In [None]:
# Keep one row per distinct token list: idx holds the position of the first
# occurrence of each distinct value, in the sorted order np.unique returns.
_, idx = np.unique(df["tokens"], return_index=True)       # remove duplicate entries
df = df.iloc[idx, :]

# Parallel arrays in the de-duplicated order: original names and their token lists.
docs = df["original"].values
tokenized_docs = df["tokens"].values

In [None]:
# Train Word2Vec on the token lists with a fixed seed and a single worker.
# NOTE: this rebinds `model`, which held the KMeans estimator in earlier cells.
model = Word2Vec(sentences=tokenized_docs, workers=1, seed=SEED)

In [None]:
def vectorize(listOfDocs, model):
    """Turn each tokenised document into the mean of its known word vectors.

    Args:
        listOfDocs: iterable of documents, each an iterable of tokens.
        model: object exposing `wv` (supports `token in wv` and `wv[token]`)
            and `vector_size`.

    Returns:
        A list with one vector per document. Tokens missing from `model.wv`
        are ignored; a document without any known token yields a zero vector
        of length `model.vector_size`.
    """
    word_vectors = model.wv
    features = []

    for tokens in listOfDocs:
        # The membership test already guarantees the lookup succeeds, so the
        # former try/except KeyError around it was dead code.
        known = [word_vectors[token] for token in tokens if token in word_vectors]

        if known:
            features.append(np.asarray(known).mean(axis=0))
        else:
            features.append(np.zeros(model.vector_size))

    return features
    
vectorized_docs = vectorize(tokenized_docs, model=model)      # mean of the word vectors of each document

In [None]:
def mbkmeans_clusters(X, k, batch_size, print_silhouette_values, random_state=None):
    """Cluster X with MiniBatchKMeans and optionally report per-cluster silhouettes.

    Args:
        X: feature vectors, one per sample (anything MiniBatchKMeans.fit accepts).
        k: number of clusters.
        batch_size: mini-batch size passed to MiniBatchKMeans.
        print_silhouette_values: when true, print the size and the mean, min
            and max silhouette value of every non-empty cluster, best mean first.
        random_state: optional seed forwarded to MiniBatchKMeans; the default
            None keeps the estimator's default behaviour.

    Returns:
        Tuple of (fitted MiniBatchKMeans, cluster labels of X).
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=random_state).fit(X)

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)

        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            # A cluster without members has no statistics; min()/max() would raise.
            if cluster_silhouette_values.shape[0] == 0:
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(silhouette_values, key=lambda tup: tup[2], reverse=True)

        # The statistics used to be computed and then thrown away; report them,
        # as the flag's name promises.
        for cluster, size, mean, lowest, highest in silhouette_values:
            print(
                "Cluster {}: Size: {} | Avg: {:.2f} | Min: {:.2f} | Max: {:.2f}".format(
                    cluster, size, mean, lowest, highest
                )
            )

    return km, km.labels_

In [None]:
# Cluster the averaged document vectors into 50 groups using mini-batches of 500.
clustering, cluster_labels = mbkmeans_clusters(X=vectorized_docs, k=50, batch_size=500, print_silhouette_values=True)

In [None]:
# One row per de-duplicated name: original text, space-joined tokens, assigned cluster label.
df_clusters = pd.DataFrame({"text": docs, "tokens": [" ".join(text) for text in tokenized_docs], "cluster": cluster_labels})

In [None]:
# Prepare the query the same way as the training names: clean it, then tokenise.
# vectorize() expects a list of token lists; passing the raw string made it
# iterate over single characters instead of words.
query_tokens = word_tokenize(cleanText("JX Nippon Oil & Gas Exploration (Brasil) Ltda"))
X = vectorize([query_tokens], model=model)
test_cluster = clustering.predict(X)

In [None]:
# Rank all documents by Euclidean distance to the centroid of the query's
# cluster and print the ten closest original names.
query_centroid = clustering.cluster_centers_[test_cluster]
distance_to_centroid = np.linalg.norm(vectorized_docs - query_centroid, axis=1)
most_representative_docs = np.argsort(distance_to_centroid)

for doc_index in most_representative_docs[:10]:
    print(docs[doc_index], "-------------", sep="\n")