In [6]:
! pip install openpyxl gensim==3.8.3 fasttext lightgbm multipledispatch razdel

In [2]:
# Download archive 187.zip from the vectors.nlpl.eu repository into the working directory.
# NOTE(review): re-running this cell downloads the archive again — consider guarding it.
! wget http://vectors.nlpl.eu/repository/20/187.zip

In [3]:
# Extract the downloaded 187.zip archive into the working directory.
! unzip 187.zip

In [4]:
# List the working directory to inspect which files are now available.
! ls

In [7]:
import pandas as pd
import numpy as np
import fasttext
import razdel
import gensim

import fasttext
from gensim.models import FastText

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, average_precision_score
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
import joblib

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [8]:
# Source columns of interest, and the short English names they are mapped to
# (same order: product_name, level, category).
columns = ['Общее наименование продукции', 'Раздел ЕП РФ (Код из ФГИС ФСА для подкатегории продукции)', 'Подкатегория продукции']
column_aliases = dict(zip(columns, ['product_name', 'level', 'category']))

# Read both workbooks, keep only the three columns above and normalise the headers.
big = (
    pd.read_excel('/kaggle/input/hackaton/big.xlsx', sheet_name='все ДС с кодами')[columns]
    .rename(columns=column_aliases)
)
small = (
    pd.read_excel('/kaggle/input/hackaton/small.xlsx', sheet_name='Данные для сопоставления')[columns]
    .rename(columns=column_aliases)
)

# Stack the two datasets row-wise; each frame keeps its own original row index.
all_data = pd.concat([big, small])

In [9]:
# Turn the ';'-separated code and category strings into lists so that each
# row can later be expanded into one row per (code, category) pair.
all_data = all_data.assign(
    level_2=all_data['level'].str.split(';'),
    category=all_data['category'].str.split(';'),
)

In [10]:
# Expand the parallel level_2 / category lists into one row per pair, then:
#   * level_1  - the part of the (still unstripped) code before the first '.', stripped;
#   * level_2  - the full code, stripped;
#   * category - the category name, stripped.
all_data = (
    all_data
    .set_index(['product_name', 'level'])
    .explode(['level_2', 'category'])
    .reset_index()
    .assign(
        level_1=lambda frame: frame['level_2'].apply(lambda code: code.split('.')[0]).str.strip(),
        level_2=lambda frame: frame['level_2'].str.strip(),
        category=lambda frame: frame['category'].str.strip(),
    )
)

In [11]:
# Display the (rows, columns) shape of all_data after the explode step.
all_data.shape

In [12]:
# Lookup table of the distinct (category, level_1, level_2) combinations present in the data.
dictionary = all_data.loc[:, ['category', 'level_1', 'level_2']].drop_duplicates()

In [13]:
# Display the shape of the de-duplicated lookup table.
dictionary.shape

In [14]:
# Write the lookup table to a ';'-separated CSV in the working directory.
# NOTE(review): index=None is falsy, so presumably no index column is written —
# confirm, and consider the explicit index=False.
dictionary.to_csv('dictionary_level_2.csv', sep=';', index=None)

In [15]:
# Display the shape of all_data once more (nothing has modified it since the explode step).
all_data.shape

## Предобработка

In [16]:
import re
# Stop-word list, one entry per line. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# NOTE(review): the file is read with the platform default encoding, as before —
# confirm the file's encoding and pass it explicitly.
with open('/kaggle/input/hackaton/stopwords-ru.txt', 'r') as stopwords_file:
    russian_stopwords = stopwords_file.read().split('\n')

# OKPD reference table: one row per distinct code, restricted to codes longer
# than 10 characters. The second reset_index() materialises the fresh 0..n-1
# row number as an 'index' column.
okpd = pd.read_excel('/kaggle/input/hackaton/okpd.xlsx').drop_duplicates(subset=['okpd'])
okpd = okpd[okpd['okpd'].str.len()>10].reset_index(drop=True).reset_index()

# Level dictionary read from a cp1251-encoded CSV; rows without level_2 are
# dropped and a combined "<level_1>.<level_2>" key is built and de-duplicated.
# This rebinds the name `dictionary`, replacing the table derived from all_data.
dictionary = pd.read_csv('/kaggle/input/hackaton/levels.csv', sep=';', encoding='cp1251').dropna(subset=['level_2'])
dictionary['level'] = dictionary['level_1'].astype(str) + '.' + dictionary['level_2'].astype(int).astype(str)
dictionary = dictionary.drop_duplicates(subset='level')

def delete_stopwords(s):
    """Split ``s`` into words and drop the ones listed in ``russian_stopwords``.

    Parentheses, ``+`` signs and whitespace all act as word separators.
    A word is dropped when its lower-cased form is in the module-level
    ``russian_stopwords`` collection; surviving words keep their original case.

    Returns:
        list[str]: the remaining words, in their original order.
    """
    separated = re.sub(r'[()\s+]', u' ', s)
    kept_words = []
    for word in separated.split():
        if word.lower() in russian_stopwords:
            continue
        kept_words.append(word)
    return kept_words

def delete_punctuation(s):
    """Replace punctuation, digits and whitespace characters in ``s`` with spaces.

    Every matched character is replaced one-for-one by a single space, so the
    result has the same length as the input; letters are left untouched.

    The previous pattern contained an unescaped hyphen between the apostrophe
    and the underscore, which the regex engine reads as a character RANGE.
    That range happens to include the uppercase Latin letters A-Z, so Latin
    capitals (brand names, model codes) were silently erased. The hyphen is now
    escaped. The other symbols that range used to cover (asterisk, equals sign,
    at sign, square brackets, backslash, caret) are listed explicitly so that
    punctuation stripping itself is unchanged.

    Args:
        s (str): text to clean.

    Returns:
        str: ``s`` with each punctuation/digit/whitespace character turned into a space.
    """
    return re.sub(r'[®"\'\-_/.:?!0-9()%<>;,+#$&№*=@\[\]\\^\s]', u' ', s)

def get_okpd(line) -> "str | None":
    """Extract the first OKPD2 code mentioned in ``line``.

    The text is lower-cased and searched for the marker "окпд2", optionally
    followed by spaces, then a code made of four dot-separated digit groups
    of sizes 2, 2, 2 and 3.

    Args:
        line: product description. Non-string values (for example a missing
            cell read as NaN) are treated as "no code found" instead of raising.

    Returns:
        The code as a string (without the marker), or None when no code is present.
    """
    if not isinstance(line, str):
        return None
    # Raw string: the escapes are handled by the regex engine, not by Python's
    # string-literal parser.
    match = re.search(r'окпд2\x20*(\d{2}\.\d{2}\.\d{2}\.\d{3})', line.lower())
    return match.group(1) if match else None

In [17]:
# Pull the OKPD2 code out of every product name and attach the matching row
# of the OKPD reference table (left join on the extracted code).
all_data['okpd'] = all_data['product_name'].apply(get_okpd)
all_data = all_data.join(okpd.set_index('okpd'), on='okpd')

# Cleaned name: punctuation replaced by spaces, stop-words removed, and the
# remaining words re-joined with single spaces.
all_data['clean_product_name'] = (
    all_data['product_name']
    .apply(delete_punctuation)
    .apply(delete_stopwords)
    .str.join(' ')
)

In [18]:
# Display the column names of all_data after the OKPD join and name cleaning.
all_data.columns

## Определение категории статистикой

In [19]:
# Count rows per (level_2, category) pair. The 'level' column only serves as
# the column being counted; the result column is named 'count'.
all_categories = (
    all_data
    .groupby(['level_2', 'category'])['level']
    .count()
    .reset_index(name='count')
)

In [20]:
def get_category_sim(product_name:str, category: str, all_categories):
    """Pick the best-matching row of ``all_categories`` for a product name.

    Each row's ``category`` text is scored against ``product_name`` with
    ``get_similarity``; rows are ranked by score and then by ``count``, both
    descending, and the top row wins.

    Fixes over the previous version:
      * the ``category`` column is passed to ``get_similarity``. Passing the
        whole DataFrame made it iterate over column NAMES, producing a list
        whose length did not match the number of rows;
      * the scores are attached to a copy (``assign``), so the caller's frame
        is no longer modified.

    Args:
        product_name: product text to classify.
        category: expected label; compared against the winning ``level_2`` value.
        all_categories: DataFrame with ``level_2``, ``category`` and ``count`` columns.

    Returns:
        tuple: ``(label, probability, is_match)`` where ``label`` is the
        ``level_2`` value of the top row, ``probability`` its similarity score
        and ``is_match`` the result of ``category == label``.

    Raises:
        IndexError: if ``all_categories`` has no rows.
    """
    scored = all_categories.assign(
        sim=get_similarity(product_name, all_categories['category'])
    )
    ranked = scored.sort_values(by=['sim', 'count'], ascending=False)
    label, probability = ranked[['level_2', 'sim']].values[0]
    return label, probability, category==label

def get_similarity(product_name:str, all_categories):
    """Score ``product_name`` against every entry of ``all_categories``.

    The previous version compared an undefined name (``short_rp_name``) and
    therefore raised NameError on the first iteration; the ``product_name``
    argument is now used.

    Args:
        product_name: text to compare.
        all_categories: iterable of category strings.

    Returns:
        list[float]: one ``fuzz.token_sort_ratio`` score per category, divided
        by 100, in input order.
    """
    # NOTE(review): `fuzz` is not imported anywhere in the visible part of the
    # notebook — presumably it comes from a fuzzy string matching package;
    # confirm which one and add the import to the imports cell.
    return [
        fuzz.token_sort_ratio(product_name, category) / 100
        for category in all_categories
    ]

In [21]:
# all_categories_list = all_categories[all_categories['count']<50]['category'].tolist()
# print(len(all_categories_list))
# all_data['features'] = all_data['clean_product_name'].progress_apply(lambda x: get_similarity(x, all_categories_list ))

## FastText обученный

In [22]:
import razdel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def tokenize_with_razdel(text):
    """Tokenize ``text`` with ``razdel.tokenize`` and return the token strings.

    Returns:
        list[str]: the ``.text`` of every token, in order of appearance.
    """
    words = []
    for piece in razdel.tokenize(text):
        words.append(piece.text)
    return words

In [23]:
# Load pre-trained word vectors from "model.model" in the working directory.
# NOTE(review): presumably this file comes from the 187.zip archive fetched
# earlier in the notebook — confirm. Only load model files from trusted sources.
model = gensim.models.KeyedVectors.load("model.model")
# Alternative kept for reference: load a Facebook-format fastText binary instead.
# model =  gensim.models.FastText.load_fasttext_format('/kaggle/input/fasttext-1/fb_model.bin')

In [24]:
# Tokenize every lower-cased cleaned name.
# Token lists have different lengths, and np.array() on such a ragged list of
# lists raises ValueError on recent NumPy releases (and only warned before).
# Going through a pandas Series yields an explicit 1-D object array whose
# elements are the token lists, regardless of their lengths.
all_data_tokens = (
    all_data['clean_product_name']
    .str.lower()
    .apply(tokenize_with_razdel)
    .to_numpy()
)

In [25]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the pre-tokenized documents: the identity analyzer makes the
# vectorizer take each token list exactly as given.
tfidf_vectorizer = TfidfVectorizer(analyzer=lambda doc_tokens: doc_tokens)
tfidf = tfidf_vectorizer.fit(all_data_tokens)

# Map every vocabulary word to its IDF weight (vocabulary_ gives the position
# of each word inside idf_).
word2weight = dict(zip(tfidf.vocabulary_, tfidf.idf_[list(tfidf.vocabulary_.values())]))

In [26]:
# A word unknown to the fitted TF-IDF model is treated as rare, so it is given
# the largest IDF found in the fitted vocabulary.
max_idf = tfidf.idf_.max()

In [27]:
# Display the fallback IDF weight used for words outside the TF-IDF vocabulary.
max_idf

In [28]:
def get_text_embedding_with_tf_idf(model, phrase, max_tokens=15):
    """Build a sentence embedding as the mean of IDF-weighted word vectors.

    The phrase is tokenized with ``tokenize_with_razdel``, truncated to the
    first ``max_tokens`` tokens and lower-cased. Each token's vector
    (``model[token]``) is multiplied by its weight from the module-level
    ``word2weight`` mapping, or by ``max_idf`` when the token is not in that
    mapping, and the weighted vectors are averaged.

    Changes over the previous version:
      * the fallback is a 1-D zero vector of length ``model.vector_size``, the
        same shape as a regular result (it used to be ``(1, vector_size)``);
      * a phrase without tokens returns that zero vector directly instead of
        going through the mean of an empty array (NaN plus a RuntimeWarning);
      * the token limit is a parameter whose default keeps the old value.

    Args:
        model: object supporting ``model[token]`` lookup and exposing ``vector_size``.
        phrase (str): text to embed.
        max_tokens (int): number of leading tokens taken into account.

    Returns:
        numpy.ndarray: 1-D embedding of length ``model.vector_size``.
    """
    tokens = [token.lower() for token in tokenize_with_razdel(phrase)[:max_tokens]]
    if not tokens:
        return np.zeros(model.vector_size)

    weighted_vectors = np.array(
        [model[token] * word2weight.get(token, max_idf) for token in tokens]
    )
    embedding_mean = weighted_vectors.mean(axis=0)

    # Guard against NaNs coming from the word vectors themselves.
    if np.any(np.isnan(embedding_mean)):
        return np.zeros(model.vector_size)
    return embedding_mean

In [29]:
# Label encoding: integer id -> level_2 code, and the inverse mapping.
# The distinct codes are sorted before numbering: iteration order of a set of
# strings can differ between interpreter runs, which would give every kernel
# restart a different id for the same class and make saved models or reports
# incomparable across runs.
target_labels = dict(enumerate(sorted(set(all_data['level_2'].tolist()))))
labels_target = {label: idx for idx, label in target_labels.items()}

In [30]:
import numpy as np

# Encode each row's level_2 code as its integer class id (KeyError on an unknown code).
y = all_data['level_2'].map(labels_target.__getitem__).to_numpy()

In [31]:
# Feature matrix: one TF-IDF-weighted sentence embedding per row of all_data.
X = np.zeros((len(all_data), model.vector_size))
for row, clean_name in enumerate(all_data["clean_product_name"].tolist()):
    X[row, :] = get_text_embedding_with_tf_idf(model, clean_name)

print(X.shape)
print(y)

In [32]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Earlier experiment, kept for reference:
# clf = SVC(gamma ='auto', )
# Linear SVM classifier with a fixed seed.
clf = LinearSVC(random_state=42)

# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed as the cell output.
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out split and print the per-class metrics report.
predictions = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

## Предсказание модели  

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from lightgbm import LGBMClassifier
import joblib