In [35]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline
from nltk import word_tokenize
import warnings;
warnings.filterwarnings('ignore')
from collections import Counter 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem.snowball import SnowballStemmer 

import re
import tqdm
from sklearn.feature_extraction.text import CountVectorizer

df_raw = pd.read_csv(r'C:\Users\zhiti\Documents\GitHub\Frontier-Control\backend\filter\data\model_dataframe.csv',sep=';',encoding='ANSI')
df_raw.drop_duplicates(inplace=True)


STOPWORDS = set(stopwords.words('russian'))
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s;")  # matches `'s` from text  
PATTERN_RN = re.compile("\\r\\n") #matches `\r` and `\n`
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches all non 0-9 A-z whitespace 

def clean_text(text):
    """
    Series of cleaning. String to lower case, remove non words characters and numbers.
        text (str): input text
    return (str): modified initial text
    """
    text = str(text).lower()  # lowercase text
    text = re.sub(PATTERN_S, ' ', text)
    text = re.sub(PATTERN_RN, ' ', text)
    text = re.sub(PATTERN_PUNC, ' ', text)
    return text

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Lemmatize, tokenize, crop and remove stop words.
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    return tokens    


def clean_sentences(df):
    """
    Remove irrelavant characters (in new column clean_sentence).
    Lemmatize, tokenize words into list of words (in new column tok_lem_sentence).
    """
    print('Cleaning sentences...')
    df['clean_sentence'] = df['OPISANIE_SPR'].apply(clean_text)
    return df
  
def stemming(df):
    russian_stopwords = stopwords.words("russian")
    russian_stopwords.extend(['…', '«', '»', '...', 'т.д.', 'т', 'д'])

    stemmer = SnowballStemmer("russian") 
    stemmed_texts_list = []
    for text in df['clean_sentence']:
        tokens = word_tokenize(text)    
        stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in russian_stopwords]
        text = " ".join(stemmed_tokens)
        stemmed_texts_list.append(text)
        
    return stemmed_texts_list
    
df = clean_sentences(df_raw)
df.dropna(inplace=True)

def extract_best_indices(m, topk, mask=None):
    """
    Use sum of the cosine distance over all tokens.
    m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to lowest in order)
    """
    # return the sum on all tokens of cosinus for each sentence
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0) 
    else: 
        cos_sim = m
    index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask) #eliminate 0 cosine distance
    best_index = index[mask][:topk]  
    return best_index

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Adapt stop words
token_stop = tokenizer(' '.join(STOPWORDS), lemmatize=False)

df['stemmed_clean'] = stemming(df)

# Fit TFIDF
vectorizer = CountVectorizer(stop_words=token_stop, tokenizer=tokenizer) 
tfidf_mat = vectorizer.fit_transform(df['stemmed_clean'].values) # -> (num_sentences, num_vocabulary)
tfidf_mat.shape

def get_recommendations_tfidf(sentence, tfidf_mat):
    
    """
    Return the database sentences in order of highest cosine similarity relatively to each 
    token of the target sentence. 
    """
    russian_stopwords = stopwords.words("russian")
    russian_stopwords.extend(['…', '«', '»', '...', 'т.д.', 'т', 'д'])
    stemmer = SnowballStemmer("russian")
    # Embed the query sentence
    sentence = sentence.upper()
    tokens = [str(tok) for tok in tokenizer(sentence)]
    stems = [stemmer.stem(token) for token in tokens if token not in russian_stopwords]
    
    
    vec = vectorizer.transform(stems)
    # Create list with similarity between query and dataset
    mat = cosine_similarity(vec, tfidf_mat)
    # Best cosine distance for each token independantly
    print(mat.shape)
    best_index = extract_best_indices(mat, topk=35)
    return best_index

query_sentence = 'Молоко'

best_index = get_recommendations_tfidf(query_sentence, tfidf_mat)

display(df[['KOD_TNVED_SPR', 'OPISANIE_SPR']].iloc[best_index])


Cleaning sentences...
(1, 12323)


Unnamed: 0,KOD_TNVED_SPR,OPISANIE_SPR
9427,8434200000,ОБОРУДОВАНИЕ ДЛЯ ОБРАБОТКИ И ПЕРЕРАБОТКИ МОЛОКА
1181,406903201,"СЫРЫ ПРОЧИЕ, ФЕТА ИЗ ОВЕЧЬЕГО МОЛОКА ИЛИ МОЛОК..."
624,302910000,"ПЕЧЕНЬ, ИКРА И МОЛОКИ, СВЕЖИЕ ИЛИ ОХЛАЖДЕННЫЕ"
740,303919000,"ПЕЧЕНЬ, ИКРА И МОЛОКИ, МОРОЖЕНЫЕ, ПРОЧИЕ"
1186,406905000,СЫРЫ ПРОЧИЕ: СЫРЫ ИЗ ОВЕЧЬЕГО МОЛОКА ИЛИ МОЛОК...
1045,401201101,МОЛОКО НЕСГУЩЕННОЕИ БЕЗ ДОБАВЛ. САХАРА ИЛИ Д...
639,303190000,"ПРОЧИЕ ЛОСОСЕВЫЕ, МОРОЖЕНЫЕ, ЗА ИСКЛЮЧЕНИЕМ ПЕ..."
852,305200000,"ПЕЧЕНЬ, ИКРА И МОЛОКИ РЫБ, СУШЕНЫЕ, КОПЧЕНЫЕ, ..."
540,302190000,"ПРОЧИЕ ЛОСОСОСЕВЫЕ, ЗА ИСКЛЮЧЕНИЕМ ПЕЧЕНИ, ИКР..."
739,303911000,ИКРА И МОЛОКИ ДЛЯ ПРОИЗВОДСТВА ДЕЗОКСИРИБОНУКЛ...


In [29]:
query_sentence = 'КЕФИР'
best_index = get_recommendations_tfidf(query_sentence, tfidf_mat)
display(df[['KOD_TNVED_SPR', 'OPISANIE_SPR']].iloc[best_index])

(1, 12323)


Unnamed: 0,KOD_TNVED_SPR,OPISANIE_SPR
1102,403905102,ПР.МОЛОЧНЫЕ ПРОД.БЕЗ ВКУСО-АРОМАТИЧ. ДОБАВОК И...
1105,403905302,ПР. МОЛ. ПРОД. БЕЗ ВКУСО-АРОМАТИЧ. ДОБАВОК И Б...
1101,403905101,ПР.МОЛ. ПРОД. БЕЗ ВКУСО-АРОМ. ДОБАВОК И БЕЗ ДО...
1104,403905301,ПР.МОЛ.ПРОД.БЕЗ ВКУСО-АРОМАТ. ДОБАВОК И БЕЗ ДО...
12322,9999999999,FIFA2018
4106,2932209000,ЛАКТОНЫ: ПРОЧИЕ
4114,2933119000,ПРОЧИЕ ПРОИЗВОДНЫЕ ФЕНАЗОНА (АНТИПИРИНА)
4113,2933111000,ПРОПИФЕНАЗОН (INN)
4112,2932990000,"СОЕДИНЕНИЯ ГЕТЕРОЦИКЛИЧЕСКИЕ, СОДЕРЖАЩИЕ ЛИШЬ ..."
4111,2932950000,ТЕТРАГИДРОКАННАБИНОЛЫ (ВСЕ ИЗОМЕРЫ)
