In [1]:
!pip install pymystem3

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from pymystem3 import Mystem
import pickle
from string import punctuation

import os
# Print the path of every file under the Kaggle input directory, then under
# the whole /kaggle tree (which walks the input directory a second time).
for root in ('/kaggle/input', '/kaggle'):
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            print(os.path.join(dirname, filename))

Collecting pymystem3
  Downloading pymystem3-0.2.0-py3-none-any.whl (10 kB)
Installing collected packages: pymystem3
Successfully installed pymystem3-0.2.0
/kaggle/input/products2/products2.csv
/kaggle/input/products3/products2.csv
/kaggle/input/products/products.csv
/kaggle/input/new123/products3.csv
/kaggle/lib/kaggle/gcp.py
/kaggle/input/products2/products2.csv
/kaggle/input/products3/products2.csv
/kaggle/input/products/products.csv
/kaggle/input/new123/products3.csv
/kaggle/working/__notebook_source__.ipynb


In [36]:
# Read the pipe-separated products file, skipping its first line (presumably
# the header) and supplying our own column names, then keep only the rows
# that have a value in the Text column.
recipes = pd.read_csv(
    '/kaggle/input/products3/products2.csv',
    sep='|',
    skiprows=1,
    names=['Text', 't1', 't2', 't3'],
).dropna(subset=["Text"])
#recipes.head()

In [None]:
# Bar chart of how many rows carry each t1 label.
recipes["t1"].value_counts().plot.bar()

In [7]:
def clean_text(text):
    """Reduce *text* to lower-case Cyrillic words separated by single spaces.

    Apostrophes are deleted outright (the letters around them stay joined),
    every other non-Cyrillic character is turned into a space, and runs of
    whitespace are collapsed.
    """
    without_apostrophes = text.replace("'", "")
    cyrillic_only = re.sub("[^а-яА-ЯЁё]", " ", without_apostrophes)
    words = cyrillic_only.split()
    return " ".join(words).lower()

# Single Mystem lemmatizer instance, shared by every preprocess_text call.
mystem = Mystem() 
# Russian stop-word list from NLTK (needs the 'stopwords' corpus to be available).
russian_stopwords = stopwords.words("russian")
def preprocess_text(text):
    """Lemmatize *text* with Mystem and return the kept lemmas joined by spaces.

    A lemma is dropped when it is in ``russian_stopwords``, when it is a single
    space, or when its stripped form is found in ``string.punctuation``.
    """
    kept = []
    for lemma in mystem.lemmatize(text.lower()):
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        if lemma.strip() in punctuation:
            continue
        kept.append(lemma)
    return " ".join(kept)

# Russian stop words as a set, used for the membership test in remove_stopwords.
stop_words = set(stopwords.words('russian'))
def remove_stopwords(text):
    """Return *text* without the whitespace-separated words found in ``stop_words``."""
    kept_words = filter(lambda word: word not in stop_words, text.split())
    return ' '.join(kept_words)

# Store the cleaned text as a real DataFrame column. Assigning a *new* name
# through attribute access (recipes.clean_text = ...) only sets a Python
# attribute on the object: it does not show up in recipes.columns and is not
# carried over to copies or slices of the frame.
recipes['clean_text'] = recipes['Text'].apply(clean_text)
#recipes.clean_text[2]

In [37]:
def freq_words(x, terms = 30): 
    """Plot a horizontal bar chart of the most frequent words in *x*.

    x     -- iterable of strings; words are obtained by splitting on whitespace.
    terms -- how many of the most frequent words to show.
    """
    tokens = ' '.join(x).split()
    fdist = nltk.FreqDist(tokens)
    words_df = pd.DataFrame(list(fdist.items()), columns=['word', 'count'])

    # keep only the `terms` most frequent words
    top_words = words_df.nlargest(n=terms, columns="count")

    # visualize words and frequencies
    plt.figure(figsize=(12, 15))
    ax = sns.barplot(data=top_words, x="count", y="word")
    ax.set(ylabel='Word')
    plt.show()
  
# plot the 10 most frequent words
#freq_words(recipes.clean_text, 10)

In [13]:
# Download the NLTK 'stopwords' corpus that stopwords.words(...) reads.
# NOTE(review): stopwords.words("russian") is already called in an earlier
# cell; on a fresh environment this download probably has to run first — confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Strip stop words from the cleaned text, then lemmatize what is left.
recipes.clean_text = (
    recipes.clean_text
    .apply(remove_stopwords)
    .apply(preprocess_text)
)
#freq_words(recipes.clean_text, 30)

In [17]:
# TF-IDF features: terms occurring in more than 80% of documents are ignored
# and the vocabulary is capped at 10000 terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
# Hold out 20% of the rows for validation; fixed seed so the split is repeatable.
xtrain, xval, ytrain, yval = train_test_split(recipes.clean_text, recipes.t1, test_size=0.2, random_state=9)

# Fit the vocabulary/IDF weights on the training texts only, then reuse them
# unchanged for the validation texts.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# One-vs-rest wrapper around a default-parameter logistic regression.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

# Train on the t1 labels; as the last expression, the fitted estimator is
# also displayed as the cell output.
clf.fit(xtrain_tfidf, ytrain)

In [22]:
# Predict t1 labels for the validation features and report the
# micro-averaged F1 score against the true labels.
y_pred = clf.predict(xval_tfidf)
f1_score(yval, y_pred, average="micro")

0.7838307401258638

In [26]:
def infer_tags(q):
    """Predict the tag for text *q* with the in-memory vectorizer and model.

    The text goes through the same three steps that were applied to the
    training data (clean_text, remove_stopwords, preprocess_text) before it is
    vectorized with ``tfidf_vectorizer``.

    Returns the array produced by ``clf.predict`` for the single query.
    """
    q = clean_text(q)
    q = remove_stopwords(q)
    # Lemmatize too: the TF-IDF vocabulary was fitted on lemmatized text, so
    # un-lemmatized word forms would not match it (infer_tags2/3 do the same).
    q = preprocess_text(q)
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return q_pred

In [39]:
# Show five randomly drawn validation rows: the raw text, the prediction for
# the row's preprocessed text, and the true t1 label.
for _ in range(5):
    k = xval.sample(1).index[0]
    predicted = infer_tags(xval[k])
    print("Text: ", recipes.Text[k], "\nPredicted text: ", predicted)
    print("Actual genre: ", recipes.t1[k], "\n")

In [31]:
# Persist the trained classifier and read it back. The files are opened in
# `with` blocks so each handle is flushed and closed deterministically instead
# of being left to the garbage collector.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
# NOTE(review): the dump path is relative while the load path is absolute, so
# this round-trip relies on the working directory being /kaggle/working.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('/kaggle/working/finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [40]:
def infer_tags2(q):
    """Predict the tag for raw text *q* using the model reloaded from disk.

    The text is cleaned, stripped of stop words and lemmatized, vectorized with
    the in-memory ``tfidf_vectorizer`` and passed to ``loaded_model.predict``;
    the resulting array is returned.
    """
    prepared = preprocess_text(remove_stopwords(clean_text(q)))
    features = tfidf_vectorizer.transform([prepared])
    return loaded_model.predict(features)
#infer_tags2('борщ')

In [33]:
# Persist the fitted TF-IDF vectorizer and read it back. `with` blocks make
# sure each file handle is flushed and closed rather than left dangling.
with open("tfidf.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf_vectorizer, tfidf_file)
# NOTE(review): relative dump path vs. absolute load path — this relies on the
# working directory being /kaggle/working.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('/kaggle/working/tfidf.pickle', 'rb') as tfidf_file:
    tfidf_loaded = pickle.load(tfidf_file)

In [35]:
def infer_tags3(q):
    """Predict the tag for raw text *q* using only the objects reloaded from disk.

    The text is cleaned, stripped of stop words and lemmatized, vectorized with
    ``tfidf_loaded`` and passed to ``loaded_model.predict``; the resulting
    array is returned.
    """
    prepared = preprocess_text(remove_stopwords(clean_text(q)))
    features = tfidf_loaded.transform([prepared])
    return loaded_model.predict(features)
# Smoke test of the reloaded vectorizer + model on one example text; [0] takes
# the single predicted label out of the returned array.
infer_tags3('Рецепт простого, быстрого и полезного перекуса! Если любите похрустеть, то это блюдо из цукини просто идеально подойдет! ')[0]

'Закуски'