In [295]:
#!pip install pymystem3

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from pymystem3 import Mystem
import pickle
from string import punctuation

import os
# Print the full path of every file under the input directory first, then
# under the whole /kaggle tree (so input files are listed a second time).
for walk_root in ('/kaggle/input', '/kaggle'):
    for dirname, _, filenames in os.walk(walk_root):
        for filename in filenames:
            print(os.path.join(dirname, filename))

/kaggle/input/products.csv
/kaggle/lib/kaggle/gcp.py
/kaggle/input/products.csv
/kaggle/working/__notebook_source__.ipynb
/kaggle/working/finalized_model.sav
/kaggle/working/tfidf.pickle
/kaggle/working/.ipynb_checkpoints/__notebook_source__-checkpoint.ipynb


In [259]:
# Load the pipe-separated dataset. The first line of the file is skipped
# (skiprows=1) and the four columns are named explicitly instead:
# 'Text' is the free-text description, 't1' is used later as the target
# label; 't2'/'t3' are not referenced in the visible cells.
recipes=pd.read_csv('/kaggle/input/products.csv', 
                 names=['Text','t1','t2','t3'],
                 skiprows=1,
                 sep='|')
#recipes.head()

In [260]:
#recipes.t1.value_counts().plot(kind='bar')

In [261]:
# Peek at one raw description (the row with index label 2) before cleaning.
recipes.Text[2]

'Мягкие и невероятно вкусные рулетики из творожного теста. Их очень легко и быстро можно приготовить из доступных продуктов. Это не займёт много времени. Приготовьте и Вам обязательно понравится. Приятного аппетита.'

In [262]:
def clean_text(text):
    """Reduce a string to lowercase, single-space-separated Cyrillic words.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The text with apostrophes deleted, every other non-Cyrillic
        character turned into a separator, whitespace collapsed and
        letters lowercased. May be the empty string.
    """
    # Apostrophes are deleted (not replaced by a space), so the letters on
    # either side of one stay joined in a single word.
    without_apostrophes = re.sub("\'", "", text)
    # Anything that is not a Russian letter (digits, Latin, punctuation,
    # whitespace) becomes a space.
    cyrillic_only = re.sub("[^а-яА-ЯЁё]", " ", without_apostrophes)
    # split()/join collapses runs of spaces and trims both ends.
    return " ".join(cyrillic_only.split()).lower()

# Shared Mystem instance used by preprocess_text for lemmatization.
mystem = Mystem()

# Load the Russian stopword list. On an environment where the NLTK
# 'stopwords' corpus has not been downloaded yet, the lookup raises
# LookupError; fetch the corpus and retry so this cell works on a fresh
# kernel without relying on a download performed in a later cell.
try:
    russian_stopwords = stopwords.words("russian")
except LookupError:
    nltk.download("stopwords")
    russian_stopwords = stopwords.words("russian")
def preprocess_text(text):
    """Lemmatize `text` and drop stopwords, space tokens and punctuation.

    The lowercased text is passed to the module-level `mystem` lemmatizer;
    the resulting tokens are filtered and re-joined with single spaces.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        Space-joined tokens that survived the filter.
    """
    kept_tokens = []
    for lemma in mystem.lemmatize(text.lower()):
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        # `punctuation` is a str, so this is a substring test: it rejects
        # tokens that strip to a punctuation mark and also tokens that
        # strip to the empty string (pure whitespace).
        if lemma.strip() in punctuation:
            continue
        kept_tokens.append(lemma)

    return " ".join(kept_tokens)

# Russian stopwords as a set, used by remove_stopwords for membership tests.
stop_words = set(stopwords.words('russian'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated word found in `stop_words` removed.

    Words are compared exactly as they appear (no case folding here), and
    the remaining words are joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)

In [263]:
# Create a real DataFrame column with item assignment. Attribute assignment
# (`recipes.clean_text = ...`) for a name that is not yet a column only sets
# a plain Python attribute on the object (pandas warns about this), so the
# data would not be part of the frame and would be lost on any copy or slice.
# Once the column exists, attribute-style reads and re-assignments work.
recipes['clean_text'] = recipes.Text.apply(clean_text)
recipes.clean_text[2]

  """Entry point for launching an IPython kernel.


'мягкие и невероятно вкусные рулетики из творожного теста их очень легко и быстро можно приготовить из доступных продуктов это не займёт много времени приготовьте и вам обязательно понравится приятного аппетита'

In [264]:
def freq_words(x, terms = 30):
    """Plot the most frequent whitespace-separated words in a collection of texts.

    Parameters
    ----------
    x : iterable of str
        Texts to count words in.
    terms : int, default 30
        Number of top words to show.

    Returns
    -------
    None
        The chart is displayed with `plt.show()`.
    """
    # Treat all texts as one long document and split it on whitespace.
    tokens = ' '.join(x).split()
    counts = nltk.FreqDist(tokens)
    counts_df = pd.DataFrame({
        'word': list(counts.keys()),
        'count': list(counts.values()),
    })

    # Keep only the `terms` rows with the highest counts.
    top_words = counts_df.nlargest(columns="count", n=terms)

    # Horizontal bars: count on the x axis, one word per row.
    plt.figure(figsize=(12, 15))
    ax = sns.barplot(data=top_words, x="count", y="word")
    ax.set(ylabel='Word')
    plt.show()
  
# plot the 10 most frequent words
#freq_words(recipes.clean_text, 10)

In [265]:
# Make sure the NLTK 'stopwords' corpus is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [266]:
# Strip Russian stopwords from every cleaned text (overwrites clean_text).
recipes.clean_text = recipes.clean_text.apply(remove_stopwords)

In [267]:
# Lemmatize every cleaned text with preprocess_text (overwrites clean_text).
recipes.clean_text = recipes.clean_text.apply(preprocess_text)

In [268]:
#freq_words(recipes.clean_text, 30)

In [269]:
# TF-IDF feature extractor for the cleaned texts.
# NOTE(review): max_df=0.8 / max_features=10000 presumably drop very common
# terms and cap the vocabulary size -- confirm against the scikit-learn docs.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [270]:
# Hold out 20% of the rows for validation; random_state=9 fixes the shuffle.
# Features are the cleaned texts, the target is the 't1' column.
xtrain, xval, ytrain, yval = train_test_split(recipes.clean_text, recipes.t1, test_size=0.2, random_state=9)

In [271]:
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the validation split.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [272]:
# Logistic regression with default settings, wrapped in a one-vs-rest classifier.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [273]:
# Train the classifier on the TF-IDF features of the training split.
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [275]:
# Predict labels for the validation split and report the micro-averaged F1.
y_pred = clf.predict(xval_tfidf)
f1_score(yval, y_pred, average="micro")

0.7874720357941835

In [276]:
#y_pred
#xval

In [278]:
def infer_tags(q):
    """Predict the category label for a raw text query with the in-memory model.

    The query goes through the same three-step normalization that was
    applied to the training texts (clean_text -> remove_stopwords ->
    preprocess_text) before being vectorized with `tfidf_vectorizer` and
    classified with `clf`.

    Parameters
    ----------
    q : str
        Raw query text.

    Returns
    -------
    The result of `clf.predict` for the single query (an array holding one
    predicted label).
    """
    q = clean_text(q)
    q = remove_stopwords(q)
    # Lemmatize like the training corpus; without this step the query tokens
    # are inflected word forms that do not match the lemmatized vocabulary
    # the vectorizer was fitted on.
    q = preprocess_text(q)
    q_vec = tfidf_vectorizer.transform([q])
    return clf.predict(q_vec)

In [279]:
# Draw five random validation rows. The per-row report is commented out, so
# each pass currently only computes the sampled row's index label `k`.
for i in range(5): 
  k = xval.sample(1).index[0] 
  #print("Text: ", recipes.Text[k], "\nPredicted text: ", infer_tags(xval[k])), print("Actual genre: ",recipes.t1[k], "\n")

In [280]:
# Probe the model with a blank query to see which label it falls back to.
infer_tags(' ')

array(['Горячие блюда'], dtype='<U32')

In [282]:
# Persist the fitted classifier, then load it back to verify the round trip.
filename = 'finalized_model.sav'
# Context managers close the file handles deterministically; the dump is
# flushed to disk before the file is reopened for reading.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
# Read from the same path that was just written, instead of a hard-coded
# absolute path that only matches when the working directory is
# /kaggle/working.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [289]:
def infer_tags2(q):
    """Predict the label for a raw query using the unpickled classifier.

    The query is normalized with clean_text, remove_stopwords and
    preprocess_text (in that order), vectorized with the in-memory
    `tfidf_vectorizer`, and classified by `loaded_model`.

    Returns the output of `loaded_model.predict` for the single query.
    """
    normalized = preprocess_text(remove_stopwords(clean_text(q)))
    features = tfidf_vectorizer.transform([normalized])
    return loaded_model.predict(features)
#infer_tags2('борщ')

In [285]:
# Persist the fitted TF-IDF vectorizer, then load it back to verify the
# round trip. Context managers close the file handles, and the file is read
# from the same path it was written to rather than a hard-coded absolute
# path that only matches when the working directory is /kaggle/working.
tfidf_path = "tfidf.pickle"
with open(tfidf_path, "wb") as tfidf_file:
    pickle.dump(tfidf_vectorizer, tfidf_file)
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(tfidf_path, "rb") as tfidf_file:
    tfidf_loaded = pickle.load(tfidf_file)

In [290]:
def infer_tags3(q):
    """Predict the label for a raw query using only the unpickled artifacts.

    Runs the query through clean_text, remove_stopwords and preprocess_text,
    vectorizes it with `tfidf_loaded` and classifies it with `loaded_model`.

    Returns the output of `loaded_model.predict` for the single query.
    """
    prepared = q
    # Apply the normalization steps in the same order as for the training data.
    for step in (clean_text, remove_stopwords, preprocess_text):
        prepared = step(prepared)
    return loaded_model.predict(tfidf_loaded.transform([prepared]))
# Smoke test of the reloaded pipeline: show the single predicted label for a sample query.
infer_tags3('куры гриль, охуеть')[0]