In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
import pickle

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the dataset folder first, and then
# under the whole /kaggle tree.
for root_dir in ('/kaggle/input', '/kaggle'):
    for folder, _subdirs, files in os.walk(root_dir):
        for fname in files:
            print(os.path.join(folder, fname))
        

# Any results you write to the current directory are saved as output.

In [None]:
!pip list

In [None]:
# Read the pipe-delimited products file. The file's first line is skipped and
# the four columns are named explicitly instead.
recipes = pd.read_csv(
    '/kaggle/input/products.csv',
    sep='|',
    skiprows=1,
    names=['Text', 't1', 't2', 't3'],
)
recipes.head()

In [None]:
# Bar chart of how many rows carry each value of the t1 column.
recipes['t1'].value_counts().plot.bar()

In [None]:
# Look at one raw text entry (index label 2) before any cleaning.
recipes['Text'][2]

In [None]:
def clean_text(text):
    """Reduce raw text to lowercase Cyrillic words separated by single spaces.

    Apostrophes are deleted outright (so the two halves of a word are joined),
    every other character that is not a Russian letter (Latin letters, digits,
    punctuation, whitespace) becomes a space, runs of whitespace are collapsed,
    and the result is lowercased.

    Args:
        text: the raw string to clean. Non-string values such as None or the
            float NaN that pandas uses for missing cells are accepted.

    Returns:
        The cleaned string; "" when the input is not a string or contains no
        Russian letters.
    """
    # Missing cells arrive as None/NaN; treat them as empty text instead of
    # letting re.sub raise TypeError in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ""
    # drop apostrophes without leaving a gap
    text = text.replace("'", "")
    # keep Russian letters only; everything else turns into a separator
    text = re.sub("[^а-яА-ЯЁё]", " ", text)
    # collapse whitespace and lowercase
    return ' '.join(text.split()).lower()

In [None]:
# Store the cleaned text as a real DataFrame column. Assigning through a new
# attribute name (recipes.clean_text = ...) does not create a column: pandas
# only sets a plain Python attribute on this one object and warns about it.
recipes['clean_text'] = recipes['Text'].apply(clean_text)
recipes['clean_text'][2]

In [None]:
def freq_words(x, terms=30):
    """Plot a horizontal bar chart of the most frequent words in a text collection.

    Args:
        x: iterable of strings; words are obtained by splitting on whitespace.
        terms: how many of the most frequent words to plot (default 30).

    Returns:
        None. The chart is displayed with plt.show().
    """
    # Flatten every text into one list of whitespace-separated tokens.
    tokens = ' '.join(x).split()
    word_counts = nltk.FreqDist(tokens)
    words_df = pd.DataFrame({
        'word': list(word_counts.keys()),
        'count': list(word_counts.values()),
    })

    # keep only the `terms` most frequent words
    top_words = words_df.nlargest(n=terms, columns="count")

    # one bar per word, bar length = number of occurrences
    plt.figure(figsize=(12, 15))
    axes = sns.barplot(data=top_words, x="count", y="word")
    axes.set(ylabel='Word')
    plt.show()
  
# plot the 30 most frequent words of the cleaned text
freq_words(recipes.clean_text, terms=30)

In [None]:
# Fetch NLTK's stop-word corpus so that stopwords.words(...) can be used below.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Russian stop-word list from NLTK, held in a set so `in` checks are cheap.
stop_words = set(stopwords.words('russian'))

def remove_stopwords(text, stopword_set=None):
    """Return `text` with every stop word removed.

    Args:
        text: string whose whitespace-separated words are filtered.
        stopword_set: optional container of words to drop. When omitted, the
            module-level `stop_words` collection is used, which is the
            behaviour existing one-argument callers rely on.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words
    return ' '.join(w for w in text.split() if w not in stopword_set)

# Strip stop words from every cleaned text, overwriting the previous values.
recipes.clean_text = recipes.clean_text.apply(remove_stopwords)

In [None]:
# Re-plot the 30 most frequent words now that stop words have been removed.
freq_words(recipes.clean_text, terms=30)

In [None]:
# TF-IDF features: terms found in more than 80% of the documents are ignored
# and the vocabulary is capped at the 10,000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    max_df=0.8,
)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
xtrain, xval, ytrain, yval = train_test_split(
    recipes.clean_text,
    recipes.t1,
    test_size=0.2,
    random_state=9,
)

In [None]:
# Learn the vocabulary and IDF weights from the training texts only, then
# reuse that fitted vectorizer to encode the validation texts.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [None]:
# One-vs-rest wrapper around a logistic regression with default settings:
# one binary model is fitted per class.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [None]:
# Train the classifier on the TF-IDF training features and their t1 labels.
clf.fit(xtrain_tfidf, ytrain)

In [None]:
# Predict a label for every validation text.
y_pred = clf.predict(xval_tfidf)

In [None]:
# Micro-averaged F1 of the validation predictions against the true labels.
f1_score(yval, y_pred, average="micro")

In [None]:
# Inspect the predicted validation labels.
y_pred

In [None]:
# Inspect the validation texts (already cleaned, stop words removed).
xval

In [None]:
def infer_tags(q):
    """Predict a label for one raw text with the in-memory vectorizer and model.

    The text goes through the same steps as the training data: clean_text,
    remove_stopwords, then the fitted tfidf_vectorizer.

    Args:
        q: raw input text.

    Returns:
        Whatever clf.predict returns for this single-row input.
    """
    prepared = remove_stopwords(clean_text(q))
    features = tfidf_vectorizer.transform([prepared])
    return clf.predict(features)

In [None]:
# Spot-check five randomly drawn validation rows: show the raw text, the
# model's prediction and the true t1 value.
# NOTE(review): the printed labels say "text"/"genre" although the values are
# t1 predictions and labels - confirm the intended wording.
for _round in range(5):
    k = xval.sample(1).index[0]
    predicted = infer_tags(xval[k])
    print("Text: ", recipes.Text[k], "\nPredicted text: ", predicted)
    print("Actual genre: ", recipes.t1[k], "\n")

In [None]:
# Sanity check: a blank query has no words left after cleaning.
infer_tags(' ')

In [None]:
# Display the fitted classifier object.
clf

In [None]:
# Persist the fitted classifier to the current working directory. The `with`
# block closes (and therefore flushes) the file before anything reads it back.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the saved classifier from disk; the `with` block closes the file.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/kaggle/working/finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
def infer_tags2(q):
    """Predict a label for one raw text using the model reloaded from disk.

    Preprocessing is the same as for training (clean_text, remove_stopwords),
    and the in-memory tfidf_vectorizer is still used for the features; only
    the classifier is the unpickled loaded_model.

    Args:
        q: raw input text.

    Returns:
        Whatever loaded_model.predict returns for this single-row input.
    """
    prepared = remove_stopwords(clean_text(q))
    features = tfidf_vectorizer.transform([prepared])
    return loaded_model.predict(features)

# Try the reloaded model on a single word.
infer_tags2('борщ')

In [None]:
# Persist the fitted TF-IDF vectorizer next to the model. The `with` block
# closes (and therefore flushes) the file before anything reads it back.
with open("tfidf.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [None]:
# Reload the saved vectorizer from disk; the `with` block closes the file.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/kaggle/working/tfidf.pickle', 'rb') as vectorizer_file:
    tfidf_loaded = pickle.load(vectorizer_file)

In [None]:
def infer_tags3(q):
    """Predict a label for one raw text using only artifacts reloaded from disk.

    Preprocessing is the same as for training (clean_text, remove_stopwords);
    both the vectorizer (tfidf_loaded) and the classifier (loaded_model) are
    the unpickled copies.

    Args:
        q: raw input text.

    Returns:
        Whatever loaded_model.predict returns for this single-row input.
    """
    prepared = remove_stopwords(clean_text(q))
    features = tfidf_loaded.transform([prepared])
    return loaded_model.predict(features)

# Try the fully reloaded pipeline on a single word.
infer_tags3('борщ')

In [None]:
# Take the first (and only) element of the returned predictions.
infer_tags3('борщ')[0]