In [None]:
Ximport warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import json
import re, string
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

import nltk
nltk.download('stopwords')

from nltk import word_tokenize
nltk.download('punkt')

from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from pymystem3 import Mystem
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def preprocess_data(df_1):
  """Clean the ``answer`` column and add normalised text columns.

  Rows whose ``answer`` is missing or blank are dropped and the index is
  renumbered from zero. The following columns are then added:

  * ``Preproccessed`` - lower-cased answer with punctuation and digits
    replaced by spaces and whitespace runs collapsed;
  * ``text_stem`` - ``Preproccessed`` tokens without stop words, stemmed with
    the Russian or the English Snowball stemmer depending on the script;
  * ``text_sw`` - ``Preproccessed`` tokens without stop words;
  * ``text_lemm`` - ``text_sw`` lemmatised with Mystem (falls back to the
    ``text_sw`` value for a row if lemmatisation raises).

  Args:
    df_1: DataFrame with at least an ``answer`` column.

  Returns:
    A new DataFrame; the frame passed in is not modified.
  """

  # Preparation
  def remove_stickers(text):
    """Strip every character that is not a word character, whitespace or one of ``.!?,:;$/"'``."""
    if text != text:  # NaN is the only value that differs from itself
      return np.nan
    # Intended to clean up sticker markup such as "[sticker]"; note that only
    # the disallowed characters (brackets, emoji, ...) are removed, the word
    # between the brackets is kept.
    pattern = r"[^\w.!?,:;$\s/\"\']*"
    cleaned_text = re.sub(pattern, "", str(text))
    return cleaned_text

  def remove_punctuation(text):
      """Replace every ASCII punctuation character with a space."""
      return "".join([ch if ch not in string.punctuation else ' ' for ch in text])

  def remove_numbers(text):
      """Replace every digit with a space."""
      return ''.join([i if not i.isdigit() else ' ' for i in text])

  def remove_multiple_spaces(text):
    """Collapse each run of whitespace into a single space."""
    return re.sub(r'\s+', ' ', text)

  mystem = Mystem()
  # A set gives constant-time membership tests inside the per-token loops.
  stop_words = set(stopwords.words("russian"))
  stop_words.update(stopwords.words("english"))
  stop_words.update(['…', '«', '»', '...'])

  def lemmatize_text(text):
      """Lemmatise ``text`` with Mystem and drop stop words and whitespace tokens."""
      tokens = mystem.lemmatize(text.lower())
      # Skip every whitespace-only token (not just a single space) so that no
      # stray separators end up in the joined result.
      tokens = [token for token in tokens if token.strip() and token not in stop_words]
      return " ".join(tokens)

  def remove_stop_words(text):
    """Tokenise ``text`` and drop stop words."""
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and token != ' ']
    return " ".join(tokens)

  # Removing rows with blank cells and NaN's.
  # The pattern must be \s* (whitespace); a bare "s*" would only match empty
  # strings and runs of the letter "s".
  df_1 = df_1.replace(r'^\s*$', np.nan, regex=True)
  df_1 = df_1.dropna(subset=['answer'])

  # Renumber the rows 0..n-1 after the drop (no shuffling happens here).
  df_1 = df_1.reset_index(drop=True)

  # Removing stickers. Whole-column assignment instead of chained
  # `df['answer'][i] = ...`, which may write to a temporary copy.
  df_1['answer'] = df_1['answer'].astype(str).map(remove_stickers)

  # Fast preproccessing
  preproccessing = lambda text: (remove_multiple_spaces(remove_numbers(remove_punctuation(text.lower()))))
  df_1['Preproccessed'] = df_1['answer'].map(preproccessing)

  # Stemming: choose the stemmer per token so the Russian result is no longer
  # overwritten by the English one. Text is already lower-cased at this point.
  stemmer = SnowballStemmer("russian")
  stemmer_eng = SnowballStemmer("english")
  cyrillic = re.compile(r'[а-яё]')

  def stem_token(token):
      """Stem ``token`` with the Russian stemmer if it contains Cyrillic letters, else the English one."""
      if cyrillic.search(token):
          return stemmer.stem(token)
      return stemmer_eng.stem(token)

  stemmed_texts_list = []
  for text in tqdm(df_1['Preproccessed']):
      tokens = word_tokenize(text)
      stemmed_tokens = [stem_token(token) for token in tokens if token not in stop_words]
      stemmed_texts_list.append(" ".join(stemmed_tokens))

  df_1['text_stem'] = stemmed_texts_list

  # Removing stop words
  sw_texts_list = []
  for text in tqdm(df_1['Preproccessed']):
      sw_texts_list.append(remove_stop_words(text))

  df_1['text_sw'] = sw_texts_list

  # Lemmatize
  lemm_texts_list = []
  for text in tqdm(df_1['text_sw']):
      try:
          lemm_texts_list.append(lemmatize_text(text))
      except Exception as e:
          # Report the failure but keep one entry per row; otherwise the list
          # would be shorter than the frame and the assignment below would raise.
          print(e)
          lemm_texts_list.append(text)

  df_1['text_lemm'] = lemm_texts_list

  print(df_1)
  return df_1

In [None]:

import os

def build_data_frame_from_jsons(directory='data'):
  """Collect labelled answers from every JSON file in ``directory``.

  Each file is expected to hold an object with a ``question`` string and an
  ``answers`` list whose items carry ``answer`` and ``sentiment`` keys.
  Sub-directories are ignored.

  Rows labelled ``context sentiment`` or ``context cluster`` are discarded and
  the label ``negative`` is renamed to ``negatives``. The index is not
  renumbered after the rows are discarded.

  Args:
    directory: Folder to scan. Defaults to ``'data'``.

  Returns:
    DataFrame with the columns ``answer``, ``question`` and ``semantic``
    (empty, but with those columns, when no files are found).

  Raises:
    FileNotFoundError: if ``directory`` does not exist.
    KeyError: if a file lacks one of the expected keys.
  """
  columns = ['answer', 'question', 'semantic']

  # os.scandir yields entries in arbitrary order; sorting keeps the row order
  # (and therefore any seeded train/test split) reproducible. The context
  # manager releases the directory handle.
  with os.scandir(directory) as entries:
    paths = sorted(entry.path for entry in entries if entry.is_file())

  # Gather plain records and build the frame once instead of concatenating a
  # new frame per file.
  records = []
  for path in paths:
    # utf-8-sig decodes UTF-8 and strips a leading BOM if present,
    # independently of the platform's default encoding.
    with open(path, encoding='utf-8-sig') as f:
      text = json.load(f)
    question = text['question']
    for item in text['answers']:
      records.append({
          'answer': item['answer'],
          'question': question,
          'semantic': item['sentiment'],
      })

  df = pd.DataFrame(records, columns=columns)

  # Drop the rows labelled as context markers; copy so the relabelling below
  # writes to an independent frame.
  df = df[~df['semantic'].isin(['context sentiment', 'context cluster'])].copy()
  df.loc[df["semantic"] == "negative", "semantic"] = "negatives"

  return df

# Build the frame and show it as this cell's output; the result is not stored.
build_data_frame_from_jsons()

Unnamed: 0,answer,question,semantic
1,adkar,Ваши открытия за время обучения на модуле,neutrals
2,все,Ваши открытия за время обучения на модуле,neutrals
3,да,Ваши открытия за время обучения на модуле,neutrals
4,знакомства,Ваши открытия за время обучения на модуле,neutrals
5,методы и действия,Ваши открытия за время обучения на модуле,neutrals
...,...,...,...
5246,через практику,Как проверить практические навыки?,neutrals
5247,дэ,Как проверить практические навыки?,neutrals
5248,кейс,Как проверить практические навыки?,neutrals
5249,собеседование,Как проверить практические навыки?,neutrals


In [None]:
def train(print_accuracy=False, max_iter=1000):
  """Fit a bag-of-words / TF-IDF / logistic-regression pipeline on the answers.

  The data is loaded with ``build_data_frame_from_jsons`` and cleaned with
  ``preprocess_data``; the ``text_sw`` column is the feature and ``semantic``
  the target. 30% of the rows are held out with a fixed random state.

  Args:
    print_accuracy: When true, print accuracy and a per-class report computed
      on the held-out rows.
    max_iter: Iteration cap for the logistic-regression solver. The previous
      hard-coded value of 40 stopped the solver before convergence.

  Returns:
    The fitted ``Pipeline``.
  """
  # Building data frame

  df_1 = build_data_frame_from_jsons()
  df_1 = preprocess_data(df_1)

  # Building model/pipeline

  X = df_1['text_sw']
  y = df_1['semantic']

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

  logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=max_iter)),
               ])
  logreg.fit(X_train, y_train)

  if print_accuracy:
    y_pred = logreg.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Report rows are labelled from the classifier's own class list. Passing
    # `unique()` (order of appearance) as target_names could attach the wrong
    # name to a row, and fails when a class is missing from the held-out rows.
    print(classification_report(y_test, y_pred, labels=logreg.classes_, zero_division=0))

  return logreg

In [None]:
from sklearn.naive_bayes import MultinomialNB
def train_nb():
  """Fit a bag-of-words / TF-IDF / multinomial naive Bayes pipeline and print its metrics.

  The data is loaded with ``build_data_frame_from_jsons`` and cleaned with
  ``preprocess_data``; the ``text_sw`` column is the feature and ``semantic``
  the target. 30% of the rows are held out with a fixed random state, and
  accuracy plus a per-class report on those rows are printed.

  Returns:
    The fitted ``Pipeline``.
  """
  df_1 = build_data_frame_from_jsons()
  df_1 = preprocess_data(df_1)

  # Building model/pipeline

  X = df_1['text_sw']
  y = df_1['semantic']

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
  nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)
  y_pred = nb.predict(X_test)
  # scikit-learn metrics take (y_true, y_pred) in that order.
  print('accuracy %s' % accuracy_score(y_test, y_pred))
  # Report rows are labelled from the classifier's own class list. Passing
  # `unique()` (order of appearance) as target_names could attach the wrong
  # name to a row, and fails when a class is missing from the held-out rows.
  print(classification_report(y_test, y_pred, labels=nb.classes_, zero_division=0))

  return nb


In [None]:
import pickle

if __name__ == '__main__':
  # Train the logistic-regression pipeline and print its hold-out metrics.
  model = train(print_accuracy=True)

  # Smoke check. The prediction used to be computed and thrown away; print it
  # so the check is actually visible.
  print(model.predict(["отличная зарплата", "плохая зарплата"]))

  # Persist the fitted pipeline. Only unpickle this file from a trusted
  # location: loading a pickle executes arbitrary code.
  filename = 'model_v1.pk'
  with open(filename, 'wb') as file:
    pickle.dump(model, file)

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


  0%|          | 0/5108 [00:00<?, ?it/s]

  0%|          | 0/5108 [00:00<?, ?it/s]

  0%|          | 0/5108 [00:00<?, ?it/s]

                           answer                                   question  \
0                           adkar  Ваши открытия за время обучения на модуле   
1                             все  Ваши открытия за время обучения на модуле   
2                              да  Ваши открытия за время обучения на модуле   
3                      знакомства  Ваши открытия за время обучения на модуле   
4               методы и действия  Ваши открытия за время обучения на модуле   
...                           ...                                        ...   
5103              через  практику         Как проверить практические навыки?   
5104                           дэ         Как проверить практические навыки?   
5105                         кейс         Как проверить практические навыки?   
5106                собеседование         Как проверить практические навыки?   
5107  через смуляторы и тренажёры         Как проверить практические навыки?   

      semantic                Preprocce

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Runs the full load / preprocess / fit cycle again and rebinds `model` to the
# newly fitted pipeline; hold-out metrics are printed.
model = train(print_accuracy=True)

  0%|          | 0/5096 [00:00<?, ?it/s]

  0%|          | 0/5096 [00:00<?, ?it/s]

  0%|          | 0/5096 [00:00<?, ?it/s]

                             answer  \
0                            сложно   
1                            тяжело   
2                          смятение   
3                       собранность   
4                 сосредоточенность   
...                             ...   
5091  совместная работа с коллегами   
5092                развитие связей   
5093      ура товарищи! поздравляю!   
5094         эмоциональная разрядка   
5095                           юмор   

                                         question   semantic  \
0     Опишите одним словом ваше текущее состояние  negatives   
1     Опишите одним словом ваше текущее состояние  negatives   
2     Опишите одним словом ваше текущее состояние  negatives   
3     Опишите одним словом ваше текущее состояние  positives   
4     Опишите одним словом ваше текущее состояние  positives   
...                                           ...        ...   
5091                     Что было ценным сегодня?   neutrals   
5092               

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Spot-check the pipeline currently bound to `model` on two short phrases.
model.predict(["отличная зарплата", "плохая зарплата"])

array(['positives', 'neutrals'], dtype=object)