In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import re, string
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

import nltk
nltk.download('stopwords')

from nltk import word_tokenize
nltk.download('punkt')

from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from pymystem3 import Mystem
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def preprocess_data(df_1):

  # Preparation
  def remove_stickers(text):
    if text != text:
      return np.nan
    # Паттерн для поиска стикеров в формате [стикер]
    pattern = r"[^\w.!?,:;$\s/\"\']*"
    cleaned_text = re.sub(pattern, "", str(text))
    return cleaned_text

  def remove_punctuation(text):
      return "".join([ch if ch not in string.punctuation else ' ' for ch in text])

  def remove_numbers(text):
      return ''.join([i if not i.isdigit() else ' ' for i in text])

  def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text, flags=re.I)

  mystem = Mystem()
  russian_stopwords = stopwords.words("russian")
  russian_stopwords.extend(['…', '«', '»', '...'])
  def lemmatize_text(text):
      tokens = mystem.lemmatize(text.lower())
      tokens = [token for token in tokens if token not in russian_stopwords and token != " "]
      text = " ".join(tokens)
      return text

  russian_stopwords = stopwords.words("russian")
  russian_stopwords.extend(['…', '«', '»', '...', 'т.д.', 'т', 'д'])
  def remove_stop_words(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in russian_stopwords and token != ' ']
    return " ".join(tokens)

  def lemmatize_text(text):
    text_lem = mystem.lemmatize(text)
    tokens = [token for token in text_lem if token != ' ' and token not in russian_stopwords]
    return " ".join(tokens)

  # Removing rows with blank cells and NaN's
  df_1 = df_1.replace(r'^s*$', float('NaN'), regex = True)
  df_1 = df_1.dropna(subset=['Text'])

  # Shuffling
  df_1 = df_1.reset_index()
  df_1 = df_1.drop(['index'], axis=1)

  # Removing Stickers
  for i in range(df_1.shape[0]):
    df_1['Text'][i] = remove_stickers(str(df_1['Text'][i]))

  # Fast preproccessing
  preproccessing = lambda text: (remove_multiple_spaces(remove_numbers(remove_punctuation(text.lower()))))
  df_1['Preproccessed'] = list(map(preproccessing, df_1['Text']))

  # Stemming
  stemmer = SnowballStemmer("russian")
  stemmed_texts_list = []
  for text in tqdm(df_1['Preproccessed']):
      tokens = word_tokenize(text)
      stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in russian_stopwords]
      text = " ".join(stemmed_tokens)
      stemmed_texts_list.append(text)

  df_1['text_stem'] = stemmed_texts_list

  # Removing stop words
  sw_texts_list = []
  for text in tqdm(df_1['Preproccessed']):
      text = remove_stop_words(text)
      sw_texts_list.append(text)

  df_1['text_sw'] = sw_texts_list

  # Lemmatize
  lemm_texts_list = []
  for text in tqdm(df_1['text_sw']):
      try:
          text = lemmatize_text(text)
          lemm_texts_list.append(text)
      except Exception as e:
          print(e)

  df_1['text_lemm'] = lemm_texts_list

  return df_1

In [3]:
def build_data_frame():
  data = pd.read_csv("dataset_fixed.csv")

  df = pd.DataFrame()
  df.loc[:, 'Class'] = data['Class']
  df.loc[:, 'Text'] = data['Text']

  df_1 = pd.DataFrame(columns = ["Text", "Class", "Preproccessed"])

  # proxy - доверенность
  # contract - договор
  # act - акт
  # application - заявление
  # order - приказ
  # invoice - счет
  # bill - приложение
  # arrangement - соглашение
  # contract offer - договор оферты
  # statute - устав
  # determination - решение

  df_1.loc[:, 'Class'] = df['Class']
  df_1.loc[:, 'Text'] = df['Text']

  # df_pdf_add = pd.read_excel("test_pdf.xlsx")
  # df_docx_add = pd.read_excel("test_docx.xlsx")

  # df_noise = pd.read_csv("dataset.csv")
  # df_noise = df_noise.drop(['Unnamed: 0'], axis=1)

  # df_2 = pd.concat([df_1, df_pdf_add, df_docx_add], ignore_index=True)

  return df_1

In [4]:
def train(print_accuracy=False):
  # Building data frame

  df_1 = build_data_frame()
  df_1 = preprocess_data(df_1)

  # Building model/pipeline

  X = df_1['text_lemm']
  y = df_1['Class']

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
  classes = df_1['Class'].unique()

  logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=50)),
               ])
  logreg.fit(X_train, y_train)

  if print_accuracy:
    y_pred = logreg.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=classes))

  return logreg

In [6]:
import pickle

if __name__ == '__main__':
	model = train(print_accuracy=True)

	filename = 'model_v4.pk'
	with open(filename, 'wb') as file:
		pickle.dump(model, file)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Text'][i] = remove_stickers(str(df_1['Text'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Text'][i] = remove_stickers(str(df_1['Text'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Text'][i] = remove_stickers(str(df_1['Text'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata

  0%|          | 0/6984 [00:00<?, ?it/s]

  0%|          | 0/6984 [00:00<?, ?it/s]

  0%|          | 0/6984 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.9875954198473282
                precision    recall  f1-score   support

   arrangement       1.00      1.00      1.00       172
   application       0.99      0.96      0.97       155
         proxy       1.00      1.00      1.00       408
      contract       0.97      0.95      0.96       152
           act       0.98      1.00      0.99       238
         order       1.00      1.00      1.00       124
 determination       1.00      0.96      0.98        93
       statute       1.00      0.98      0.99        94
contract offer       0.99      1.00      1.00       365
       invoice       0.96      1.00      0.98       213
          bill       0.99      0.96      0.98        82

      accuracy                           0.99      2096
     macro avg       0.99      0.98      0.98      2096
  weighted avg       0.99      0.99      0.99      2096



In [16]:
# Generating csv file with all datasets for extending it

data = pd.read_csv("sample.csv")

df = pd.DataFrame()
df.loc[:, 'Class'] = data['class']
df.loc[:, 'Text'] = data['text']

df_1 = pd.DataFrame(columns = ["Text", "Class"])
df_1.loc[:, 'Class'] = df['Class']
df_1.loc[:, 'Text'] = df['Text']

df_pdf_add = pd.read_excel("test_pdf.xlsx")
df_docx_add = pd.read_excel("test_docx.xlsx")

df_2 = pd.concat([df_1, df_pdf_add, df_docx_add], ignore_index=True)

df_2.to_csv("penis.csv")