In [None]:
# Multiclass classification of short Russian-language texts
# (descriptions of tender purchases).

# train.csv contains 'index', 'proc_name' (text description in Russian) and
# 'target' - the ground-truth class label;

# test.csv contains only 'index' and 'proc_name'.

import pandas as pd
import numpy as np
import datetime
import string
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import utils

from pymystem3 import Mystem

import nltk
from nltk.corpus import stopwords

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from tqdm import tqdm

In [None]:

# Seed shared by every seeded step of the notebook
rs = 42

# Base folder; the input and output CSV paths are derived from it
t_path = 'C:/some_folder/'
path_train, path_test, result_path = (
    '{}{}'.format(t_path, file_name)
    for file_name in ('train.csv', 'test.csv', 'result.csv')
)

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('russian') is read from it later on
nltk.download("stopwords")

In [None]:
# Load the train and test CSV files into dataframes
train_df, test_df = [
    pd.read_csv(csv_path, sep=',') for csv_path in (path_train, path_test)
]

In [None]:
# First five rows of the training data
train_df.head(5)

In [None]:
# Last five rows of the training data
train_df.tail(5)

In [None]:
# First five rows of the test data
test_df.head(5)

In [None]:
# Number of training rows per 'target' class
train_df['target'].value_counts()

In [None]:
# Report how many missing values each training column contains
for col in train_df.columns:
    missing_count = train_df[col].isnull().sum()
    print('na values in ' + col + '...' + str(missing_count))

In [None]:
# Text cleaning
def clean_text(text):
    """Normalize a raw description before lemmatization.

    The text is lowercased, digits are deleted, punctuation (including
    underscores) is replaced by spaces, and every run of whitespace -
    line breaks included - is collapsed into a single space.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (for example the float NaN
        pandas produces for an empty CSV cell, or None) is treated as an
        empty description.

    Returns
    -------
    str
        The cleaned text with no leading, trailing or repeated whitespace.
    """
    # A missing CSV cell is a float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Digits are deleted outright (not replaced by a space)
    text = re.sub(r'\d+', '', text)

    # '_' counts as a word character for \w, so it has to be listed explicitly
    text = re.sub(r'[^\w\s]|_', ' ', text)

    # split() treats \n, \r and runs of spaces alike, so this both replaces
    # line breaks and removes the extra spaces left by the substitutions above
    return ' '.join(text.split())

In [None]:
# Clean the 'proc_name' column of both dataframes
for frame in (train_df, test_df):
    frame['proc_name'] = frame['proc_name'].apply(clean_text)

In [None]:
# Lemmatizing with the help of Pymystem3

# Trying to minimize Mystem() calls - that is why the whole series is joined into one big string;
# otherwise lemmatization is too slow

def lemmatize_column(df, col_name):
    """Lemmatize a text column with a single Mystem call.

    All values of ``df[col_name]`` are joined with the ``'<<>>'`` marker,
    lemmatized as one string, whitespace-normalized and split on the same
    marker again. The pieces are stored in ``df[col_name + '_lemma']`` and
    the source column is dropped. Timestamps are printed before and after
    so the duration can be read off the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col_name : str
        Name of the text column to lemmatize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    print(datetime.datetime.now())

    separator = '<<>>'
    analyzer = Mystem()

    # One long string for the whole column -> one lemmatize() call
    merged_text = separator.join(list(df[col_name]))
    lemmatized_text = ''.join(analyzer.lemmatize(merged_text))

    # Collapse every run of whitespace into a single space
    normalized_text = ' '.join(lemmatized_text.split())

    # NOTE(review): relies on the marker coming back from Mystem unchanged, so
    # that the number of pieces equals the number of rows - confirm
    df[col_name + '_lemma'] = normalized_text.split(separator)
    df.drop([col_name], axis=1, inplace=True)

    print(datetime.datetime.now())

    return df

In [None]:
# Lemmatize the descriptions: 'proc_name' is replaced by 'proc_name_lemma' in both frames
train_df = lemmatize_column(train_df, 'proc_name')
test_df = lemmatize_column(test_df, 'proc_name')

In [None]:
# Replacing missing descriptions after lemmatization.
# lemmatize_column fills 'proc_name_lemma' from str.split, so a description
# that ended up empty is an empty string rather than NaN; both are handled.
for lemma_df in (train_df, test_df):
    lemma_col = lemma_df['proc_name_lemma']
    s = lemma_col.isnull() | (lemma_col.astype(str).str.strip() == '')
    lemma_df.loc[s, 'proc_name_lemma'] = 'unknown'

In [None]:
# Russian stop-word list from the NLTK stopwords corpus
russian_stopwords = stopwords.words('russian')

In [None]:
# Last five training rows after lemmatization
train_df.tail(5)

In [None]:
# Stop-words (Nltk library)
def remove_russian_stopwords(text):
    """Return ``text`` without the tokens listed in ``russian_stopwords``.

    The text is split on single spaces, every token found in the
    module-level ``russian_stopwords`` list is skipped, and the remaining
    tokens are joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split(' '):
        if token in russian_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stop-words from the lemmatized descriptions of both dataframes
for frame in (train_df, test_df):
    frame['proc_name_lemma'] = frame['proc_name_lemma'].apply(remove_russian_stopwords)

In [None]:
# Last five training rows after stop-word removal
train_df.tail(5)

In [None]:
# First classifier based on CountVectorizer + Tf-Idf + Naive Bayes

# Shuffling the dataframe: a full-size sample, seeded with rs.
# The rows keep their original index labels, so after this step the index
# is no longer in positional order.
train_df = train_df.sample(n=len(train_df), random_state=rs)

In [None]:
# Features (lemmatized text) and labels for the cross-validated model
X, y = train_df['proc_name_lemma'], train_df['target']

In [None]:
# Stratified K-Fold with 5 splits (no shuffle argument is passed here)
skf = StratifiedKFold(n_splits=5)

In [None]:
# Cross-validated accuracy of CountVectorizer + Tf-Idf + Multinomial Naive Bayes
for cnt, (train_index, test_index) in enumerate(skf.split(X, y), start=1):

    # skf.split yields POSITIONAL indices, while X and y carry the shuffled
    # index labels of train_df. Plain X[train_index] would select by label and
    # pick rows other than the ones the stratified split chose, so .iloc is
    # required here.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Setting max_df parameter to avoid words common to all descriptions
    count_vect = CountVectorizer(max_df=0.2)
    X_train = count_vect.fit_transform(X_train)
    X_test = count_vect.transform(X_test)

    # Both transformers are fitted on the training fold only
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(X_train)
    X_test = tfidf_transformer.transform(X_test)

    NB_classifier = MultinomialNB().fit(X_train, y_train)
    predicted = NB_classifier.predict(X_test)
    print('(NB) Fold ' + str(cnt) + ', accuracy: ... ' + str(np.mean(predicted == y_test)))

In [None]:
# Another model: creating features with Gensim doc2vec,
# then passing them to a Logistic regression classifier

# Hold-out sample: 33% of the rows are set aside for validation
train, test = train_test_split(train_df, test_size=0.33, random_state=rs)

In [None]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer a doc2vec vector for every tagged document.

    Parameters
    ----------
    model : object
        Trained model exposing ``infer_vector(words, epochs=...)``.
    tagged_docs : object
        Container whose ``.values`` attribute iterates over documents that
        have ``words`` and ``tags`` attributes.
    epochs : int, optional
        Number of inference passes per document (default 20).

    Returns
    -------
    tuple
        ``(targets, regressors)``: the first tag of every document and the
        inferred vector of every document, in input order.
    """
    sents = tagged_docs.values
    # 'epochs' is the supported keyword; the older 'steps' alias is no longer
    # accepted by current gensim releases
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, epochs=epochs)) for doc in sents])
    return targets, regressors

# Preparing the data, performing doc2vec learning
def get_doc_2_vec_data(train_df, test_df, epochs=30):
    """Train a DBOW doc2vec model on ``train_df`` and vectorize both frames.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'proc_name_lemma' and 'target'. Every row becomes a
        TaggedDocument tagged with its 'target' value.
    test_df : pandas.DataFrame
        Must contain 'proc_name_lemma'. Rows are tagged with 'target' when
        that column exists, otherwise with 'index'.
    epochs : int, optional
        Number of training passes over the training corpus (default 30).

    Returns
    -------
    tuple
        ``(y_train, X_train, y_test, X_test)`` where the ``y_*`` items are
        tuples of document tags and the ``X_*`` items are tuples of inferred
        vectors, as produced by ``vec_for_learning``.
    """
    def _tag_rows(df, tag_col):
        # One TaggedDocument per row: whitespace-split lemmas + a single tag
        return df.apply(
            lambda x: TaggedDocument(words=x['proc_name_lemma'].split(), tags=[x[tag_col]]), axis=1)

    train_tagged = _tag_rows(train_df, 'target')

    # 'target' for a labelled sample, 'index' for the unlabelled test sample
    t_col = 'target' if 'target' in test_df.columns else 'index'
    test_tagged = _tag_rows(test_df, t_col)

    print(train_tagged.values[0])

    # rs is the notebook-level seed. NOTE(review): gensim only guarantees
    # reproducible training with a single worker thread - confirm if needed.
    model_dbow = Doc2Vec(dm=0, vector_size=200, negative=5, hs=0, min_count=2, window=15, seed=rs)

    train_corpus = list(train_tagged.values)
    model_dbow.build_vocab(train_corpus)

    # One train() call lets gensim decay the learning rate from alpha to
    # min_alpha over all epochs. The former manual loop subtracted 0.002 from
    # alpha after each of 30 one-epoch calls; starting from the default 0.025
    # that made the learning rate negative from the 14th epoch onwards.
    model_dbow.train(train_corpus, total_examples=model_dbow.corpus_count, epochs=epochs)

    y_train, X_train = vec_for_learning(model_dbow, train_tagged)
    y_test, X_test = vec_for_learning(model_dbow, test_tagged)

    return y_train, X_train, y_test, X_test

In [None]:
# Train doc2vec on the training part of the hold-out split and vectorize both parts
y_train, X_train, y_test, X_test = get_doc_2_vec_data(train, test)

In [None]:
# Multinomial logistic regression on the doc2vec vectors of the training part
multinomial_lr = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=rs,
)
multinomial_lr.fit(X_train, y_train)

# Class predictions for the hold-out part
predictions = multinomial_lr.predict(X_test)

In [None]:
# Share of hold-out rows whose predicted class equals the true 'target'
holdout_accuracy = np.mean(predictions == test['target'])
print('Accuracy (Logistic Regression) - doc2vec features:')
print(holdout_accuracy)

In [None]:
# Final model based on Logistic regression (its validation results are better)

# Undo the earlier shuffle: put the training rows back in 'index' order (in place)
train_df.sort_values(['index'], inplace=True)

In [None]:
# Train doc2vec on the full training set and vectorize the unlabelled test set;
# test_df has no 'target' column at this point, so y_test holds its 'index' values
y_train, X_train, y_test, X_test = get_doc_2_vec_data(train_df, test_df)

In [None]:
# Refit the multinomial logistic regression on the full training set
multinomial_lr = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=rs,
)
multinomial_lr.fit(X_train, y_train)

# Class predictions for the test set
predictions = multinomial_lr.predict(X_test)

In [None]:
# Write the predictions to 'result.csv': the predicted class is stored in
# 'target', the lemmatized text column is removed, and the row index is not written
test_df['target'] = predictions
test_df.drop(columns=['proc_name_lemma'], inplace=True)
test_df.to_csv(result_path, sep=',', index=False)