# Users' Requests Classifier
---

In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import numpy as np
import pandas as pd
import re
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

import translators as ts

from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

import requests

In [2]:
PATH_MODEL = r".\model.h5"

## Load data

In [3]:
df = pd.read_csv(r".\train_dataset.csv", delimiter=';')

df = shuffle(df)

df['Ответ'] = df['Ответ'].where(df['Ответ'] != '1.0', other='n/d')
df['Ответ'] = df['Ответ'].where(df['Ответ'] != '0.0', other='n/d')
df['Ответ'] = df['Ответ'].where(df['Ответ'] != 'None', other='n/d')
df['Ответ'] = df['Ответ'].where(pd.notna(df['Ответ']), other='n/d')
df = df.where(df['Ответ'] != 'n/d').dropna(how='all')
# df = df.where(df['Unnamed: 0'] == 'original').dropna(how='all')
df.index = range(len(df['Ответ']))

stemmer = PorterStemmer()

WORD_PATTERN = re.compile(r'\w+', re.IGNORECASE)

def stem(ser):
    for i in range(len(ser)):
        stc = re.findall(WORD_PATTERN, ser[i])
        stc = [stemmer.stem(w) for w in stc]
        ser[i] = ' '.join(stc)
    return ser

df['Обращение'] = stem(df['Обращение'])
df['Вопрос'] = stem(df['Вопрос'])

## Natural Language Processing
#### Term frequency-inverse document frequency
- Частота слова (Term Frequency) — подсчитывает, как часто выбранное слово появляется в документе.
- Обратная частота документа (Inverse Document Frequency) — снижает вес слов, которые часто встречаются в документах.

В дальнейшем результаты могут быть улучшены использованием Word2vec - отображения слов в соответствующие n-мерные векторы.

In [4]:
tfidf = TfidfVectorizer()
_ = tfidf.fit(df['Обращение'])

#### To categorical features

In [5]:
group_encoder = LabelEncoder()
_ = group_encoder.fit(df['Тип обращения'])

label_encoder = LabelEncoder()
_ = label_encoder.fit(df['Ответ'])
train_labels = label_encoder.transform(df['Ответ'])

#### Convert data to more efficient form
- English translation
- Vectorizing
- Stemming

In [6]:
def prepare_data(raw: pd.DataFrame, translate=False) -> pd.DataFrame:
    data_raw = raw[['Обращение', 'Тип обращения']].copy(True)
    data_raw['Тип обращения'] = group_encoder.transform(data_raw['Тип обращения'])
    
    if translate:
        for i in range(len(data_raw['Обращение'])):
            data_raw['Обращение'][i] = ts.google(data_raw['Обращение'][i], if_use_cn_host=True)
            if len(data_raw['Обращение']) > 25:
                print('{}/{}'.format(i + 1, len(data_raw['Обращение'])), end='\r')
        if len(data_raw['Обращение']) > 25:
            print()
        tr = data_raw.copy(True)
    
    data_v = pd.DataFrame(tfidf.transform(data_raw['Обращение']).toarray())
    data = data_v.join(data_raw['Тип обращения'])
    
    if translate:
        return data, tr
    else:
        return data

df_prep = prepare_data(df[['Обращение', 'Тип обращения']])

## Build and fit Neural Network model
- 3x Dense layers
- Crossentropy loss function: sparse_categorical_crossentropy
- Optimizer: adam
- Stop training when a loss metric has stopped improving
- Load pretrained model if exists

In [7]:
if not os.path.exists(PATH_MODEL):
    clf = Sequential()
    clf.add(Dense(1024*2, input_dim=df_prep.shape[1], activation='relu'))
    clf.add(Dense(512*2, activation='relu'))
    clf.add(Dense(len(label_encoder.classes_), activation='softmax'))

    callback = EarlyStopping(monitor='loss', patience=2)

    clf.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    _ = clf.fit(df_prep, train_labels, epochs=30, callbacks=[callback], batch_size=2)
    clf.save(PATH_MODEL)
else:
    clf = load_model(PATH_MODEL)

## Predict on data
Features: 'Обращение', 'Тип обращения'

In [8]:
def classificate(data):
    test_data = pd.DataFrame(data, columns=['Обращение', 'Тип обращения'])
    test_data['Обращение'] = stem(test_data['Обращение'])

    test_data_prep, test_data_tr = prepare_data(test_data, translate=True)
    preds_proba = clf.predict(test_data_prep)

    preds = preds_proba.argmax(axis=1)
    preds_lab = label_encoder.inverse_transform(preds)
    
    return preds_lab, preds_proba

## Test

In [9]:
test_data = pd.read_csv(r'.\test_dataset.csv')
test_data = shuffle(test_data)

drop_to = list()
for i in range(len(test_data['Ответ'])):
    if str(test_data['Ответ'][i]) not in [x for x in df['Ответ']]:
        drop_to.append(i)

test_data = test_data.drop(index=drop_to)
test_data.index = range(len(test_data['Ответ']))

test_labels = label_encoder.transform(test_data['Ответ'])
test_data = test_data.drop(columns=['Ответ'])

test_data_prep, test_data_tr = prepare_data(test_data, translate=True)

preds_proba = clf.predict(test_data_prep)

print('{:20} : {:1.4f}'.format('accuracy_score', metrics.accuracy_score(test_labels, preds_proba.argmax(axis=1))))

50/50
accuracy_score       : 0.4200


## Monitor users' requests database
Send back predicted answers and its probabilities

In [10]:
while True:
    request = requests.get('http://77.222.54.233/requests.php')
    request = request.json() if len(request.content) else list()
    if len(request):
        print(request)
    
    req_data = list()
    for req in request:
        msg = req['msg']
        theme = req['theme'] if (req['theme'] is not None and len(req['theme'])) else 'Интернет'
        req_data.append([msg, theme])
    
    if len(req_data):
        labs, probas = classificate(req_data)

        reply = {"answer": []}
        for i in range(len(request)):
            reply["answer"].append(
            {
                'id': request[i]['id'], 
                'result': labs[i], 
                'prob': float(probas[i].max())
            })
            
        print(reply["answer"])
        reply["answer"] = json.dumps(reply["answer"])

        resp = requests.post('http://77.222.54.233/result.php', data=reply)
    
    time.sleep(0.2)

KeyboardInterrupt: 