In [1]:
import pandas as pd
import numpy as np
import os
import sys
import pickle
import io
from tqdm import tqdm

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.externals import joblib

In [2]:
# (Removed a leftover `?SVC` help lookup: interactive IPython introspection, not part of the analysis.)

In [2]:
# Locations of the project checkout and of its static resources.
# The defaults are the original hard-coded paths; set SB900_PROJECT_DIR and/or
# STATIC_PATH in the environment to run the notebook on another machine.
project_dir = os.environ.get('SB900_PROJECT_DIR', '/Users/alexandergrigoriev/work/sandbox/sb900-ai/')
static_dir = os.environ.get('STATIC_PATH', '/Users/alexandergrigoriev/work/sandbox/sb900-static/')
STATIC_PATH = static_dir
# Exported before the project imports below -- presumably the `ai` package reads
# STATIC_PATH at import time (TODO confirm against the project code).
os.environ['STATIC_PATH'] = static_dir
# Guarded so that re-running this cell does not keep appending duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)
from ai.ai_model.base_text_normalizer import text_normalizer
from ai.ai_model.faq.faq_responder import FaqResponder
from ai.configs.faq_config import FaqConfig

from ai.configs.intent_recognizer_config import IntentRecognizerConfig
from ai.vectorizer.simple_word2vec import SimpleW2V

TextNormalizer init time = 5s


### Parameters

In [24]:
# Input spreadsheets with the labelled phrases.
train_fname = 'train_rnd_lift_100.xlsx'
test_fname = 'test_rnd_lift_100.xlsx'

# Artifacts written by this notebook: cached normalized dataset and fitted model.
prepr_dataset_pkl_fname = 'train_test_prepr.pkl'
classifier_pkl_fname = 'other_classifier.pkl'

# Seed handed to the classifier's ``random_state`` in the parameter grid.
RND_SEED = 42

In [25]:
# Memo of raw text -> normalized text, shared by every call to cached_normalizer.
norm_data = {}

def cached_normalizer(x):
    """Return the normalized form of ``x``, memoized in ``norm_data``.

    The first request for a given text calls ``text_normalizer(x)`` and stores
    its ``normalized_text``; later requests for the same text are served from
    the ``norm_data`` dict without calling the normalizer again.

    If normalization fails, the offending text is printed and the original
    exception is re-raised. Nothing is cached for that text.
    """
    if x not in norm_data:
        try:
            norm_data[x] = text_normalizer(x).normalized_text
        except Exception:
            # Report which input broke the normalizer, then propagate the real
            # error. ``except Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt/SystemExit pass through untouched, and ``raise``
            # avoids running the failing normalizer a second time.
            print('problem text:')
            print(x)
            raise
    return norm_data[x]

In [26]:
from sklearn.metrics import precision_recall_fscore_support, fbeta_score

def get_f_scores(y_true, y_pred, other_label=None):
    """Compute the composite F-scores used to evaluate the classifier.

    Labels are taken from ``y_true`` only. One label is treated as the "other"
    class and scored with F2; every remaining label is scored with F0.5 and the
    per-class values are averaged weighted by support.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    other_label : optional
        Label of the "other" class. Defaults to the largest label present in
        ``y_true``, which is what this function always assumed.

    Returns
    -------
    dict
        ``classes_score``: support-weighted mean F0.5 over the regular classes
        (0.0 when they have no support at all, instead of a 0/0 NaN);
        ``other_score``: F2 of the "other" class (0.0 when ``other_label`` does
        not occur in ``y_true``);
        ``result_score``: arithmetic mean of the two.
    """
    labels = np.unique(y_true)  # np.unique already returns the labels sorted
    if other_label is None:
        other_label = labels[-1]
    # Element-wise comparison in Python keeps this robust to mixed label types.
    other_mask = np.array([label == other_label for label in labels], dtype=bool)

    _, _, f05score, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, beta=0.5, labels=labels)
    f2_scores = fbeta_score(y_true, y_pred, average=None, beta=2, labels=labels)

    regular_support = support[~other_mask]
    total_support = regular_support.sum()
    if total_support > 0:
        classes_score = (f05score[~other_mask] * regular_support).sum() / total_support
    else:
        # No regular-class samples: avoid dividing zero by zero.
        classes_score = 0.0

    other_score = f2_scores[other_mask][0] if other_mask.any() else 0.0
    result_score = (classes_score + other_score) / 2

    return {"classes_score": classes_score,
            "other_score": other_score,
            "result_score": result_score}

def get_result_f_score(y_true, y_pred):
    """Return only the ``'result_score'`` entry of ``get_f_scores(y_true, y_pred)``."""
    scores = get_f_scores(y_true, y_pred)
    return scores['result_score']

### Reading data

In [27]:
# Load the labelled phrases: training workbook first, then the test workbook.
train, test = (pd.read_excel(fname) for fname in (train_fname, test_fname))

In [7]:
# Force the ``phrase`` column to ``str`` in both frames.
train = train.assign(phrase=lambda frame: frame['phrase'].astype(str))
test = test.assign(phrase=lambda frame: frame['phrase'].astype(str))

In [28]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,mq_id,master_question,phrase,source
5021,1019,Какой режим работы банка,"как работает офис сбербанка 9 мая? москва, ряд...",rnd
5970,1021,Как оставить обращение,хочу подать жалобу,nst
1615,1015,Как закрыть карту,"добрый день, как мне полностью закрыть дебетов...",rnd
1743,1011,Здравствуйте,доброе утро,rnd
5920,1009,Как получить справку о подтверждении операции,каким образом мне можно взять подтверждение фа...,gen


In [9]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,mq_id,master_question,phrase,source
2703,1012,Как увеличить кредитный лимит,здравствуйте!!!я хотела бы узнать как повысить...,nst
1714,1003,Действует ли программа автокредитования,галина сдраствуйте хочу взять авто кредит на а...,nst
1505,1011,Здравствуйте,"здравствуйте, ольга",rnd
24983,1023,Как отключить копилку,я вчера открыла пополнение копилки а как эту к...,nst
434,1011,Здравствуйте,добрый день,rnd


In [10]:
# (rows, columns) of the training frame.
train.shape

(1308, 4)

In [11]:
# (rows, columns) of the test frame.
test.shape

(5233, 4)

In [12]:
# Normalized phrases and labels are cached on disk so the normalizer only has to
# run once. NOTE: the cache is keyed by file name only -- delete
# prepr_dataset_pkl_fname after changing the input spreadsheets or the
# normalizer, otherwise stale data is silently reused.
if os.path.exists(prepr_dataset_pkl_fname):
    # joblib.load unpickles arbitrary objects; only load a cache file that this
    # notebook produced itself.
    X_train, y_train, X_test, y_test = joblib.load(prepr_dataset_pkl_fname)
    # Caches written by earlier versions of this cell stored y_test as a pandas
    # Series; coerce so both label vectors are plain arrays either way.
    y_test = np.asarray(y_test)
else:
    X_train = np.array([cached_normalizer(text) for text in tqdm(train['phrase'])])
    y_train = train['mq_id'].values
    X_test = np.array([cached_normalizer(text) for text in tqdm(test['phrase'])])
    # ``.values`` keeps y_test an ndarray, consistent with y_train.
    y_test = test['mq_id'].values
    joblib.dump([X_train, y_train, X_test, y_test], prepr_dataset_pkl_fname)

In [13]:
# Show one normalized training phrase as a sanity check.
X_train[0]

'надо подтверждать справка об операция как получать .'

In [14]:
# Stop-word list passed to the vectorizer via the ``vec__stop_words`` grid entry.
STOPWORDS_YANDEX = [
    'большой', 'бы', 'быть', 'в', 'весь', 'вот', 'все', 'всей', 'вы',
    'говорить', 'год', 'да', 'для', 'до', 'еще', 'же', 'знать', 'и', 'из',
    'к', 'как', 'который', 'мочь', 'мы', 'на', 'наш', 'не', 'него', 'нее',
    'нет', 'них', 'но', 'о', 'один', 'она', 'они', 'оно', 'оный', 'от',
    'ото', 'по', 'с', 'свой', 'себя', 'сказать', 'та', 'такой', 'только',
    'тот', 'ты', 'у', 'что', 'это', 'этот', 'я',
]


In [15]:
RND = 123  # NOTE(review): not referenced in the visible cells; the grid below seeds with RND_SEED

# SGD: token counts -> TF-IDF weighting -> linear classifier trained with SGD
pipe = Pipeline(steps=[
    ('vec', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# A single grid; only ``clf__alpha`` has more than one candidate value.
tuned_parameters = [{
    'vec__stop_words': [STOPWORDS_YANDEX],
    'tfidf__use_idf': [True],
    'clf__random_state': [RND_SEED],
    'clf__penalty': ['l2'],  #, 'elasticnet', 'l1'],
    'clf__alpha': [0.000001, 0.00001, 0.0001, 0.001],
    'clf__max_iter': [1000],
    'clf__loss': ['modified_huber'],
}]

# # SVC
# pipe =  Pipeline([

#                     ('vec', CountVectorizer()),
#                     ('tfidf', TfidfTransformer()),
#                     ('standardscaler', StandardScaler()),
#                     ('clf', SVC())
#                 ])
# tuned_parameters = [ {
#                     "vec__stop_words":[STOPWORDS_YANDEX],                
#                     "tfidf__use_idf": [True],
#                     'standardscaler__with_mean': [False],
#                 'clf__random_state': [RND_SEED],
#                 'clf__kernel': ['linear', 'rbf', 'sigmoid'],
#                 'clf__C': [0.01, 0.1, 1, 10, 100],
#                 'clf__max_iter': [10000] 
# #                 'clf__probability': [True]
#             }]

In [16]:
# Scorer wrapping the notebook's composite metric (get_result_f_score).
result_f_scorer = make_scorer(get_result_f_score)
# Micro-averaged F2 scorer.
ftwo_scorer = make_scorer(fbeta_score, average='micro', beta=2)

In [17]:
%%time
gs = GridSearchCV(pipe, tuned_parameters, cv = 5, scoring = result_f_scorer, n_jobs = -1)
gs.fit(X_train, y_train)  
print (gs.best_params_)
print (gs.best_score_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'clf__alpha': 0.0001, 'clf__loss': 'modified_huber', 'clf__max_iter': 1000, 'clf__penalty': 'l2', 'clf__random_state': 42, 'tfidf__use_idf': True, 'vec__stop_words': ['большой', 'бы', 'быть', 'в', 'весь', 'вот', 'все', 'всей', 'вы', 'говорить', 'год', 'да', 'для', 'до', 'еще', 'же', 'знать', 'и', 'из', 'к', 'как', 'который', 'мочь', 'мы', 'на', 'наш', 'не', 'него', 'нее', 'нет', 'них', 'но', 'о', 'один', 'она', 'они', 'оно', 'оный', 'от', 'ото', 'по', 'с', 'свой', 'себя', 'сказать', 'та', 'такой', 'только', 'тот', 'ты', 'у', 'что', 'это', 'этот', 'я']}
0.8373425899809402
CPU times: user 22.1 s, sys: 163 ms, total: 22.3 s
Wall time: 6min


In [18]:
# Spot-check the tuned pipeline on a few hand-written, already-normalized phrases.
gs.best_estimator_.predict([
    'хотеть кредит',
    'NUM_TOKEN NUM_TOKEN NUM_TOKEN NUM_TOKEN',
    'привет',
    'оператор живой',
    'привет помоги подобрать кредит',
])

array([9999, 9999, 9999, 1034, 9999])

In [19]:
# Persist the best pipeline found by the grid search to classifier_pkl_fname.
joblib.dump(gs.best_estimator_, classifier_pkl_fname) 

['other_classifier.pkl']

In [20]:
# gs.best_estimator_.predict_proba(['привет помоги подобрать кредит'])

## Validation

### Test score

In [21]:
# Predict labels for the held-out test phrases with the tuned pipeline.
y_pred_test = gs.best_estimator_.predict(list(X_test))

In [22]:
# Per-class precision / recall / F1 on the test set.
# (raw alternative: print(precision_recall_fscore_support(y_test, y_pred_test)))
report = classification_report(y_test, y_pred_test)
print(report)

             precision    recall  f1-score   support

       1001       0.79      0.78      0.79        82
       1002       0.75      0.63      0.69        73
       1003       0.75      0.65      0.70        80
       1004       0.92      0.77      0.84        78
       1006       0.90      0.32      0.47        84
       1007       0.88      0.91      0.90        81
       1008       0.71      0.31      0.43        80
       1009       0.71      0.30      0.42        79
       1010       0.74      0.93      0.83        74
       1011       0.97      0.98      0.98      1561
       1012       0.83      0.48      0.61        81
       1013       0.69      0.11      0.19        84
       1014       0.81      0.27      0.41        81
       1015       0.71      0.30      0.42        80
       1016       0.53      0.32      0.40        75
       1017       0.58      0.63      0.60        75
       1018       0.90      0.51      0.65        92
       1019       0.83      0.74      0.79   

In [23]:
# Composite scores on the test set; the bare name on the last line displays the dict.
f_scores = get_f_scores(y_test, y_pred_test)
f_scores

{'classes_score': 0.7325784488074432,
 'other_score': 0.9744006896935312,
 'result_score': 0.8534895692504871}