In [471]:
import warnings
warnings.filterwarnings('ignore')

In [966]:
import regex as re
import jsonlines
from pathlib import Path
from collections import defaultdict, Counter

from tqdm import tqdm

In [3]:
import tokenize_uk
import pymorphy2
import langdetect

morph = pymorphy2.MorphAnalyzer(lang='uk')
stop_words = set(x.strip() for x in open('ukr_stopwords.txt').readlines())

In [834]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD, NMF

In [5]:
DATA_DIR = Path.cwd() / 'rozetka/data'

In [77]:
def jl_file(data_file):
    with jsonlines.open(data_file, 'r') as f:
        for x in f:
            yield x

def write_data_lang(data_file):
    out_file = f'{data_file.parent}/{data_file.stem}-lang{data_file.suffix}'
    
    with jsonlines.open(out_file, 'w') as f:
        for comment in tqdm(jl_file(data_file)):
            try:
                lang = langdetect.detect(comment['text'])
            except langdetect.detector.LangDetectException:
                lang = ''
            else:
                comment['lang'] = lang
                f.write(comment)

In [78]:
for f in DATA_DIR.iterdir():
    if not str(f).endswith('-lang.jl'):
        print(f)
        write_data_lang(f)
        print('\n')

6it [00:00, 56.93it/s]

/home/igor/projects/prj-nlp-2019/students/igor_kurinnyi/hw5/rozetka/data/kupanie-i-gigiena.jl


4233it [00:24, 176.07it/s]
10it [00:00, 97.54it/s]



/home/igor/projects/prj-nlp-2019/students/igor_kurinnyi/hw5/rozetka/data/tehnika-dlya-kuhni.jl


28088it [02:55, 159.61it/s]








In [15]:
def load_data(data_file):
    data = defaultdict(list)
    for comment in jl_file(data_file):
        if comment['rating'] and (comment['lang'] == 'uk'):
            text = comment['text']
            adv = get_advantages(comment['text'])
            dis = get_disadvantages(comment['text'])
            data['text'].append(text)
            data['advantages'].append(adv)
            data['disadvantages'].append(dis)
            
            data['rating'].append(comment['rating'])
            data['like'].append(comment['likes'])
            data['dislike'].append(comment['dislikes'])
            data['author'].append(comment['author'])
    return pd.DataFrame(data)


def get_advantages(text):
    for par in text.split('\n'):
        if par.startswith('Достоинства'):
            par = re.sub(r'Достоинства:?\s?', '', par)
            return par
    return ''

        
def get_disadvantages(text):
    for par in text.split('\n'):
        if par.startswith('Недостатки'):
            par = re.sub(r'Недостатки:?\s?', '', par)
            return par
    return ''

In [1046]:
data_file = DATA_DIR / 'tehnika-dlya-kuhni-lang.jl'
data = load_data(data_file)

target = ['rating']
feature_names = ['text', 'advantages', 'disadvantages', 'author', 'like', 'dislike']
data_train, data_test, y_train, y_test = train_test_split(data[feature_names], data[target], test_size=0.3, random_state=42, stratify=data[target])

data_train.head()

Unnamed: 0,text,advantages,disadvantages,author,like,dislike
1206,"В принципі нічого, каву варить, але надто поту...",Поки що працює,Традиційно для Розетки незаповнена гарантійка....,Віталій Бочковський,2,0
2592,Дуже корисна річ! Всім рекомендую.\nНедостатки...,,Не має.,Люда Ященко,0,0
1143,Дуже класний помічник!! зі всіма завданнями сп...,"Легкий, дешевий",не виявила,Ольга Кулик,0,0
35,Відмінна якість. Тости прожарюються рівномірно...,"Автоматичний підйом тостів, автоцентрування, п...",Поки-що не знайшли.,Виктор,0,0
1613,"Працює) дуже гарна підсвітка! Якість, на свої ...","Працює) дуже гарна підсвітка! Якість, на свої ...",,оксана,0,0


## Текстові фічі

In [1028]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key, to_values=False):
        self.key = key
        self.to_values = to_values
        
    def fit(self, x, y=None):
        return self
        
    def transform(self, df):
        if self.to_values:
            return df[self.key].values[:, None]
        else:
            return df[self.key]


class TextProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, lemma=True, no_stop_words=True, clean=True):
        self.lemma = lemma
        self.no_stop_words = no_stop_words
        self.clean = clean
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        if self.lemma:
            x = x.apply(self.lemmatize)
        if self.no_stop_words:
            x = x.apply(self.remove_stop_words)
        if self.clean:
            x = x.apply(self.clean_text)
        return x
    
    def lemmatize(self, text):
        tokens = [t for t in tokenize_uk.tokenize_words(text)]
        lemmas = [morph.parse(t)[0].normal_form for t in tokens]
        return ' '.join(lemmas)
    
    def remove_stop_words(self, text):
        return ' '.join([t for t in tokenize_uk.tokenize_words(text) if t not in stop_words])
    
    def clean_text(self, text):
        sub_patterns = [
            (r'\+\++', 'manyplus'),
            (r'\!+', ' manyexclamation '),
            (r'\?+', ' manyquest '),
            (r'(\?\!)+', ' exclquest '),
            (r'\.\.+', ' manydot '),
            (r'[;:|]?-?\)+', ' happysmile '),
            (r'[;:|]?\*?-?\)+', ' sadsmile '),
            (r"(\w+)'(\w)", r'\g<1>\g<2>'),
            (r'\W+', ' '),
        ]
        for p, s in sub_patterns:
            text = re.sub(p, s, text)
        return text.strip()

In [1212]:
# ================== TEXT FEATURES ======================

def text_feature_pipe(field):
    return Pipeline([
        ('selector', FeatureSelector(field)),
        ('processor', TextProcessor()),
        ('tfidf', TfidfVectorizer()),])

body = text_feature_pipe('text')
adv = text_feature_pipe('advantages')
dis = text_feature_pipe('disadvantages')

text = FeatureUnion([
    ('body', body),
    ('adv', adv),
    ('dis', dis),
])

mnb_pipe = Pipeline([
    ('features', text),
    ('mnb', MultinomialNB())
])

# параметри вже підібрав
text_params = {
    'features__body__processor__lemma': [False],
    'features__body__processor__no_stop_words': [True],
    'features__body__processor__clean': [True],
    'features__body__tfidf__ngram_range': [(1, 3)],
    'features__body__tfidf__max_df': [1.0],
    
    'features__adv__processor__lemma': [True],
    'features__adv__processor__no_stop_words': [True],
    'features__adv__processor__clean': [True],
    'features__adv__tfidf__ngram_range': [(1, 1)],
    'features__adv__tfidf__max_df': [0.7],
    
    'features__dis__processor__lemma': [True],
    'features__dis__processor__no_stop_words': [True],
    'features__dis__processor__clean': [True],
    'features__dis__tfidf__ngram_range': [(1, 1)],
    'features__dis__tfidf__max_df': [0.7],
}

mnb_params = {'mnb__alpha': [0]}
mnb_params.update(text_params)

In [1114]:
%%time
clf = GridSearchCV(mnb_pipe, mnb_params, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=2)
clf.fit(data_train, y_train)

print(f'Best Score: {clf.best_score_:.3f}')
print('Mean Score', clf.cv_results_['mean_test_score'])
print('STD ', clf.cv_results_['std_test_score'])
print('\n')

y_predict = clf.predict(data_test)
report = classification_report(y_test, y_predict)
print(report)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.0s finished


Best Score: 0.554
Mean Score [0.55365155]
STD  [0.00809414]


              precision    recall  f1-score   support

           1       0.62      0.13      0.21        39
           2       0.00      0.00      0.00        31
           3       0.25      0.04      0.07        46
           4       0.29      0.16      0.20       164
           5       0.68      0.91      0.78       522

   micro avg       0.63      0.63      0.63       802
   macro avg       0.37      0.25      0.25       802
weighted avg       0.55      0.63      0.57       802

CPU times: user 4.66 s, sys: 15.8 ms, total: 4.67 s
Wall time: 11.6 s


## Числові признаки

In [1029]:
class TextStats(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        result = list()
        for row in X[['text', 'advantages', 'disadvantages']].itertuples():
            len_text = len(re.split(r'\W+', row.text))
            len_adv = len(re.split(r'\W+', row.advantages))
            len_dis = len(re.split(r'\W+', row.disadvantages))
            stats = {
                'len_text': len_text,
                'len_adv': len_adv,
                'len_dis': len_dis,
                'adv_text': len_adv / len_text,
                'dis_text': len_dis / len_text,
                'adv_dis': len_adv / (len_dis + 0.001),
                'dis_adv': len_dis / (len_adv + 0.01),
                'is_latin': bool(re.search(r'\b[A-Za-z]+\b', row.text)),
                'is_hastag': bool(re.search(r'#\w+', row.text)),  # #моєрозпакування
                'is_caps_text': bool(re.search(r'\b[А-Я][А-Я]+\b', row.text)),
                'is_caps_text_latin': bool(re.search(r'\b[A-Z][A-Z]+\b', row.text)),
                'is_caps_adv': bool(re.search(r'\b[А-Я][А-Я]+\b', row.advantages)),
                'is_caps_dis': bool(re.search(r'\b[А-Я][А-Я]+\b', row.disadvantages)),
                'quest': (len_adv + len_dis == 0) and (row.text.endswith('?'))  # питання (а не відгуки) в яких часто стоїть 5 зірок
            }
            result.append(stats)
        return result

In [1030]:
class AuthorType(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return list(X['author'].apply(self.process_author))
    
    def process_author(self, author):
        return {
            'is_guest': author == 'Гость',
            'is_cyrillic': bool(re.match(r'(\p{IsCyrillic}+\s?)+', author)),
            'is_latin': not bool(re.match(r'(\p{IsCyrillic}+\s?)+', author)),
            'n_words': len(re.split(r'\W+', author)),
        }

In [1031]:
def load_tones():
    tones = dict()
    with open('tone-dict-uk.tsv', 'r') as f:
        for line in f:
            word, tone = line.split('\t')
            tones[word] = int(tone.strip())
    return tones

TONES = load_tones()
TONES['manyexclamation'] = 2
TONES['manyplus'] = 2
TONES['manyquest'] = -2
TONES['exclquest'] = -1
TONES['manydot'] = -1
TONES['happysmile'] = 2
TONES['sadsmile'] = -2
TONES['ок'] = 1
TONES['окей'] = 1


class ToneFeature(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        return list(x.apply(lambda text: self.tone_stats(text)))
    
    def tone_stats(self, text):
        scores = self.sent_to_tones(text)
        pos_scores = [s for s in scores if s > 0]
        neg_scores = [s for s in scores if s < 0]
        return {
            'min_tone': min(scores),
            'max_tone': max(scores),
            'max_to_min': abs(max(scores) / (min(scores) + 0.001)),
            'tone_per_word': sum(scores) / len(scores),
            'mean_tone': sum(scores) / (len(pos_scores) + len(neg_scores) + 0.001),
            'neg_tone_per_word': abs(sum(neg_scores)) / len(scores),
            'pos_tone_per_word': sum(pos_scores) / len(scores),
            'mean_neg': abs(sum(neg_scores)) / (len(neg_scores) + 0.001),
            'mean_pos': sum(pos_scores) / (len(pos_scores) + 0.001),
            'pos_to_neg': sum(pos_scores) / (abs(sum(neg_scores)) + 0.001),
        }
    
    def sent_to_tones(self, text):
        tones = list()
        tokens = tokenize_uk.tokenize_words(text)
        for i, t in enumerate(tokens):
            tone = TONES.get(t.lower(), 0)
            if i > 0:
                if tokens[i - 1] == 'не':
                    tone *= -1
            if i > 1:
                prev = ' '.join(tokens[i - 2: i])
                if re.match('не (дуже|зовсім|такий)', prev):
                    tone *= -1
            tones.append(tone)
        return tones

In [1213]:
# ================== NUMERICAL FEATURES =============================

text_stats = Pipeline([
    ('stats', TextStats()),
    ('vect', DictVectorizer(sparse=False)),
    ('scale', StandardScaler()),
])

tone_features = Pipeline([
    ('selector', FeatureSelector('text')),
    ('processor', TextProcessor(lemma=True)),
    ('tone', ToneFeature()),
    ('vect', DictVectorizer(sparse=False)),
    ('scale', StandardScaler())
])

author_features = Pipeline([
    ('stats', AuthorType()),
    ('vect', DictVectorizer(sparse=False)),
    ('scale', StandardScaler())
])

numerical = FeatureUnion([
    ('text_stats', text_stats),
    ('tone', tone_features),
    ('author', author_features),
    ('like', FeatureSelector('like', to_values=True)),
    ('dislike', FeatureSelector('dislike', to_values=True))
])

logit_pipe = Pipeline([
    ('numerical', numerical),
    ('logit', LogisticRegression())
])

logit_params = {
    'logit__C': [1000],
}

In [1124]:
%%time
clf = GridSearchCV(logit_pipe, logit_params, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=2)
clf.fit(data_train, y_train)

print(f'Best Score: {clf.best_score_:.3f}')
print('Mean Score', clf.cv_results_['mean_test_score'])
print('STD ', clf.cv_results_['std_test_score'])
print('\n')

y_predict = clf.predict(data_test)
report = classification_report(y_test, y_predict)
print(report)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   31.4s remaining:   31.4s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   47.7s finished


Best Score: 0.556
Mean Score [0.55648293 0.55648293]
STD  [0.0048353 0.0048353]


              precision    recall  f1-score   support

           1       0.53      0.23      0.32        39
           2       0.00      0.00      0.00        31
           3       1.00      0.02      0.04        46
           4       0.42      0.10      0.16       164
           5       0.69      0.98      0.81       522

   micro avg       0.67      0.67      0.67       802
   macro avg       0.53      0.27      0.27       802
weighted avg       0.62      0.67      0.58       802

CPU times: user 13.9 s, sys: 324 ms, total: 14.3 s
Wall time: 1min 1s


## Комбінація текстових та числових признаків

In [1214]:
features = FeatureUnion([
    ('text', text),
    ('numerical', numerical),
])

logit_pipe = Pipeline([
    ('features', features),
    ('logit', LogisticRegression())
])

mnb_pipe = Pipeline([
    ('features', features),
    ('logit', MultinomialNB())
])

text_params = {
    'features__text__body__processor__lemma': [False],
    'features__text__body__processor__no_stop_words': [True],
    'features__text__body__processor__clean': [True],
    'features__text__body__tfidf__ngram_range': [(1, 3)],
    'features__text__body__tfidf__max_df': [1.0],
    
    'features__text__adv__processor__lemma': [True],
    'features__text__adv__processor__no_stop_words': [True],
    'features__text__adv__processor__clean': [True],
    'features__text__adv__tfidf__ngram_range': [(1, 1)],
    'features__text__adv__tfidf__max_df': [0.7],
    
    'features__text__dis__processor__lemma': [True],
    'features__text__dis__processor__no_stop_words': [True],
    'features__text__dis__processor__clean': [True],
    'features__text__dis__tfidf__ngram_range': [(1, 1)],
    'features__text__dis__tfidf__max_df': [0.7],
}

logit_params = {
    'logit__C': [1000]
}
logit_params.update(text_params)

mnb_params = dict()
mnb_params.update(text_params)

In [1215]:
clf = GridSearchCV(logit_pipe, logit_params, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=2)

clf.fit(data_train, y_train)

print(f'Best Score: {clf.best_score_:.3f}')
print('Mean Score', clf.cv_results_['mean_test_score'])
print('STD ', clf.cv_results_['std_test_score'])
print('\n')

y_predict = clf.predict(data_test)
report = classification_report(y_test, y_predict)
print(report)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   38.5s finished


Best Score: 0.622
Mean Score [0.62214829]
STD  [0.00844017]


              precision    recall  f1-score   support

           1       0.42      0.26      0.32        39
           2       0.20      0.06      0.10        31
           3       0.31      0.20      0.24        46
           4       0.31      0.21      0.25       164
           5       0.75      0.89      0.81       522

   micro avg       0.65      0.65      0.65       802
   macro avg       0.40      0.32      0.34       802
weighted avg       0.59      0.65      0.61       802



# Тест на категорії купання та гігієна

In [1226]:
kupanie = load_data(DATA_DIR / 'kupanie-i-gigiena-lang.jl')

kup_predict = clf.predict(kupanie[feature_names])
report = classification_report(kupanie[target], kup_predict)
print(report)

              precision    recall  f1-score   support

           1       1.00      0.40      0.57        10
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00        12
           4       0.22      0.09      0.12        47
           5       0.89      0.98      0.94       494

   micro avg       0.87      0.87      0.87       565
   macro avg       0.42      0.29      0.33       565
weighted avg       0.82      0.87      0.84       565



# Стекінг

In [1151]:
class MetaFeature(BaseEstimator, TransformerMixin):
    def __init__(self, estimator, cv):
        self.estimator = estimator
        self.cv = cv
        
    def fit(self, X, y):
        self.y_pred = cross_val_predict(self.estimator, X, y, cv=self.cv, method='predict_proba', verbose=0, n_jobs=-1)
        self.estimator.fit(X, y)
    
    def transform(self, X):
        return self.estimator.predict_proba(X)
    
    def fit_transform(self, X, y):
        if getattr(self, 'y_pred', None) == None:
            self.fit(X, y)
        return self.y_pred

In [1064]:
class EmoText(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.apply(self.filter_tones)
        return X
    
    def filter_tones(self, text):
        return ' '.join([t for t in tokenize_uk.tokenize_words(text) if t.lower() in TONES])

In [1198]:
# ================== TEXT FEATURES ======================

def text_feature_pipe(field):
    return Pipeline([
        ('selector', FeatureSelector(field)),
        ('processor', TextProcessor()),
        ('tfidf', TfidfVectorizer()),])

body = text_feature_pipe('text')
adv = text_feature_pipe('advantages')
dis = text_feature_pipe('disadvantages')

emo = Pipeline([
    ('selector', FeatureSelector('text')),
    ('processor', TextProcessor()),
    ('emo', EmoText()),
    ('tfidf', TfidfVectorizer()),
])


text = FeatureUnion([
    ('body', emo),
    ('adv', adv),
    ('dis', dis),
#     ('emo', emo),
])

# ================== NUMERICAL FEATURES =============================

text_stats = Pipeline([
    ('stats', TextStats()),
    ('vect', DictVectorizer(sparse=False)),
    ('scale', StandardScaler()),
])

tone_features = Pipeline([
    ('selector', FeatureSelector('text')),
    ('processor', TextProcessor(lemma=True)),
    ('tone', ToneFeature()),
    ('vect', DictVectorizer(sparse=False)),
    ('scale', StandardScaler())
])

author_features = Pipeline([
    ('stats', AuthorType()),
    ('vect', DictVectorizer(sparse=False)),
    ('scale', StandardScaler())
])

numerical = FeatureUnion([
    ('text_stats', text_stats),
    ('tone', tone_features),
    ('author', author_features),
    ('like', FeatureSelector('like', to_values=True)),
    ('dislike', FeatureSelector('dislike', to_values=True))
])

# ================= META-TEXT FEATURES ===============================


bayes_text = Pipeline([
    ('text', text),
    ('rating', MetaFeature(MultinomialNB(), cv=5))
])

svc_text = Pipeline([
    ('text', text),
    ('rating', MetaFeature(SVC(C=10, probability=True), cv=5))
])

forest_text = Pipeline([
    ('text', text),
    ('rating', MetaFeature(RandomForestClassifier(n_estimators=100, class_weight='balanced'), cv=5))
])

# ============== META-NUMERICAL FEATURES =======================

logit_numerical = Pipeline([
    ('numerical', numerical),
    ('rating', MetaFeature(LogisticRegression(), cv=5))
])

forest_numerical = Pipeline([
    ('numerical', numerical),
    ('rating', MetaFeature(RandomForestClassifier(n_estimators=100, max_depth=2, class_weight='balanced'), cv=5))
])


# ================ META-ESTIMATOR ============================

features = FeatureUnion([
    ('bayes', bayes_text),
    ('svc', svc_text),
    ('forest', forest_text),
    ('logit_numerical', logit_numerical),
    ('forest_numerical', forest_numerical),
])

pipe = Pipeline([
    ('features', features),
    ('logit', LogisticRegression()),
])

In [1199]:
hyper_params = {
    'features__svc__text__body__processor__lemma': [False],
    'features__svc__text__body__processor__no_stop_words': [True],
    'features__svc__text__body__processor__clean': [True],
    'features__svc__text__body__tfidf__ngram_range': [(1, 3)],
    'features__svc__text__body__tfidf__max_df': [1.0],
    
    'features__svc__text__adv__processor__lemma': [True],
    'features__svc__text__adv__processor__no_stop_words': [True],
    'features__svc__text__adv__processor__clean': [True],
    'features__svc__text__adv__tfidf__ngram_range': [(1, 1)],
    'features__svc__text__adv__tfidf__max_df': [0.7],
    
    'features__svc__text__dis__processor__lemma': [True],
    'features__svc__text__dis__processor__no_stop_words': [True],
    'features__svc__text__dis__processor__clean': [True],
    'features__svc__text__dis__tfidf__ngram_range': [(1, 1)],
    'features__svc__text__dis__tfidf__max_df': [0.7],
    
    
    'features__forest__text__body__processor__lemma': [False],
    'features__forest__text__body__processor__no_stop_words': [True],
    'features__forest__text__body__processor__clean': [True],
    'features__forest__text__body__tfidf__ngram_range': [(1, 3)],
    'features__forest__text__body__tfidf__max_df': [1.0],
    
    'features__forest__text__adv__processor__lemma': [True],
    'features__forest__text__adv__processor__no_stop_words': [True],
    'features__forest__text__adv__processor__clean': [True],
    'features__forest__text__adv__tfidf__ngram_range': [(1, 1)],
    'features__forest__text__adv__tfidf__max_df': [0.7],
    
    'features__forest__text__dis__processor__lemma': [True],
    'features__forest__text__dis__processor__no_stop_words': [True],
    'features__forest__text__dis__processor__clean': [True],
    'features__forest__text__dis__tfidf__ngram_range': [(1, 1)],
    'features__forest__text__dis__tfidf__max_df': [0.7],
    
    
    'features__bayes__text__body__processor__lemma': [False],
    'features__bayes__text__body__processor__no_stop_words': [True],
    'features__bayes__text__body__processor__clean': [True],
    'features__bayes__text__body__tfidf__ngram_range': [(1, 3)],
    'features__bayes__text__body__tfidf__max_df': [1.0],
    
    'features__bayes__text__adv__processor__lemma': [True],
    'features__bayes__text__adv__processor__no_stop_words': [True],
    'features__bayes__text__adv__processor__clean': [True],
    'features__bayes__text__adv__tfidf__ngram_range': [(1, 1)],
    'features__bayes__text__adv__tfidf__max_df': [0.7],
    
    'features__bayes__text__dis__processor__lemma': [True],
    'features__bayes__text__dis__processor__no_stop_words': [True],
    'features__bayes__text__dis__processor__clean': [True],
    'features__bayes__text__dis__tfidf__ngram_range': [(1, 1)],
    'features__bayes__text__dis__tfidf__max_df': [0.7],
    
#     'mnb__alpha': [0],
#     'logit__penalty': ['l2'],
#     'logit__solver': ['saga'],
#     'logit__class_weight': ['balanced'],
#     'logit__multi_class': ['ovr']
}

clf = GridSearchCV(pipe, hyper_params, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=2)

In [1167]:
# for k in clf.get_params().keys():
#     print(k)

In [1200]:
%%time
clf.fit(data_train, y_train)

print(f'Best Score: {clf.best_score_:.3f}')
print('Mean Score', clf.cv_results_['mean_test_score'])
print('STD ', clf.cv_results_['std_test_score'])
print('\n')

y_predict = clf.predict(data_test)
report = classification_report(y_test, y_predict)
print(report)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.5min finished


Best Score: 0.610
Mean Score [0.61014264]
STD  [0.01002696]


              precision    recall  f1-score   support

           1       0.41      0.18      0.25        39
           2       0.00      0.00      0.00        31
           3       0.00      0.00      0.00        46
           4       0.34      0.22      0.27       164
           5       0.74      0.96      0.83       522

   micro avg       0.68      0.68      0.68       802
   macro avg       0.30      0.27      0.27       802
weighted avg       0.57      0.68      0.61       802

CPU times: user 45 s, sys: 140 ms, total: 45.1 s
Wall time: 2min 25s


In [1205]:
pred = cross_val_predict(clf, data_train, y_train, cv=3)
report = classification_report(y_train, pred)
print(report)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   47.6s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   54.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   55.9s finished


              precision    recall  f1-score   support

           1       0.49      0.21      0.29        92
           2       0.00      0.00      0.00        73
           3       0.40      0.02      0.04       107
           4       0.38      0.22      0.28       382
           5       0.73      0.96      0.83      1215

   micro avg       0.68      0.68      0.68      1869
   macro avg       0.40      0.28      0.29      1869
weighted avg       0.60      0.68      0.61      1869



In [1210]:
nc, nl = 0, 0
for comment in jl_file(DATA_DIR / data_file):
    if comment['lang'] == 'uk':
        nc += 1
    nl += 1

In [1211]:
MultinomialNB

0.1684308065148437