Приведём текущий работающий код классификатора.

In [2]:
import numpy as np
import scipy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

from collections import defaultdict
from functools import partial
from itertools import chain
import re
import string
import time
from tqdm.notebook import trange, tqdm

from filimdb_evaluation.score import load_dataset_fast

In [3]:
my_stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
                  'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
                  'themselves', 'what', 'which', 'who', 'whom', 'this',
                  'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
                  'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
                  'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
                  'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
                  'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
                  'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
                  'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
                  't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
                  've', 'y', 'ain', 'aren', "aren't", 'could', 'couldn', "couldn't", 'didn', "didn't", 'doesn',
                  "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                  "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
                  "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [4]:
def preprocessing(text):
    text = text.lower()
    remove_tags = re.compile(r'<.*?>')
    text = re.sub(remove_tags, '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(sym if (sym.isalnum() or sym in (" ", "'")) else f" {sym} " for sym in text)
    return text

In [5]:
def tokenize_text(text, stem=0):
    """
        arg: list of texts
        return: list of tokenized texts
    """
    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
                  'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
                  'themselves', 'what', 'which', 'who', 'whom', 'this',
                  'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
                  'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
                  'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
                  'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
                  'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
                  'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
                  'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
                  't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
                  've', 'y', 'ain', 'aren', "aren't", 'could', 'couldn', "couldn't", 'didn', "didn't", 'doesn',
                  "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                  "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
                  "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    tokenizer = re.compile(r"-?\d*[.,]?\d+|[?'\w]+|\S", re.MULTILINE | re.IGNORECASE)
    tokenized_text = tokenizer.findall(text)
    if stem == 0:
        return [token for token in tokenized_text if token not in stop_words]
    stem_dataset = [token[:stem] for token in tokenized_text if token not in stop_words]
    return stem_dataset

In [6]:
def preprocess_texts(dataset_texts):
    processed_texts = []
    for ind, text in enumerate(dataset_texts):
        prepared_text = preprocessing(text)
        tokenized_text = tokenize_text(prepared_text)
        processed_texts.append(tokenized_text)
    return processed_texts

In [7]:
def generate_ngrams(text, max_ngram=3):
    ngrams = []
    for token in text:
        ngrams.append((token, ))
    if max_ngram >= 2:
        for token in zip(text[:-1], text[1:]):
            ngrams.append(token)
    if max_ngram >= 3:
        for token in zip(text[:-2], text[1:-1], text[2:]):
            ngrams.append(token)
    return ngrams

In [8]:
def make_ngram_dataset(dataset, max_ngram=3):
    ngram_dataset = []
    for ind, text in enumerate(dataset):
        ngrams = generate_ngrams(text)
        ngram_dataset.append(ngrams)
    return ngram_dataset

In [9]:
def make_vocab(texts, max_df=0.5, min_df=3, min_tf=3, max_tokens=1000000):
    print("MAKING VOCAB")
    start = time.time()
    df_cnt = defaultdict(int)
    tf_cnt = defaultdict(int)
    total_documents = len(texts)
    # print(f"total_documents = {total_documents}")
    for text in texts:
        been = set()
        for token in text:
            if token not in been:
                been.add(token)
                df_cnt[token] += 1
            tf_cnt[token] += 1

    free_ind = 0
    w2ind = dict()
    vocab_tf = []
    tf_with_inds = []
    for word, tf in tf_cnt.items():
        df = df_cnt[word]
        if tf >= min_tf and df / total_documents <= max_df and df >= min_df:
            w2ind[word] = free_ind
            vocab_tf.append(tf)
            tf_with_inds.append((tf, word))
            free_ind += 1

    tf_with_inds.sort(key=lambda x: x[0], reverse=True)
    for tf, w in tf_with_inds[max_tokens:]:
        del w2ind[w]

    free_ind = 0
    w2ind_final = dict()
    vocab_tf_final = []
    for w, ind in w2ind.items():
        w2ind_final[w] = free_ind
        vocab_tf_final.append(tf_cnt[w])
        free_ind += 1

    vocab_tf_final = np.array(vocab_tf_final, dtype=np.float64)
    vocab_tf_prob = np.float_power(vocab_tf_final, 0.75)
    vocab_tf_prob /= vocab_tf_prob.sum()

    print(f"Finish vocab in {time.time() - start} seconds.")
    return w2ind_final, vocab_tf_prob

In [10]:
def make_inds_ngram_dataset(texts, w2ind, shuffle=True):
    ngrams_inds = []
    docs_inds = []
    for doc_ind, text in enumerate(texts):
        for ngram in text:
            if ngram in w2ind:
                ngrams_inds.append(w2ind[ngram])
                docs_inds.append(doc_ind)

    ngrams_inds = np.array(ngrams_inds)
    docs_inds = np.array(docs_inds)
    assert (len(ngrams_inds) == len(docs_inds))
    return ngrams_inds, docs_inds

In [11]:
def batch_generator(words_idxs, docs_idxs, probs, nb=5, batch_size=100, shuffle=True):
    # Let's generate all negative examples at once.
    
    neg_samples = np.random.choice(np.arange(len(probs)), size = (nb * len(words_idxs), ), p=probs)
    
#     print("pos_samples_len = ", len(docs_idxs), ", neg_samples = ", len(neg_samples))
    
    if shuffle:
        permutation = np.random.permutation(len(docs_idxs))
        words_idxs = words_idxs[permutation]
        docs_idxs = docs_idxs[permutation]
    
    end = (len(words_idxs) // batch_size - 1) * batch_size + 1
    for batch_start in range(0, end, batch_size):
        pos_batch = words_idxs[batch_start : batch_start + batch_size]
        docs_batch = docs_idxs[batch_start : batch_start + batch_size]
        pos_labels_batch = np.array([1 for _ in range(len(pos_batch))])
        yield (pos_batch, docs_batch, pos_labels_batch)
        for i in range(nb):
            neg_batch = neg_samples[batch_start + batch_size * i : batch_start + batch_size * (i + 1)]
            neg_labels_batch = np.array([0 for _ in range(len(neg_batch))])
            yield (neg_batch, docs_batch, neg_labels_batch)

In [12]:
class Doc2Vec:
    def __init__(self, vocab_size, docs_cnt, emb_size=500, train_start=0):
        self.word_embs = np.random.uniform(low=-0.001, high=0.001, size=(vocab_size, emb_size))
        self.docs_embs = np.random.uniform(low=-0.001, high=0.001, size=(docs_cnt, emb_size))
        self.vocab_size = vocab_size
        self.docs_cnt = docs_cnt
        self.emb_size = emb_size
        self.train_start = train_start
        self.bow = None

    def train(self, word_inds, doc_inds, labels, lr):
        word_batch_embs = self.word_embs[word_inds]
        doc_batch_embs = self.docs_embs[doc_inds]

        dot_prods = np.einsum('ij,ij->i', word_batch_embs, doc_batch_embs)
        y_pred = self.sigmoid(dot_prods)

        word_batch_grads = doc_batch_embs * (y_pred - labels).reshape(-1, 1)
        doc_batch_grads = word_batch_embs * (y_pred - labels).reshape(-1, 1)

        for ind, (w_ind, d_ind) in enumerate(zip(word_inds, doc_inds)):
            self.word_embs[w_ind] -= lr * word_batch_grads[ind]
            self.docs_embs[d_ind] -= lr * doc_batch_grads[ind]

        batch_loss = (-labels * np.log(y_pred) - (1 - labels) * np.log(1 - y_pred)).sum()

        return batch_loss

    def get_all_X(self):
        if self.bow is not None:
            full_embs = scipy.sparse.hstack((self.docs_embs, self.bow))
            full_embs = full_embs.tocsr()
            self.docs_embs = full_embs
        else:
            full_embs = self.docs_embs
            
        train_borders = (self.train_start, self.train_start + 15000)
        dev_borders = (train_borders[1], train_borders[1] + 10000)
        test_borders = (dev_borders[1], dev_borders[1] + 25000)
        X_train = full_embs[train_borders[0]: train_borders[1]]
        X_dev = full_embs[dev_borders[0]: dev_borders[1]]
        X_test = full_embs[test_borders[0]: test_borders[1]]

        return X_train, X_dev, X_test

    def set_bow_vectors(self, X_bow_full):
        self.bow = X_bow_full
    
    def sigmoid(self, x):
        return np.where(x > 0, 1.0 / (1.0 + np.exp(-x)), np.exp(x) / (np.exp(x) + 1.0))

In [13]:
def train_logreg(X_train, y_train, X_dev, y_dev):
    model = LogisticRegression(penalty='l2', max_iter=300, solver='liblinear', warm_start=True)
    
    log_border = 3
    C_values = np.logspace(-log_border, log_border, 20)
    params = {'C' : C_values}
    gs_clf = GridSearchCV(model, params, cv=10, n_jobs=4, verbose=2)
    gs_clf.fit(X_train, y_train)
    
    if gs_clf.best_params_['C'] in (C_values[0], C_values[-1]):
        print("C is on border!", gs_clf.best_params_['C'])
        log_border += 2
        C_values = np.logspace(-log_border, log_border, 20)
        params = {'C' : C_values}
        gs_clf = GridSearchCV(model, params, cv=10, n_jobs=4, verbose=1)
        gs_clf.fit(X_train, y_train)
    
    train_acc = gs_clf.best_score_
    
    best_model = gs_clf.best_estimator_
    dev_acc = best_model.score(X_dev, y_dev)
    return train_acc, dev_acc

In [14]:
def fit(d2v_epochs, base_d2v_lr=0.03, d2v_batch_size=100, d2v_nb=5):
    #------------------------------Data preparation started------------------------------
    all_data = load_dataset_fast()
    print()
    train_texts = all_data['train'][1]
    train_labels = all_data['train'][2]
    train_labels = np.array([int(lab == 'pos') for lab in train_labels], dtype=np.int32)

    dev_texts = all_data['dev'][1]
    dev_labels = all_data['dev'][2]
    dev_labels = np.array([int(lab == 'pos') for lab in dev_labels], dtype=np.int32)
    
    all_texts = list(chain(train_texts, dev_texts))
    preprocessed_texts = preprocess_texts(all_texts)
    ngram_texts = make_ngram_dataset(preprocessed_texts)
    
    w2ind_full, vocab_probs = make_vocab(ngram_texts, min_tf=1, max_df=0.6, min_df=1, max_tokens=1000000)
    
    print("len(w2ind_full)", len(w2ind_full))
    
    inds_texts = make_inds_ngram_dataset(ngram_texts, w2ind_full)
    
    doc2vec_model = Doc2Vec(len(w2ind_full), len(ngram_texts))
    
    #------------------------------Data preparation finished------------------------------
    
    train_accs = []
    dev_accs = []
    
    total_epoch_iterations = ((d2v_nb + 1) * len(inds_texts[0])) // d2v_batch_size
    
    total_iter = d2v_epochs * total_epoch_iterations
    
    loss_stat_border = 150000
    
    cur_iter = 0
    for ep in range(d2v_epochs):
        print("Start epoch #", ep + 1)
        print("Training Doc2Vec")
        batch_gen = batch_generator(inds_texts[0], inds_texts[1], probs=vocab_probs)
        avg_loss = 0.0
        for ind, (word_inds, doc_inds, labels) in enumerate(tqdm(batch_gen, total=total_epoch_iterations)):
            new_lr = base_d2v_lr * (1 - cur_iter * 1.0 / total_iter)
#             if ind % loss_stat_border == 0 and ind != 0:
#                 tqdm.write(f"new_lr: {new_lr}")
            batch_loss = doc2vec_model.train(word_inds, doc_inds, labels, new_lr)
            cur_iter += 1
            avg_loss += batch_loss
            if ind % loss_stat_border == 0 and ind != 0:
                tqdm.write(f"avg_loss: {avg_loss / loss_stat_border}")
                avg_loss = 0.0
            
        print("Starting logistic regression")
        
        X_train, X_dev, X_test = doc2vec_model.get_all_X()
        
        t_acc, d_acc = train_logreg(X_train, train_labels, X_dev, dev_labels)
        print("Train acc = ", t_acc, " Dev acc =  ", d_acc)
        train_accs.append(t_acc)
        dev_accs.append(d_acc)
    return train_accs, dev_accs

Теперь попробуем запустить этого монстра.

In [62]:
accs = fit(d2v_epochs=5)

Loading train set 
neg 7480
pos 7520
Loading dev set 
neg 5020
pos 4980
Loading test set 
unlabeled 25000

MAKING VOCAB
Finish vocab in 17.000880479812622 seconds.
len(w2ind_full) 1000000
Start epoch # 1
Training Doc2Vec


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=309556.0), HTML(value='')))

avg_loss: 69.31516219312935
avg_loss: 69.30627612093049

Starting logistic regression
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   23.0s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  1.2min finished


Train acc =  0.6047333333333333  Dev acc =   0.6144
Start epoch # 2
Training Doc2Vec


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=309556.0), HTML(value='')))

avg_loss: 68.55573649053017
avg_loss: 65.06104882432649

Starting logistic regression
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   32.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  4.1min finished


Train acc =  0.721  Dev acc =   0.7209
Start epoch # 3
Training Doc2Vec


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=309556.0), HTML(value='')))

avg_loss: 59.76488970962363
avg_loss: 55.44330884068975

Starting logistic regression
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   56.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  6.6min finished


Train acc =  0.7680666666666667  Dev acc =   0.7662
Start epoch # 4
Training Doc2Vec


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=309556.0), HTML(value='')))

avg_loss: 52.24754222319546
avg_loss: 50.35614107697979

Starting logistic regression
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  7.7min finished


Train acc =  0.7953333333333333  Dev acc =   0.7878
Start epoch # 5
Training Doc2Vec


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=309556.0), HTML(value='')))

avg_loss: 49.20957334184276
avg_loss: 48.7683816746832

Starting logistic regression
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  8.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


C is on border!
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.9min


KeyboardInterrupt: 

In [17]:
def fit_power(d2v_epochs, base_d2v_lr=0.03, d2v_batch_size=100, d2v_nb=5):
    #------------------------------Data preparation started------------------------------
    all_data = load_dataset_fast()
    print()
    train_texts = all_data['train'][1]
    train_labels = all_data['train'][2]
    train_labels = np.array([int(lab == 'pos') for lab in train_labels], dtype=np.int32)

    dev_texts = all_data['dev'][1]
    dev_labels = all_data['dev'][2]
    dev_labels = np.array([int(lab == 'pos') for lab in dev_labels], dtype=np.int32)
    
    all_texts = list(chain(train_texts, dev_texts))
    preprocessed_texts = preprocess_texts(all_texts)
    ngram_texts = make_ngram_dataset(preprocessed_texts)
    
    w2ind_full, vocab_probs = make_vocab(ngram_texts, min_tf=2, max_df=0.6, min_df=2, max_tokens=2000000)
    
    print("len(w2ind_full)", len(w2ind_full))
    vocab_ngrams_stat = [0, 0, 0]
    for ind, ngram in enumerate(w2ind_full.keys()):
        try:
            vocab_ngrams_stat[len(ngram) - 1] += 1
        except IndexError as e:
            print("wrong index: ", len(ngram))
            print(ind, ngram, w2ind_full[ngram])
            exit(0)
    print("total 1gram: ", vocab_ngrams_stat[0], ", 2gram: ", vocab_ngrams_stat[1], ", 3gram: ", vocab_ngrams_stat[2])
    
    
    inds_texts = make_inds_ngram_dataset(ngram_texts, w2ind_full)
    
    doc2vec_model = Doc2Vec(len(w2ind_full), len(ngram_texts))
    
    #------------------------------Data preparation finished------------------------------
    
    #------------------------------Fitting bag of words started------------------------------
    print()
    print("Started TFIDF")
    vectorizer = TfidfVectorizer(ngram_range=(1,3), max_df=0.6, min_df=2, vocabulary=w2ind_full, stop_words=my_stop_words)
    X_full_bow = vectorizer.fit_transform(all_texts)
#     X_train_bow = X_full_bow[:15000]
#     X_dev_bow = X_full_bow[15000: 15000 + 10000]
    
    doc2vec_model.set_bow_vectors(X_full_bow)
    #------------------------------Fitting bag of words finished------------------------------
    
    train_accs = []
    dev_accs = []
    
    total_epoch_iterations = ((d2v_nb + 1) * len(inds_texts[0])) // d2v_batch_size
    
    total_iter = d2v_epochs * total_epoch_iterations
    
    loss_stat_border = 150000
    
    cur_iter = 0
    for ep in range(d2v_epochs):
        print("Start epoch #", ep + 1)
        print("Training Doc2Vec")
        batch_gen = batch_generator(inds_texts[0], inds_texts[1], probs=vocab_probs)
        avg_loss = 0.0
        for ind, (word_inds, doc_inds, labels) in enumerate(tqdm(batch_gen, total=total_epoch_iterations)):
            new_lr = base_d2v_lr * (1 - cur_iter * 1.0 / total_iter)
#             if ind % loss_stat_border == 0 and ind != 0:
#                 tqdm.write(f"new_lr: {new_lr}")
            batch_loss = doc2vec_model.train(word_inds, doc_inds, labels, new_lr)
            cur_iter += 1
            avg_loss += batch_loss
            if ind % loss_stat_border == 0 and ind != 0:
                tqdm.write(f"avg_loss: {avg_loss / loss_stat_border}")
                avg_loss = 0.0
            
        print("Starting logistic regression")
        
        X_train, X_dev, X_test = doc2vec_model.get_all_X()
        
        t_acc, d_acc = train_logreg(X_train, train_labels, X_dev, dev_labels)
        print("Train acc = ", t_acc, " Dev acc =  ", d_acc)
        train_accs.append(t_acc)
        dev_accs.append(d_acc)
    return train_accs, dev_accs

In [18]:
accs = fit_power(d2v_epochs=10, base_d2v_lr=0.05)

Loading train set 
pos 7520
neg 7480
Loading dev set 
pos 4980
neg 5020
Loading test set 
unlabeled 25000

MAKING VOCAB
Finish vocab in 14.174696207046509 seconds.
len(w2ind_full) 419822
total 1gram:  54300 , 2gram:  289564 , 3gram:  75958

Started TFIDF
Start epoch # 1
Training Doc2Vec


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=273747.0), HTML(value='')))

avg_loss: 69.27357481270568

Starting logistic regression
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    7.1s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   44.3s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  1.3min finished


Train acc =  0.7009333333333334  Dev acc =   0.6981
Start epoch # 2
Training Doc2Vec


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=273747.0), HTML(value='')))

avg_loss: 48.49030135610227

Starting logistic regression
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  4.7min finished


C is on border! 1000.0
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.4s


KeyboardInterrupt: 