# Исследование векторных моделей представления текстовых данных на основе набора алгоритмов *word2vec*

## Постановка задачи:
- Провести экспериментальное исследование векторных моделей представления текстовых данных на основе набора алгоритмов *word2vec* в задаче одноклассовой классификации текстовых документов;
- Сравнить качество классификации текстовых документов с использованием исследуемых моделей и модели «мешок слов».

### Применение *doc2vec*

In [1]:
%matplotlib inline
import os
import numpy as np
import pickle
import multiprocessing
from random import shuffle
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_curve, auc, roc_auc_score, average_precision_score
import matplotlib.pyplot as plt
from sklearn.svm import OneClassSVM
from gensim.models import doc2vec
from gensim.models.doc2vec import LabeledSentence
from sklearn.feature_extraction.text import TfidfVectorizer

Создаём итератор, который необходим для дальнейшего использования модели *doc2vec*

In [2]:
class DocsIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            #for lines in doc:
            doc_words = doc.split()
            #shuffle(doc_words)
            yield LabeledSentence(words=doc_words , tags=[self.labels_list[idx]])

### Набор данных
Используем набор данных Enron. Это тексты из переписки 15 сотрудников компании Enron за 2000 и 2001 годы,всего 11941 текстовый документ, которые были разбиты на 118 экспериментальных диапазонов. Использовались оригинальный набор и обработанный с помощью алгоритма, разработанного в лаборатории ТП АСВК.

Считывание данных. Создаём текстовый корпус и метки для задачи одноклассовой классификации:

In [3]:
def read_data(user_id, element_list, path, data, labels = None):
    for docs in element_list:
        with open(path + '/' + str(docs) + '.term', 'r') as file:
            data.append(file.read())
            if labels != None:
                labels.append(user_id + docs)
        file.close()
    return 

In [4]:
def fill_data_labels(data_list, dataset_path, inf_shrink):
    data = [] 
    labels = []  
    for (user_name, user_id, element_list) in data_list:
        path = os.path.join(dataset_path, user_name, inf_shrink)
        read_data(user_id, element_list, path, data, labels)
    return data, labels

Создаём и обучаем модель *doc2vec*

Пробуем использовать [control the learning rate](https://rare-technologies.com/doc2vec-tutorial/#training)

In [5]:
def create_model(train_list, dataset_path, inf_shrink, dm, mode, size, window):
    data, labels =  fill_data_labels(train_list, dataset_path, inf_shrink)
    it = DocsIterator(data, labels)
    cores = multiprocessing.cpu_count()
    if dm == 1:
        if mode is 'dm_mean':
            #alpha=0.025, min_alpha=0.025,
            model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, dm = dm, dm_mean = 1, \
                                    size = size, window = window, min_count = 2, workers=cores) 
        elif mode is 'dm_concat':
            model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, dm = dm, dm_concat = 1, \
                                    size = size, window = window, min_count = 2, workers=cores) 
        else:
            model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, dm = dm, size = size, \
                                    window = window, min_count = 2, workers=cores) 
    else:
         model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, dm = dm, size = size, \
                                 min_count = 2, workers=cores)      
    model.build_vocab(it)
    for epoch in range(10):
        model.train(it)
        model.alpha -= 0.002 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay
    return model

### Решаем задачу одноклассовой классификации для разных экспериментальных диапозонов.
`clf_mode`: пробуем классификаторы **k-Nearest-Neighbor**  и **One-Class Support Vector Machine**. 

`dm`, `size`, `window`: пробуем разные варианты моделей *doc2vec*: Distributed Memory version of Paragraph Vectors (PV-DM) и Distributed Bag of Words version of Paragraph Vector (PV-DBOW), разные размеры окна и размерности пространства.

Метрика качества **ROC AUC**

In [9]:
def process(dataset_path, clf_mode = 'kNN', dm = 1, mode = '', size = 10, window = 10):
    auc_list1 = []
    auc_list2 = []
    auc_list3 = []
    count = 0
    for target_user_name in os.listdir(dataset_path):
        user_path = os.path.join(dataset_path, target_user_name)
        partition_path = os.path.join(user_path, 'partition.pickle')

        partition = None
        with open(partition_path, 'rb') as f:
            partition = pickle.load(f)

        if partition is None or not partition:
            continue

        inf_shrink = 'nmf_10'
        inf_shrink_path = os.path.join(user_path, inf_shrink)

        if not os.path.isdir(inf_shrink_path):  # Skip .pickle
            continue

        for (ds_time_series_train_list, ds_time_series_test_list) in partition:
            #vectors
            X_train = []
            X_test = []
            #labels
            y_train = []
            y_test= []
            count += 1
            model = create_model(ds_time_series_train_list, dataset_path, inf_shrink, dm, mode, size, window)

            for (user_name, user_id, element_list) in ds_time_series_train_list:
                # user_name
                # user_id
                # element_list - list of file id for user
                
                train_path = os.path.join(dataset_path, user_name, inf_shrink)
                for docs in element_list:
                    label = user_id + docs
                    X_train.append(model.docvecs[label])
                    if user_name == target_user_name:
                        y_train.append(1)
                    else:
                        y_train.append(-1)                    

            for (user_name, user_id, element_list) in ds_time_series_test_list:
                # Test docs
                if(len(element_list) == 0):
                    continue
                test_path = os.path.join(dataset_path, user_name, inf_shrink)
                test_data = []
                read_data(user_id, element_list, test_path, test_data)
                for doc, elems in zip(test_data, element_list):
                    words = doc.split()
                    #shuffle(words)
                    X_test.append(model.infer_vector(words))
                    if user_name == target_user_name:
                        y_test.append(1)
                    else:
                        y_test.append(-1) 
                            
            if clf_mode == 'kNN':           
                auc_score = []
                for n_neighbors in range(1, 4):              
                    clf = NearestNeighbors(n_neighbors = n_neighbors, algorithm='auto')
                    clf.fit(X_train)
                    (distances, indices) = clf.kneighbors(X_test)
                    test_score = []
                    for test_i in distances:
                        test_score.append(max(test_i))
                    roc_auc = roc_auc_score(y_test, test_score)
                    auc_score.append(1 - roc_auc)
                auc_list1.append(auc_score[0])
                auc_list2.append(auc_score[1])
                auc_list3.append(auc_score[2])
            else:
                clf = OneClassSVM(kernel='rbf')
                clf.fit(X_train)
                test_score = clf.decision_function(X_test)
                fpr, tpr, thresholds = roc_curve(y_test, test_score, pos_label = 1)
                roc_auc = auc(fpr, tpr)
                roc_auc = roc_auc_score(y_test, test_score)
                auc_list1.append(roc_auc)

    if clf_mode == 'kNN':
        return np.median(auc_list1), np.median(auc_list2), np.median(auc_list3)
    
    return np.median(auc_list1)

Пример запуска:

In [51]:
res = process('./student/', clf_mode = 'SVM', dm = 0, mode = '', size = 10)

0.921221231817


### Сравнение с bag-of-words
Для сравнения реализуем подход bag-of-words

In [9]:
def build_vocab(data):
    others_data = []
    for text in data:
        words = text.split()
        vocab = []
        for word in words:
            if words.count(word) < 2:
                text = text.replace(word, '_others')
            else:
                vocab.append(word)
        others_data.append(text)  
    return others_data, vocab

In [10]:
def bow_others(data, vocab):
    others_data = []
    for text in data:
        words = text.split()
        for word in words:
            if words.count(word) < 2 or vocab.count(word) < 1:
                text = text.replace(word, '_others')
        others_data.append(text)  
    return others_data

In [19]:
def bow_process(dataset_path, clf_mode = 'kNN'):
    auc_list1 = []
    auc_list2 = []
    auc_list3 = []
    count = 0
    for target_user_name in os.listdir(dataset_path):
        user_path = os.path.join(dataset_path, target_user_name)
        partition_path = os.path.join(user_path, 'partition.pickle')

        partition = None
        with open(partition_path, 'rb') as f:
            partition = pickle.load(f)

        if partition is None or not partition:
            continue

        inf_shrink = 'nmf_10'
        inf_shrink_path = os.path.join(user_path, inf_shrink)

        if not os.path.isdir(inf_shrink_path):  # Skip .pickle
            continue

        for (ds_time_series_train_list, ds_time_series_test_list) in partition:
            #labels
            y_train = []
            y_test= []
            count += 1
    
            vectorizer = TfidfVectorizer(use_idf = False, lowercase =  False)
            for (user_name, user_id, element_list) in ds_time_series_train_list:
                # user_name
                # user_id
                # element_list - list of file id for user
                train_path = os.path.join(dataset_path, user_name, inf_shrink)
                data = []
                read_data(user_id, element_list, train_path, data)
                train_data, vocab = build_vocab(data)
                X_train = vectorizer.fit_transform(train_data).toarray()
                for docs in element_list:
                    if user_name == target_user_name:
                        y_train.append(1)
                    else:
                        y_train.append(-1)   
            
            fl = True
            for (user_name, user_id, element_list) in ds_time_series_test_list:
                # Test docs
                if(len(element_list) == 0):
                    continue
                test_path = os.path.join(dataset_path, user_name, inf_shrink)
                data2 = []
                read_data(user_id, element_list, test_path, data2)
                test_data = bow_others(data2, vocab)
                if fl:
                    X_test = vectorizer.transform(test_data).toarray()
                    fl = False
                else:
                    X_test = np.concatenate((X_test, vectorizer.transform(test_data).toarray()), axis = 0)
                for doc, elems in zip(test_data, element_list):
                    if user_name == target_user_name:
                        y_test.append(1)
                    else:
                        y_test.append(-1)
         
            if clf_mode == 'kNN':           
                auc_score = []
                for n_neighbors in range(1, 4):              
                    clf = NearestNeighbors(n_neighbors = n_neighbors, algorithm='auto')
                    clf.fit(X_train)
                    (distances, indices) = clf.kneighbors(X_test)
                    test_score = []
                    for test_i in distances:
                        test_score.append(max(test_i))
                    fpr, tpr, thresholds = roc_curve(y_test, test_score, pos_label = 1)
                    roc_auc = auc(fpr, tpr)
                    roc_auc = roc_auc_score(y_test, test_score)
                    auc_score.append(1 - roc_auc)
                auc_list1.append(auc_score[0])
                auc_list2.append(auc_score[1])
                auc_list3.append(auc_score[2])  
                print(auc_score[0], auc_score[1], auc_score[2])
            else:
                clf = OneClassSVM(kernel = 'linear')
                clf.fit(X_train)
                test_score = clf.decision_function(X_test)
                roc_auc = roc_auc_score(y_test, test_score)
                auc_list1.append(roc_auc)
    
    if clf_mode == 'kNN':
        return np.median(auc_list1), np.median(auc_list2), np.median(auc_list3)
    
    return np.median(auc_list1), np.subtract(*np.percentile(auc_list1, [75, 25]))

Пример запуска (медиана результатов и iqr):

In [20]:
res = bow_process('./student/', clf_mode = 'SVM')

0.704344270144 0.153195270473
