# Libraries

In [24]:
#Libraries to manage text data 

## SKLearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## NLTK
from nltk.stem import RSLPStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('rslp')
stemmer_pt = RSLPStemmer()
stemmer_en = PorterStemmer()
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize

## Python
import string

## Gensim
import gensim
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#Libraries to manage the file system
import os

#Other libraries
from tqdm import tqdm
import numpy as np
import scipy
import joblib
from abc import ABCMeta, abstractmethod
import json
import pandas as pd


[nltk_data] Downloading package stopwords to /home/rafael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rafael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /home/rafael/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


# Loading the Stopwords

In [25]:
stopwords_pt = set({})
stopwords_en = set({})
path_stop_pt = './stopPort.txt'
path_stop_en = './stopIngl.txt'

if(os.path.exists(path_stop_pt) and os.path.exists(path_stop_en)): 
    with open(path_stop_pt) as file_stop_pt:
        for line in file_stop_pt.readlines():
            stopwords_pt.add(line.strip())
    with open(path_stop_en) as file_stop_en:
        for line in file_stop_en.readlines():
            stopwords_en.add(line.strip())
else: 
    stopwords_pt = set(stopwords.words('portuguese'))
    stopwords_en = set(stopwords.words('english'))

# Class to Tokenize and Clean the Texts

In [26]:
config['pre-processing_steps']['lower_case'] = True

In [27]:
config['pre-processing_steps']

{'language': 'en',
 'remove_stopwords': True,
 'remove_punctuation': True,
 'convert_numbers': True,
 'remove_numbers': True,
 'simplification': True,
 'simplification_type': 'stemming',
 'lower_case': True}

In [28]:
text_preprocessor = TextPreprocessor(**config['pre-processing_steps'])

In [29]:
print('Language:', text_preprocessor.language)
print('Remove stopwords:', text_preprocessor.remove_stopwords)
print('Remove puctuation:', text_preprocessor.remove_punctuation)
print('Convert numbers:', text_preprocessor.convert_numbers)
print('Remove numbers:', text_preprocessor.remove_numbers)
print('Simplification:', text_preprocessor.simplification)
print('Simplification type:', text_preprocessor.simplification_type)
print('Lower case:', text_preprocessor.lower_case)

Language: en
Remove stopwords: True
Remove puctuation: True
Convert numbers: True
Remove numbers: True
Simplification: True
Simplification type: stemming
Lower case: True


In [30]:
class TextPreprocessor(object): 
    
    def __init__(self, language='en', remove_stopwords=True, remove_punctuation=True, 
                 convert_numbers = True, remove_numbers = False, simplification=True, 
                 simplification_type='lemmatization', lower_case = True): 
        self.language = language
        self.remove_stopwords = remove_stopwords
        self.remove_punctuation = remove_punctuation
        self.convert_numbers = convert_numbers
        self.remove_numbers = remove_numbers
        self.simplification = simplification
        self.simplification_type = simplification_type 
        self.lower_case = lower_case


    # Complete function to standardize the text
    def text_cleaner(self, text): 
        new_text = ''
        stopwords = None 

        if self.language == 'en':
            stopwords = stopwords_en 
        else:
            stopwords = stopwords_pt

        if self.lower_case == True: 
            text = text.lower()

        tokens = nltk.word_tokenize(text)
        
        if self.remove_stopwords == True:
            new_tokens = []
            for token in tokens: 
                if token in stopwords:
                    continue 
                else: 
                    new_tokens.append(token)
            tokens = new_tokens 

        if self.remove_punctuation == True: 
            new_tokens = []
            for token in tokens: 
                if token in string.punctuation:
                    continue 
                else: 
                    new_tokens.append(token)
            tokens = new_tokens 
        
        if self.remove_numbers == True:
            new_tokens = []
            for token in tokens: 
                if token.isnumeric():
                    continue
                new_tokens.append(token)
            tokens = new_tokens 
        
        if self.convert_numbers == True: 
            new_tokens = []
            for token in tokens: 
                if token.isnumeric():
                    new_tokens.append("0"*len(token))
                else: 
                    new_tokens.append(token)
            tokens = new_tokens 

        if self.simplification == True: 
            new_tokens = []
            if self.language == 'en': 
                if self.simplification_type  == 'lemmatization':
                    for token in tokens: 
                        new_tokens.append(lemmatizer.lemmatize(token))
                elif self.simplification_type  == 'stemming':
                    for token in tokens: 
                        new_tokens.append(stemmer_en.stem(token))
                else: 
                    raise ValueError('Unsuported language. Please, use language = {"pt","en"}.')
            elif self.language == 'pt':
                for token in tokens: 
                        new_tokens.append(stemmer_en.stem(token))
            else: 
                raise ValueError('Unsuported language. Please, use language = {"pt","en"}.')
            tokens = new_tokens

        return ' '.join(tokens).strip()


    #Just a simple tokenizer
    def tokenizer(self, text):
        text = text.lower()
        lista_alfanumerica = []

        for token in nltk.word_tokenize(text):
            if token in string.punctuation:
                continue 
            if token in stopwords_en: 
                continue
            if token.isnumeric():
                token = "0"*len(token)

            lista_alfanumerica.append(token)
        return lista_alfanumerica

In [31]:
def tokenizer(self, text):
    text = text.lower()
    lista_alfanumerica = []

    for token in nltk.word_tokenize(text):
        if token in string.punctuation:
            continue 
        if token in stopwords_en: 
            continue
        if token.isnumeric():
            token = "0"*len(token)

        lista_alfanumerica.append(token)
    return lista_alfanumerica

In [32]:
def text_cleaner(self, text, language='en', remove_stopwords=True, remove_punctuation=True, 
                 convert_numbers = True, remove_numbers = False, simplification=True, 
                 simplification_type='lemmatization', lower_case = True): 
    new_text = ''
    stopwords = None 
    if self.language == 'en':
        stopwords = stopwords_en 
    else:
        stopwords = stopwords_pt

    if lower_case == True: 
        text = text.lower()

    tokens = nltk.word_tokenize(text)
    
    if remove_stopwords == True:
        new_tokens = []
        for token in tokens: 
            if token in stopwords:
                continue 
            else: 
                new_tokens.append(token)
        tokens = new_tokens 

    if remove_punctuation == True: 
        new_tokens = []
        for token in tokens: 
            if token in string.punctuation:
                continue 
            else: 
                new_tokens.append(token)
        tokens = new_tokens 
    
    if convert_numbers == True: 
        new_tokens = []
        for token in tokens: 
            if token.isnumeric():
                token = "0"*len(token)
            new_tokens.append(token)
        tokens = new_tokens 

    if remove_numbers == True: 
        new_tokens = []
        for token in tokens: 
            if token.isnumeric():
                continue
            new_tokens.append(token)
        tokens = new_tokens 

    if simplification == True: 
        new_tokens = []
        if language == 'en': 
            if simplification_type  == 'lemmatization':
                for token in tokens: 
                    new_tokens.append(lemmatizer.lemmatize(token))
            elif simplification_type  == 'stemming':
                for token in tokens: 
                    new_tokens.append(stemmer_en.stem(token))
            else: 
                raise ValueError('Unsuported language. Please, use language = {"pt","en"}.')
        elif language == 'pt':
            for token in tokens: 
                    new_tokens.append(stemmer_en.stem(token))
        else: 
            raise ValueError('Unsuported language. Please, use language = {"pt","en"}.')
        tokens = new_tokens

    return ' '.join(tokens).strip()



# Functions to Save and Load the Presentations

In [33]:
def save_representation(representation, path): 
        joblib.dump(representation,path)
        
def load_representation(path): 
    return joblib.load(path)

# Class to Generate a Standard Representation for Different Space Vector Models

In [34]:
class StructuredRepresentation():

    def __init__(self, doc_vectors=None, class_vectors=None, vocabulary=None): 
        self.text_vectors = doc_vectors
        self.class_vectors = class_vectors 
        self.vocabulary = vocabulary

  
    def save_arff(self, name, path, non_sparse_format = False):
        num_docs = self.text_vectors.shape[0]
        num_attrs = self.text_vectors.shape[1]
        with open(path, 'w') as arff: 
            #Writting the relation
            arff.write(f'@relation {name}\n\n')
            
            #Writting the attributes
            if self.vocabulary == None: 
                for attr in range(num_attrs): 
                    arff.write(f'@ATTRIBUTE dim{attr + 1} NUMERIC\n')
            else: 
                sorted_vocabulary = sorted(self.vocabulary.items(), key=lambda x: x[1])
                for attr in range(num_attrs): 
                    arff.write(f'@ATTRIBUTE {sorted_vocabulary[attr][0]} NUMERIC\n')
            
            #Writting the class names
            arff.write('@ATTRIBUTE att_class ' + '{"' + '","'.join(self.class_vectors.unique()) + '"}\n\n')


            #Writting the data
            arff.write('@data\n\n')

            if non_sparse_format == False: 
                for doc in range(num_docs):
                    vector = self.text_vectors[doc]
                    if type(vector) == scipy.sparse.csr.csr_matrix: 
                        vector = self.text_vectors[doc].toarray()[0]
                    str_vec = ''
                    for i in range(vector.shape[0]): 
                        str_vec += str(vector[i]) + ','
                    classe = self.class_vectors[doc]
                    arff.write(str_vec + '"' + classe + '"\n') 
            else: 
                for doc in range(num_docs):
                    vector = self.text_vectors[doc]
                    if type(vector) == scipy.sparse.csr.csr_matrix: 
                        vector = self.text_vectors[doc].toarray()[0]
                    str_vec = ''
                    for i in range(vector.shape[0]): 
                        if vector[i] > 0: 
                            str_vec += f'{i} {str(vector[i])},'
                    classe = self.class_vectors[doc]
                    arff.write('{' + str_vec + str(num_attrs) + ' "' + classe + '"}\n') 
    

# Classes to Generate Vector Space Model Based Representaions 

## Bag-of-Words or Bag-of-N-Grams

In [48]:
class MySparseVSM: 

    def __init__(self, weight='tf', n_grams=1):
        self.vectorizer = None 
        if(weight == 'tf'):
            self.vectorizer = CountVectorizer(min_df=2, ngram_range=(1, n_grams), dtype=np.uint8)
        else:
            self.vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, n_grams), dtype=np.uint8)

        self.structured_representation = None

    def build_representation(self, texts, classes): 
        self.structured_representation = StructuredRepresentation(self.vectorizer.fit_transform(texts), classes, self.vectorizer.vocabulary_)
        return self.structured_representation

## Low Dimensional Representations

### SuperClass

In [36]:
class LowDimensionalRepresentation(object):
    
    def __init__(self, dim_size = 200, model = None, num_threads=1, min_count = 2, window_size = 5): 
        __metaclass__  = ABCMeta
        self.dim_size = dim_size
        self.model = model 
        self.num_threads = num_threads 
        self.min_count = min_count
        self.window_size = window_size

    @abstractmethod
    def build_representation(self, texts, classes): 
        pass    

In [37]:
class MyWord2Vec (LowDimensionalRepresentation):
    
    def __init__(self, dim_size = 200, model = 'skip-gram', method='average', num_threads=1, min_count = 2, window_size = 5): 
        super(MyWord2Vec,self).__init__(dim_size,model,num_threads,min_count,window_size)
        self.language_model = None 
        self.cg = None 

    def build_model(self, texts):
        language_model = None

        sg = 0
        if self.model == 'cbow' : 
            language_model = gensim.models.Word2Vec
        elif self.model == 'skip-gram': 
            language_model = gensim.models.Word2Vec
            sg = 1
        #elif self.model == 'glove': 
            #self.language_model = gensim.models.Word2Vec(list_tokens_texts,min_count=min_count,window=window_size, size=dim_size, workers=num_threads)
        elif self.model == 'fasttext': 
            language_model = gensim.models.FastText
        else: 
            raise ValueError('Unsuported language model. Please, use language model = {"cbow","skip-gram"","glove"","fasttext"}.')

        list_tokens_texts = texts.apply(self.tokenizer)
        self.language_model = language_model(list_tokens_texts,sg=sg, min_count=self.min_count,window=self.window_size, size=self.dim_size, workers=self.num_threads)

    def build_representation(self, texts, classes): 
        self.build_language_model(texts)
        matrix = np.zeros((len(texts),self.dim_size))

        for i in range(len(texts)):
            tokens = self.tokenizer(texts.iloc[i])
            matrix[i] = self.sum_vectors(tokens)


        self.structured_representation = StructuredRepresentation(matrix, classes, list(self.language_model.wv.vocab))
        return self.structured_representation

    def tokenizer(self,text):
        text = text.lower()
        lista_alfanumerica = []

        for token in nltk.word_tokenize(text):
            if token in string.punctuation:
                continue 
            if token in stopwords_en: 
                continue
            if token.isnumeric():
                token = "0"*len(token)

            lista_alfanumerica.append(token)
        return lista_alfanumerica

    def sum_vectors(self,lista_tokens): 
        vetor_combinado = np.zeros(self.dim_size)
        for token in lista_tokens: 
            try:
                vetor_combinado += self.language_model.wv.get_vector(token)
            except KeyError:
                if token.isnumeric():
                    token = "0"*len(token)
                    vetor_combinado += self.language_model.wv.get_vector(token)
                else:
                    token = "0"*len(token)
                    vetor_combinado += self.language_model.wv.get_vector("unknown")
        return vetor_combinado

In [70]:
class MyDoc2Vec (LowDimensionalRepresentation):
    
    def __init__(self, dim_size = 200, model = 'dm', method='average', num_threads=4, alpha = 0.025, min_alpha=0.0001, num_max_epochs = 2000,min_count = 1, window_size = 5): 
        super(MyDoc2Vec,self).__init__(dim_size,model,num_threads,min_count,window_size)
        
        self.num_threads = num_threads
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.num_max_epochs = num_max_epochs
        self.model = model

        self.dm = -1
        if model == 'dbow':
            self.dm = 0
        elif model == 'dm':
            self.dm = 1
        elif model != 'both':
            raise ValueError('Unsuported model. Please, use model = {"dm","dbow"}.')
        
        self.dm_mean = 1
        if method == 'average': 
            self.dm_concat = 0
        elif method == 'concat':
            self.dm_concat = 1
        else:
            raise ValueError('Unsuported method. Please, use method = {"concat"","average"}.')
        
        #standard parameters
        self.hs = 0
        self.dbow_words = 0
        
    
    def build_model(self, texts): 
        
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(texts)]
        if self.model == 'dm' or self.model == 'dbow': 
            model = Doc2Vec(vector_size=self.dim_size, alpha=self.alpha, min_alpha=self.min_alpha, 
                            min_count=self.min_count, dm=self.dm, workers = self.num_threads,
                            dm_min = self.dm_mean, dm_concat = self.dm_concat,
                            dbow_words = self.dbow_words, hs=self.hs, epochs=self.num_max_epochs, seed=1)
            model.build_vocab(tagged_data)
            model.train(tagged_data,total_examples=model.corpus_count,epochs=model.iter)
            
            #Reduce memory usage
            model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

            matrix = np.zeros((len(texts),self.dim_size))
            for i in range(model.corpus_count): 
                matrix[i] = model.docvecs[str(i)]
            return matrix
        elif self.model == 'both': 
            modelDM = Doc2Vec(vector_size=self.dim_size, alpha=self.alpha, min_alpha=self.min_alpha, 
                            min_count=self.min_count, dm=1, workers = self.num_threads,
                            dm_min = self.dm_mean, dm_concat = self.dm_concat,
                            dbow_words = self.dbow_words, hs=self.hs, epochs=self.num_max_epochs, seed=1)
            modelDBOW = Doc2Vec(vector_size=self.dim_size, alpha=self.alpha, min_alpha=self.min_alpha, 
                            min_count=self.min_count, dm=0, workers = self.num_threads,
                            dm_min = self.dm_mean, dm_concat = self.dm_concat,
                            dbow_words = self.dbow_words, hs=self.hs, epochs=self.num_max_epochs, seed=1)
                        
            modelDM.build_vocab(tagged_data)
            modelDBOW.build_vocab(tagged_data)

            modelDM.train(tagged_data,total_examples=modelDM.corpus_count,epochs=modelDM.iter)
            modelDBOW.train(tagged_data,total_examples=modelDBOW.corpus_count,epochs=modelDBOW.iter)
            
            #Reduce memory usage
            modelDM.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
            modelDBOW.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)


            matrixDM = np.zeros((len(texts),self.dim_size))
            for i in range(modelDM.corpus_count): 
                matrixDM[i] = modelDM.docvecs[str(i)]
            matrixDBOW = np.zeros((len(texts),self.dim_size))
            for i in range(modelDBOW.corpus_count): 
                matrixDBOW[i] = modelDBOW.docvecs[str(i)]
            
            matrix = np.concatenate([matrixDM, matrixDBOW], axis=1)
            return matrix

  

    def build_representation(self, texts, classes): 
        self.structured_representation = StructuredRepresentation(self.build_model(texts), classes, None)
        return self.structured_representation



## Based on Word Embeddings

# Área de Testes

In [39]:
import pandas as pd 
path = '/home/rafael/Área de Trabalho/Produção Científica/JIPM 2020/Revisao V1/complete_texts_csvs/CSTR.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence


In [40]:
myDoc2Vec = MyDoc2Vec(dim_size=100, num_max_epochs=100, min_count=5)
representation = myDoc2Vec.build_representation(df['text'].apply(text_cleaner),df['class'])
representation.save_arff('teste', 'teste.arff')

SyntaxError: invalid syntax (<ipython-input-40-a06689b9c173>, line 1)

In [86]:
# Criando um dicionario (versão completa)
config = {}
config['csvs_diretory'] = '/home/rafael/Área de Trabalho/Produção Científica/JIPM 2020/Revisao V1/complete_texts_csvs'
config['output_directory'] = './teste'
config['text_column'] = 'text'
config['class_column'] = 'class'
config['pre-processing'] = True
config['pre-processing_steps'] = {'language' : 'en', 'remove_stopwords' : True, 'remove_punctuation' : True, 
                 'convert_numbers' : True, 'remove_numbers' : True, 'simplification' : True, 
                 'simplification_type' : 'lemmatization', 'lower_case' : True}
config['sparse_representation'] = {'use': True, 'n-grams' : [1], 'term-weights' : ['tf', 'tf-idf']}
config['low-dimension_representation'] = {'use' : True, 'types' : ['doc2vec', 'word2vec'] ,
                                          'doc2vec_config' : {'models': ['dm', 'dbow', 'both'], 'methods' : ['average','concat'], 
                                          'num_threads': 4, 'alpha' : 0.025, 'min_alpha' : 0.001,
                                          'num_max_epochs' : [1, 3, 100, 1000], 'min_count' : 1, 'window_sizes' : [5, 8, 10], 
                                          'num_dimensions' : [25,  50, 100, 500, 1000] }}
config['save-arff'] = True 
config['save-binary'] = False


In [88]:
# Criando um dicionario (versão simplificada)
config = {}
config['csvs_diretory'] = '/home/rafael/Área de Trabalho/Produção Científica/JIPM 2020/Revisao V1/complete_texts_csvs_temp/temp1'
config['output_directory'] = './teste'
config['text_column'] = 'text'
config['class_column'] = 'class'
config['pre-processing'] = True
config['pre-processing_steps'] = {'language' : 'en', 'remove_stopwords' : True, 'remove_punctuation' : True, 
                 'convert_numbers' : True, 'remove_numbers' : True, 'simplification' : True, 
                 'simplification_type' : 'lemmatization', 'lower_case' : True}
config['sparse_representation'] = {'use': False, 'n-grams' : [1], 'term-weights' : ['tf', 'tf-idf']}
config['low-dimension_representation'] = {'use' : True, 'types' : ['doc2vec', 'word2vec'] ,
                                          'doc2vec_config' : {'models': ['dm', 'both'], 'methods' : ['average','concat'], 
                                          'num_threads': 4, 'alpha' : 0.025, 'min_alpha' : 0.0001,
                                          'num_max_epochs' : [100], 'min_count' : 2, 'window_sizes' : [5, 10], 
                                          'num_dimensions' : [100, 200]}}
config['save-arff'] = True 
config['save-binary'] = False


In [75]:
# Saving the Json
with open('config.json', 'w') as outfile:
    json.dump(config, outfile, indent=4, ensure_ascii=False,)

In [76]:
# Loading the Json
with open('config.json') as json_file:
    new_dict = json.load(json_file)

In [90]:
new_dict

{'csvs_diretory': '/home/rafael,teste',
 'text_column': 'text',
 'class_column': 'class',
 'pre-processing': True,
 'pre-processing_steps': {'language': 'en',
  'remove_stopwords': True,
  'remove_punctuation': True,
  'convert_numbers': True,
  'remove_numbers': True,
  'simplification': True,
  'simplification_type': 'lemmatization',
  'lower_case': True},
 'sparse_representation': {'use': True,
  'n-grams': [1],
  'term-weight': ['tf', 'tf-idf']},
 'low-dimension_representation': {'use': True,
  'types': ['doc2vec', 'word2vec'],
  'doc2vec_config': {'model': ['dm', 'dbow'],
   'methods': ['average', 'concat'],
   'num_threads': 4,
   'alpha': 0.025,
   'min_alpha': 0.0001,
   'num_max_epochs': [100, 200, 500, 1000],
   'min_count': 2,
   'window_size': [5, 10, 15]}}}

In [44]:
def load_csv(path, text_column, class_column): 
    df = pd.read_csv(os.path.join(directory,csv_file))
    df = df.dropna()
    texts = df[config['text_column']]
    classes = df[config['class_column']]
    return texts, classes 

In [45]:
def build_name(name, representation_type, config): 
    final_name = f'{name}_{representation_type}'
    for item in config.items(): 
        final_name += f'_{item[0]}={item[1]}'
    
    return final_name

In [46]:
def build_and_save_representation(config, rep_builder, name_builder ,parameters, dataset_name, non_sparse_format): 
    representation = rep_builder.build_representation(texts,classes)
    representation_name = build_name(dataset_name, name_builder, parameters)
    if config['save-arff'] == True: 
        representation.save_arff(representation_name, os.path.join(config['output_directory'], representation_name + '.arff'),                                                          non_sparse_format = non_sparse_format)
    if config['save-binary'] == True: 
        save_representation(representation, os.path.join(config['output_directory'], representation_name + '.rep'))

In [89]:
#Processing the JSON

#Getting the directory of the csvs and listing the csvs 
text_preprocessor = TextPreprocessor(**config['pre-processing_steps'])
directory = config['csvs_diretory']
for csv_file in sorted(os.listdir(directory)):
    dataset_name = csv_file[:csv_file.rindex('.')]
    print('=============================================')
    print('=============================================')
    print('Dataset: ', dataset_name)
    
    # Loading the CSVs and getting the column of the texts and the classes
    texts, classes = load_csv(os.path.join(directory,csv_file), config['text_column'], config['class_column'])

    #Pre-prossing texts
    if config['pre-processing'] == True: 
        print('Preprocessing text collection')
        texts = texts.apply(text_preprocessor.text_cleaner)
    
    #Processing sparse representations
    if config['sparse_representation']['use'] == True: 
        print('=============================================')
        print('Sparse Representation')
        for ngram in config['sparse_representation']['n-grams']: 
            print('N-gram: ', ngram)
            for term_weight in config['sparse_representation']['term-weights']: 
                print('Term-weight: ', term_weight)
                parameters = {'term-weight' : term_weight, 'n-grams' : ngram}
                mySparseVSM = MySparseVSM(weight=term_weight, n_grams=ngram)
                build_and_save_representation(config, mySparseVSM, 'SparseVSM' ,parameters, dataset_name, True)
               
    #Processing low-dimensional representations
    if config['low-dimension_representation']['use'] == True:
        for type_repr in config['low-dimension_representation']['types']: 
            if type_repr == 'doc2vec': 
                print('=============================================')
                print('Doc2Vec')
                for model in config['low-dimension_representation']['doc2vec_config']['models']:
                    print('Model:', model)
                    for method in config['low-dimension_representation']['doc2vec_config']['methods']: 
                        print('Method:', method)
                        for num_max_epoch in config['low-dimension_representation']['doc2vec_config']['num_max_epochs']:
                            print('Num. Max Epochs:', num_max_epoch)
                            for window_size in config['low-dimension_representation']['doc2vec_config']['window_sizes']:
                                print('Window Size:', window_size)
                                for num_dimensions in config['low-dimension_representation']['doc2vec_config']['num_dimensions']:
                                    print('Num. Dimensions:', num_dimensions)
                                    parameters = {'model' : model, 'method' : method, 'dim_size': num_dimensions,
                                                  'num_max_epochs' : num_max_epoch, 'window_size' : window_size, 
                                                  'num_threads' : config['low-dimension_representation']['doc2vec_config']['num_threads'],
                                                  'min_count' : config['low-dimension_representation']['doc2vec_config']['min_count'],
                                                  'alpha' : config['low-dimension_representation']['doc2vec_config']['alpha'],
                                                  'min_alpha' : config['low-dimension_representation']['doc2vec_config']['min_alpha']
                                                  }
                
                                    myDoc2Vec = MyDoc2Vec(**parameters)
                                    build_and_save_representation(config, myDoc2Vec, 'Doc2Vec', parameters, dataset_name, False)
            elif type_repr == 'word2vec': 
                pass
            else: 
                raise ValueError('Unsuported low dimension representation type. Please, use types = {"doc2vec","word2vec"}.')
            
             

print('Process Concluded!!')
                

Dataset:  webkb-parsed
Preprocessing text collection
Doc2Vec
Model: dm
Method: average
Num. Max Epochs: 100
Window Size: 5
Num. Dimensions: 100
Epochs Model:  100
Num. Dimensions: 200
Epochs Model:  100
Window Size: 10
Num. Dimensions: 100
Epochs Model:  100
Num. Dimensions: 200
Epochs Model:  100
Method: concat
Num. Max Epochs: 100
Window Size: 5
Num. Dimensions: 100
Epochs Model:  100
Num. Dimensions: 200
Epochs Model:  100
Window Size: 10
Num. Dimensions: 100
Epochs Model:  100
Num. Dimensions: 200
Epochs Model:  100
Model: both
Method: average
Num. Max Epochs: 100
Window Size: 5
Num. Dimensions: 100
Num. Dimensions: 200
Window Size: 10
Num. Dimensions: 100
Num. Dimensions: 200
Method: concat
Num. Max Epochs: 100
Window Size: 5
Num. Dimensions: 100
Num. Dimensions: 200
Window Size: 10
Num. Dimensions: 100
Num. Dimensions: 200
Process Concluded!!


# Avaliando o resultado com o Doc2Vec

In [227]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [401]:
clfLR = LogisticRegression()
clfMNB = MultinomialNB()

In [402]:
#train_X, test_X, train_y, test_y = train_test_split(representation.text_vectors, representation.class_vectors, test_size=0.20)
train_X, test_X, train_y, test_y = train_test_split(representation.text_vectors, representation.class_vectors, test_size=0.20)
clfLR.fit(train_X, train_y)
clfLR.score(test_X,test_y)

0.8

In [91]:
model_w2v = gensim.models.FastText(list_tokens_texts,min_count=5,window=5, size=200)

In [92]:
model_w2v

<gensim.models.fasttext.FastText at 0x7fb89ad3fd10>

In [118]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [272]:
doc2vecrepr = []
for i in range(model.corpus_count): 
    doc2vecrepr.append(model.docvecs[str(i)])

In [273]:
#train_X, test_X, train_y, test_y = train_test_split(representation.text_vectors, representation.class_vectors, test_size=0.20)
train_X, test_X, train_y, test_y = train_test_split(doc2vecrepr, representation.class_vectors, test_size=0.20)
clfLR.fit(train_X, train_y)
clfLR.score(test_X,test_y)

0.75

In [288]:
import spacy

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

sentence = "The striped bats are hanging on their feet for best"

# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(sentence)

# Extract the lemma for each token and join','.join(df['class'].unique())
" ".join([token.lemma_ for token in doc])


'the strip bat be hang on -PRON- foot for good'