<h1 align="center"> Experimento PiLN - Algoritmo Distributed Bag of Words Doc2vec</h1>

---



In [None]:
from google.colab import drive
import gensim
from gensim.models import Doc2Vec
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics._classification import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
import joblib

In [None]:
# Mount Google Drive at /content/drive; every dataset and saved-model path
# used further down in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



Leitura dos arquivos contendo os dados, métodos auxiliares para gerar o split de todo o conjunto e escrita do arquivo de submissão.

In [None]:
def read_corpus(train_filepath, test_filepath):
    """Load the labelled training and test CSV files.

    Both files are parsed as comma-separated and must expose a ``text``
    column (inputs) and a ``prediction`` column (labels).

    Args:
        train_filepath: Path of the training CSV.
        test_filepath: Path of the test CSV.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of plain Python lists.
    """
    train_frame = pd.read_csv(train_filepath, sep=',')
    test_frame = pd.read_csv(test_filepath, sep=',')

    # list(...) over .values keeps the element types the column array yields.
    X_train = list(train_frame['text'].values)
    y_train = list(train_frame['prediction'].values)

    X_test = list(test_frame['text'].values)
    y_test = list(test_frame['prediction'].values)

    return X_train, X_test, y_train, y_test

def split_data_set(X, y, size):
    """Split ``X``/``y`` into train and test partitions, stratified on ``y``.

    The split uses a fixed ``random_state`` of 42, so repeated calls with the
    same inputs give the same partition.

    Args:
        X: Samples to split.
        y: Labels aligned with ``X``; also used as the stratification target.
        size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=size,
        stratify=y,
        random_state=42,
    )
    # train_test_split hands back a list; callers receive a tuple.
    return tuple(partitions)


def prepare_corpus(tweets: list):
    """Lazily turn raw texts into gensim ``TaggedDocument`` objects.

    Each text is tokenised with ``gensim.utils.simple_preprocess`` and tagged
    with its zero-based position in ``tweets``.

    Args:
        tweets: Raw text strings, one per document.

    Yields:
        One ``TaggedDocument(tokens, [position])`` per input text, in order.
    """
    position = 0
    for text in tweets:
        words = gensim.utils.simple_preprocess(text)
        yield gensim.models.doc2vec.TaggedDocument(words, [position])
        position += 1

def load_train_data(filepath):
    """Read a comma-separated training file into text and label lists.

    Args:
        filepath: Path of a CSV with ``text`` and ``prediction`` columns.

    Returns:
        A tuple ``(X_train, y_train)`` of plain Python lists.
    """
    frame = pd.read_csv(filepath, sep=',')
    texts = list(frame['text'].values)
    labels = list(frame['prediction'].values)
    return texts, labels

def load_test_data(filepath):
    """Read a tab-separated, unlabelled test file.

    Args:
        filepath: Path of a TSV with ``text`` and ``id`` columns.

    Returns:
        A tuple ``(X_test, id_test)`` of plain Python lists: the texts and the
        identifiers read from the ``id`` column, in file order.
    """
    frame = pd.read_csv(filepath, sep='\t')
    texts = list(frame['text'].values)
    identifiers = list(frame['id'].values)
    return texts, identifiers

def write_submission_data(pred, id_test_data, file_name):
    """Write predictions as a tab-separated submission file.

    Identifiers and predictions are paired positionally with ``zip``, so the
    output has as many rows as the shorter of the two sequences.

    Args:
        pred: Predicted labels.
        id_test_data: Document identifiers aligned with ``pred``.
        file_name: Destination path; written as UTF-8 with an
            ``Id``/``Prediction`` header and no index column.
    """
    rows = [(doc_id, label) for doc_id, label in zip(id_test_data, pred)]
    submission = pd.DataFrame(rows, columns=['Id', 'Prediction'])
    submission.to_csv(file_name, sep='\t', encoding='utf-8', index=False)


Métodos para criação, carregamento e inferência das embeddings a serem utilizadas na classificação

In [None]:
def train_doc2vec_embeddings(vec_size, option, data, filepath=None):
    """Train a Doc2Vec model on raw texts and optionally persist it.

    The texts are tokenised and tagged by ``prepare_corpus`` before training.
    The remaining hyper-parameters (window=4, min_count=1, epochs=300,
    sample=1e-4, workers=5) are fixed here.

    Args:
        vec_size: Dimensionality of the document vectors (``vector_size``).
        option: Value for Doc2Vec's ``dm`` flag. This notebook passes 0
            (Distributed Bag of Words); 1 selects the distributed-memory
            variant.
        data: Raw text strings, one per document.
        filepath: Where to save the trained model. When ``None`` the model is
            only returned and nothing is written to disk.

    Returns:
        The trained ``Doc2Vec`` model, so callers can use it directly without
        a save/load round trip.
    """
    documents = list(prepare_corpus(data))
    model = Doc2Vec(vector_size=vec_size, dm=option, window=4, min_count=1,
                    epochs=300, sample=1e-4, workers=5)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    if filepath is not None:
        model.save(filepath)

    return model

def get_d2v_model(filepath):
    """Load a Doc2Vec model previously written with ``Doc2Vec.save``.

    Args:
        filepath: Path of the saved model.

    Returns:
        The loaded ``Doc2Vec`` instance.
    """
    return Doc2Vec.load(filepath)

def infer_d2v_embeddings(d2v_model, X):
    """Infer one Doc2Vec vector per text.

    A new list is built and ``X`` is left untouched, so the caller keeps the
    raw texts and re-running a cell that calls this function on them stays
    valid. Any iterable of strings is accepted, not only index-assignable
    lists.

    Args:
        d2v_model: Trained Doc2Vec model exposing ``infer_vector``.
        X: Iterable of raw text strings.

    Returns:
        A list with the inferred vector of every text, in input order.
    """
    return [
        d2v_model.infer_vector(gensim.utils.simple_preprocess(text))
        for text in X
    ]

Métodos associados ao modelo classificador

In [None]:
def save_classifier_model(X_train, y_train, filepath):
    """Fit an ``MLPClassifier`` and persist it with joblib.

    The classifier is created with ``random_state=1`` and ``max_iter=300``.

    Args:
        X_train: Training feature vectors.
        y_train: Training labels aligned with ``X_train``.
        filepath: Destination path for the serialised classifier.
    """
    # fit() returns the estimator itself, so the fitted model can be dumped directly.
    fitted = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
    joblib.dump(fitted, filepath)

def load_classifier_model(filepath):
    """Load a classifier previously stored with ``joblib.dump``.

    Args:
        filepath: Path of the serialised classifier.

    Returns:
        The deserialised classifier object.
    """
    # NOTE: joblib.load unpickles the file; only point it at files you trust.
    return joblib.load(filepath)
    
def run_classifier(classifier, X_train, X_test, y_train, y_test):
    """Fit ``classifier`` on the training data and score it on the test data.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``; it is fitted
            in place.
        X_train: Training feature vectors.
        X_test: Test feature vectors.
        y_train: Training labels.
        y_test: Ground-truth test labels.

    Returns:
        A tuple ``(accuracy, balanced_accuracy, classification_report)``
        computed from the predictions on ``X_test``.
    """
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    return (
        accuracy_score(y_test, predicted),
        balanced_accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
    )


def Ironia(d2v_model, cls_model, sentence):
    """Classify a single sentence with the Doc2Vec + classifier pipeline.

    Args:
        d2v_model: Trained Doc2Vec model used to embed the sentence.
        cls_model: Fitted classifier exposing ``predict``.
        sentence: Raw text to classify.

    Returns:
        Whatever ``cls_model.predict`` returns for a one-row batch holding
        the sentence's inferred vector.
    """
    tokens = gensim.utils.simple_preprocess(sentence)
    embedding = d2v_model.infer_vector(tokens)
    # predict() expects a batch, hence the single-element list.
    return cls_model.predict([embedding])

<h1 align="center"> <b>Modelos</b> </h1>



<h2 align="center"> <b><i>Corpus</i> Twitter</b> </h2>

In [None]:
# Train and persist the Twitter pipeline: Doc2Vec embeddings first, then the
# classifier fitted on the inferred training vectors.
vec_size = 300
# model_option is forwarded as Doc2Vec's `dm` flag: 0 = Distributed Bag of
# Words (used in this experiment), 1 = distributed memory (DM).
model_option = 0

X_train, y_train = load_train_data('/content/drive/MyDrive/Projects/IDPT2021/data/balanceado/70-30/train_tweets_final.csv')

d2v_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/twitter_doc2vec.model'
classifier_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/twitter_classifier.model'

# Trains on the raw texts and saves the model to d2v_filepath.
train_doc2vec_embeddings(vec_size, model_option, X_train, d2v_filepath)

# Reload the model that was just saved, then rebind X_train from raw texts to
# their inferred vectors. From here on X_train no longer holds strings, so
# re-running this cell requires reloading the data (the cell does that above).
d2v_model = get_d2v_model(d2v_filepath)
X_train = infer_d2v_embeddings(d2v_model, X_train)

# Fits the classifier on the vectors and writes it to classifier_filepath.
save_classifier_model(X_train, y_train, classifier_filepath)

In [None]:
# Smoke test: reload the saved Twitter models and classify one sentence.
d2v_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/twitter_doc2vec.model'
classifier_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/twitter_classifier.model'

cls_model = load_classifier_model(classifier_filepath)
d2v_model = get_d2v_model(d2v_filepath)

# Ironia returns the classifier's prediction for a one-row batch.
pred = Ironia(d2v_model, cls_model, 'Proposta que obriga comércio a informarpreços em braile é aprovada  economia')
print(pred)

[0]


In [None]:
# Build the Twitter submission file from the models saved by the training cell.
train_fp = '/content/drive/MyDrive/Projects/IDPT2021/data/balanceado/70-30/train_tweets_final.csv'
test_fp = '/content/drive/MyDrive/Projects/IDPT2021/data/Test/test_tweets.csv'

# Same artifacts the Twitter training cell writes; the classifier was fitted on
# vectors from this exact Doc2Vec model, so the two must be loaded as a pair.
d2v_fp = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/twitter_doc2vec.model'

classifier_fp = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/twitter_classifier.model'

output_filepath = '/content/drive/MyDrive/Projects/IDPT2021/data/piln_output_tweets.csv'

# Load explicitly instead of relying on variables left over from earlier cells.
d2v_model = get_d2v_model(d2v_fp)
cls_model = load_classifier_model(classifier_fp)

X_test, id_test = load_test_data(test_fp)

# Ironia() classifies a single sentence (d2v_model, cls_model, sentence), so
# the whole test set is embedded and predicted as one batch instead.
X_test_vectors = infer_d2v_embeddings(d2v_model, X_test)
pred = cls_model.predict(X_test_vectors)

write_submission_data(pred, id_test, output_filepath)

# Evaluation is disabled: load_test_data returns only texts and ids, so there
# is no y_test to score against. Kept for reference for a labelled test set:
#
# acc_score, ballanced_acc_scores, report = run_classifier(classifier, X_train, X_test, y_train, y_test)
#
# print('======Relatório========')
# print(report)
# print("Acuracia == " + str(acc_score))
# print("Acuracia balanceada == " + str(ballanced_acc_scores))

<h2 align="center"> <b><i>Corpus</i> Notícias</b> </h2>

In [None]:
# Train and persist the News pipeline: Doc2Vec embeddings first, then the
# classifier fitted on the inferred training vectors.
vec_size = 300
# model_option is forwarded as Doc2Vec's `dm` flag: 0 = Distributed Bag of
# Words (used in this experiment), 1 = distributed memory (DM).
model_option = 0

X_train, y_train = load_train_data('/content/drive/MyDrive/Projects/IDPT2021/data/70-30/train_news_final.csv')

d2v_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/news_doc2vec.model'
classifier_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/news_classifier.model'

# Trains on the raw texts and saves the model to d2v_filepath.
train_doc2vec_embeddings(vec_size, model_option, X_train, d2v_filepath)

# Reload the model that was just saved, then rebind X_train from raw texts to
# their inferred vectors. From here on X_train no longer holds strings, so
# re-running this cell requires reloading the data (the cell does that above).
d2v_model = get_d2v_model(d2v_filepath)
X_train = infer_d2v_embeddings(d2v_model, X_train)

# Fits the classifier on the vectors and writes it to classifier_filepath.
save_classifier_model(X_train, y_train, classifier_filepath)

In [None]:
# Smoke test: reload the saved News models and classify one news excerpt.
d2v_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/news_doc2vec.model'
classifier_filepath = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/news_classifier.model'

cls_model = load_classifier_model(classifier_filepath)
d2v_model = get_d2v_model(d2v_filepath)

# Ironia returns the classifier's prediction for a one-row batch.
pred = Ironia(d2v_model, cls_model, 'O presidente Jair Bolsonaro, que, na posse do ministro da Justiça, André Mendonça,disse que tem com o presidente do STJ, João Otávio de Noronha, “amor à primeira vista”, deve um favor. Este mandou transferir o guarda-livros da famiglia Bolsonaro da prisão para casa. E, na mesma canetada, dada em plantão no recesso do Judiciário, determinou idêntico destino à mulher dele, Márcia Aguiar, embora ela ainda estivesse, então, foragida. No habeas corpus, a defesa de Queiroz pediu a conversão da prisão preventiva em domiciliar. Os advogados citaram o estado de saúde do amigo do presidente e  o contexto de pandemia e criticaram fundamentos da medida autorizada pela Justiça. Noronha decidiu estender a prisão domiciliar para a mulher dele para que ela possa lhe dispensar as atenções necessárias”. Não é fofo?')
print(pred)

[0]


In [None]:
# Build the News submission file: retrain Doc2Vec and the classifier on the
# training set, then predict the unlabelled test set.
train_fp = '/content/drive/MyDrive/Projects/IDPT2021/data/70-30/train_news_final.csv'
test_fp = '/content/drive/MyDrive/Projects/IDPT2021/data/Test/test_news.csv'

# Dedicated path for the model retrained in this cell. Saving over
# news_doc2vec.model would leave the stored news_classifier.model paired with
# a Doc2Vec model it was not fitted against.
d2v_fp = '/content/drive/MyDrive/Projects/IDPT2021/saved_models/news_doc2vec_submission.model'

output_filepath = '/content/drive/MyDrive/Projects/IDPT2021/data/piln_output_news.csv'

vec_size = 300
# model_option is forwarded as Doc2Vec's `dm` flag: 0 = Distributed Bag of
# Words (used in this experiment), 1 = distributed memory (DM).
model_option = 0

X_train, y_train = load_train_data(train_fp)
X_test, id_test = load_test_data(test_fp)

# Pass the save path explicitly and reload the model from it, mirroring the
# training cells; this does not rely on the function's return value.
train_doc2vec_embeddings(vec_size, model_option, X_train, d2v_fp)
d2v_model = get_d2v_model(d2v_fp)

X_train_vectors = infer_d2v_embeddings(d2v_model, X_train)
X_test_vectors = infer_d2v_embeddings(d2v_model, X_test)

classifier = MLPClassifier(random_state=1, max_iter=300)

classifier.fit(X_train_vectors, y_train)

pred = classifier.predict(X_test_vectors)

write_submission_data(pred, id_test, output_filepath)

# Evaluation is disabled: load_test_data returns only texts and ids, so there
# is no y_test to score against. Kept for reference for a labelled test set:
#
# acc_score, ballanced_acc_scores, report = run_classifier(classifier, X_train, X_test, y_train, y_test)
#
# print('======Relatório========')
# print(report)
# print("Acuracia == " + str(acc_score))
# print("Acuracia balanceada == " + str(ballanced_acc_scores))