In [28]:
from pathlib import Path

from allennlp.modules.elmo import Elmo, batch_to_ids
from allennlp.commands.elmo import ElmoEmbedder
from allennlp.models.archival import load_archive

from scipy.spatial.distance import cosine
from scipy.spatial.distance import cdist

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, balanced_accuracy_score

import pickle

In [2]:
# Local paths of the ELMo options (JSON) and weights (HDF5) files that are
# handed to ElmoEmbedder in the next cell.
# NOTE(review): absolute, machine-specific paths — adjust before re-running elsewhere.
options_file = "/media/discoD/models/elmo/options.json"
weight_file = "/media/discoD/models/elmo/elmo_pt_weights.hdf5"

In [3]:
# Build the embedder from the local options/weights files. The third argument
# is spelled out by keyword so the bare 0 is self-describing (device index 0).
elmo_embedder = ElmoEmbedder(
    options_file=options_file,
    weight_file=weight_file,
    cuda_device=0,
)

In [4]:
# CSV inputs: the training file is read into train_df and the dev file into
# dev_df by the following cells.
# NOTE(review): absolute, machine-specific paths — adjust before re-running elsewhere.
df_train_file = '/media/discoD/Mestrado/FASAM/train_text_label.csv'
df_dev_file = '/media/discoD/Mestrado/FASAM/dev_text.csv'

In [5]:
# Load the training CSV and display it. The rendered frame has 'text' and
# 'label' columns plus an 'Unnamed: 0' column read from the file.
train_df = pd.read_csv(df_train_file)
train_df

Unnamed: 0,text,label
0,Casa da Barra Funda tem clima roceiro e receit...,comida
1,Professores de SP decidem manter greve; grupo ...,educacao
2,"Em segunda edição, concurso paga R$ 35 mil par...",empreendedorsocial
3,Usar maconha por anos não faz tão mal para a s...,equilibrioesaude
4,Baleia-azul percorre 5.200 km e revela a cient...,ciencia
5,Líderes inovadores se reunem em rede para comp...,empreendedorsocial
6,"Conheça Pandora, atração milionária da Disney ...",turismo
7,Fiesp organiza quarta edição de maratona hacke...,empreendedorsocial
8,Praia do Forte mistura natureza com resorts es...,turismo
9,App de agência avisa quando visto e passaporte...,turismo


In [6]:
# Load the dev CSV and display it. The rendered frame has a 'text' column
# (no 'label') plus an 'Unnamed: 0' column read from the file.
dev_df = pd.read_csv(df_dev_file)
dev_df

Unnamed: 0,text
0,"Vitrine de Dilma, Pronatec terá orçamento 65% ..."
1,"Por direitos autorais e publicidade, 'youtuber..."
2,Rótulos de alimentos terão que alertar sobre l...
3,Sociedade britânica de compositores processa S...
4,"Por Fies, aluna madruga na porta da FMU, mas s..."
5,Cientistas aguardam nascimento raro de 'dragõe...
6,Aplicativo mostra quem está no mesmo voo que v...
7,Pesquisas indicam que sono na direção pode ser...
8,Brasil lançará missão à Lua até 2020 para estu...
9,Cientistas testam vacina contra colesterol e d...


In [24]:
def get_encoded_data(dataframe, pickle_out):
    """Turn every text of ``dataframe`` into one ELMo-based vector and pickle the result.

    Each entry of ``dataframe['text']`` is tokenized with ``word_tokenize`` and
    passed to the module-level ``elmo_embedder``. For every returned embedding,
    the last entry along its first axis (``elmo[-1]``) is averaged over axis 0
    to obtain a single vector per text.

    Args:
        dataframe: object indexable by ``'text'`` that yields the raw texts
            (the notebook passes pandas DataFrames).
        pickle_out: path of the file the list of vectors is pickled to; it is
            opened in ``"wb"`` mode, so an existing file is overwritten.

    Returns:
        list: one averaged vector per text, in input order.
    """
    sentences = [word_tokenize(text) for text in dataframe['text']]
    embeddings = elmo_embedder.embed_sentences(sentences, batch_size=2)
    encoded_data = []
    for elmo in embeddings:
        # assumes each embedding is (layers, tokens, dim), i.e. this averages the
        # last layer over tokens — TODO confirm against the allennlp version in use
        encoded_data.append(np.mean(elmo[-1], axis=0))
        if len(encoded_data) % 50 == 0:
            print('Processed %d texts' % len(encoded_data))
    print('Finished processing %d texts' % len(encoded_data))
    # Context manager guarantees the handle is closed even if pickling fails.
    with open(pickle_out, "wb") as pickle_data:
        pickle.dump(encoded_data, pickle_data)
    return encoded_data

def write_predictions(predictions, out_path):
    """Write predicted labels to a two-column ``id,category`` CSV file.

    The file starts with the header ``id,category`` followed by one row per
    prediction, where ``id`` is the zero-based position of the prediction in
    ``predictions``. Progress is printed every 100 rows.

    Args:
        predictions: iterable of predicted labels. Labels are converted with
            ``str()``, so non-string labels (e.g. integers) are accepted.
        out_path: destination file path; opened in ``'w'`` mode with UTF-8
            encoding, so an existing file is overwritten.

    Returns:
        None. The result is the written file plus the progress messages.
    """
    count = 0

    # The context manager closes the file; no explicit close() is needed.
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions to %s' % out_path)
        out_file.write('id,category\n')
        for idx, result in enumerate(predictions):
            # str(result) avoids a TypeError when labels are not plain strings.
            out_file.write(str(idx) + ',' + str(result) + '\n')
            count = idx + 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

In [None]:
# Encode the training texts with get_encoded_data and pickle the vectors to
# x_train.pickle.
x_train = get_encoded_data(train_df, "/media/discoD/Mestrado/FASAM/x_train.pickle")

In [21]:
# Encode the dev texts with get_encoded_data and pickle the vectors to
# x_dev.pickle; the function prints a progress line every 50 texts.
x_dev = get_encoded_data(dev_df, "/media/discoD/Mestrado/FASAM/x_dev.pickle")

Processed 50 texts
Processed 100 texts
Processed 150 texts
Processed 200 texts
Processed 250 texts
Processed 300 texts
Processed 350 texts
Processed 400 texts
Processed 450 texts
Processed 500 texts
Processed 550 texts
Processed 600 texts
Processed 650 texts
Processed 700 texts
Processed 750 texts
Processed 800 texts
Processed 850 texts
Processed 900 texts
Processed 950 texts
Processed 1000 texts
Processed 1050 texts
Processed 1100 texts
Processed 1150 texts
Processed 1200 texts
Processed 1250 texts
Processed 1300 texts
Processed 1350 texts
Processed 1400 texts
Processed 1450 texts
Processed 1500 texts
Processed 1550 texts
Processed 1600 texts
Processed 1650 texts
Processed 1700 texts
Processed 1750 texts
Processed 1800 texts
Processed 1850 texts
Processed 1900 texts
Processed 1950 texts
Processed 2000 texts
Processed 2050 texts
Processed 2100 texts
Processed 2150 texts
Processed 2200 texts
Processed 2250 texts
Processed 2300 texts
Processed 2350 texts
Processed 2400 texts
Processed 24

In [7]:
# Tokenize every training text; the result is one token list per text.
sentences = list(map(word_tokenize, train_df['text']))

In [8]:
# Embed the tokenized training texts, two sentences per batch.
# NOTE(review): embed_sentences appears to return a lazy iterator in allennlp —
# confirm; if so, it can be consumed only once (by the next cell).
embeddings = elmo_embedder.embed_sentences(sentences, batch_size=2)

In [9]:
# Collapse each text's embedding into a single vector: take the last entry
# along the first axis and average it over axis 0. Then report how many
# vectors were built and the shape of the first one.
x_train = [layers[-1].mean(axis=0) for layers in embeddings]
print(len(x_train), x_train[0].shape, sep='\n')

7894
(1024,)


In [17]:
# Persist the training vectors so the ELMo pass does not have to be repeated.
# The original open() call had no file name, which is a SyntaxError; the path
# below is the x_train.pickle location already used for the training vectors
# in this notebook. NOTE(review): confirm this is the intended destination.
with open("/media/discoD/Mestrado/FASAM/x_train.pickle", "wb") as pickle_x_train:
    pickle.dump(x_train, pickle_x_train)

In [11]:
# Fit a logistic-regression classifier with default settings on the ELMo
# vectors; the fitted estimator's repr is the cell output.
lreg = LogisticRegression()
lreg.fit(X=x_train, y=train_df['label'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
# Micro-averaged F1 of the logistic regression on the same data it was
# trained on (a training-set score, not a held-out estimate).
f1_score(
    y_true=train_df['label'],
    y_pred=lreg.predict(x_train),
    average='micro',
)

0.9828984038510261

In [25]:
# Predict labels for the dev vectors with the logistic regression and save
# them as an id,category CSV.
write_predictions(lreg.predict(x_dev), '/media/discoD/Mestrado/FASAM/submissions_elmo_logistic_regression.csv')

Saving predictions to /media/discoD/Mestrado/FASAM/submissions_elmo_logistic_regression.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 se

In [29]:
# Fit a linear SVM on the ELMo vectors (fit returns the estimator itself),
# then score it on the same training data it was fitted on.
linear_svc = LinearSVC(tol=1e-6, max_iter=3000, random_state=0).fit(
    x_train, train_df['label']
)
predicted_linear_svc = linear_svc.predict(x_train)
balanced_accuracy_score(y_true=train_df['label'], y_pred=predicted_linear_svc)



0.9985617706465761

In [30]:
# Predict labels for the dev vectors with the linear SVM and save them as an
# id,category CSV.
write_predictions(linear_svc.predict(x_dev), '/media/discoD/Mestrado/FASAM/submissions_elmo_linear_svc.csv')

Saving predictions to /media/discoD/Mestrado/FASAM/submissions_elmo_linear_svc.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
P