In [1]:
import pandas as pd
from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from allennlp.predictors.predictor import Predictor
from allennlp.models.archival import load_archive
from allennlp.common.util import lazy_groups_of
from typing import List, Iterator
from allennlp.data import Instance
from pathlib import Path
import fire

In [10]:
def predict(document_path, model_path, out_path, batch_size=4):
    def get_instance_data(document_path) -> Iterator[Instance]:
        yield from predictor._dataset_reader.read(Path(document_path))

    def predict_instances(batch_data: List[Instance]) -> Iterator[str]:
        yield predictor.predict_batch_instance(batch_data)

    # model_path = 'model_title/model.tar.gz'

    print('Loading model from %s' % model_path)
    archive = load_archive(archive_file=model_path, cuda_device=0)
    predictor = Predictor.from_archive(archive, 'text_classifier')

    count = 0

    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Loading batches from %s for prediction' % document_path)
        out_file.write('Id,Category\n')
        idx = 0
        for batch in lazy_groups_of(get_instance_data(document_path), batch_size):
            for items, results in zip(batch, predict_instances(batch)):
                for item, result in zip(items, results):
                    print(item)
                    print(result)
                    break
                    count += 1
                    predicted_label = result['label']
                    out_file.write(str(idx) + ',' + predicted_label + '\n')
                    idx += 1
                    if count % 100 == 0:
                        print('Predicted %d sentences' % count)
    out_file.close()
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

In [11]:
predict(document_path='test_submission.json', model_path='snli-roberta-full/model.tar.gz', out_path='submissions_snli_roberta_full.csv')

Loading model from snli-roberta-full/model.tar.gz
Loading batches from test_submission.json for prediction
tokens
{'logits': [-3.0299172401428223, 6.20608377456665, -2.782586097717285], 'probs': [9.744491399032995e-05, 0.9997777342796326, 0.00012478820281103253], 'token_ids': [0, 357, 8755, 4083, 3594, 30, 108865, 384, 296, 8543, 149322, 35, 1857, 58170, 709, 290, 3508, 41512, 622, 6911, 46323, 35, 341, 329, 88, 18, 265, 19, 81, 6683, 91, 25, 91, 2776, 48, 2], 'label': 'Negativo', 'tokens': ['<s>', 'Ġ@', 'JD', 'aniel', 'df', ':', 'ĠPedindo', 'Ġpara', 'Ġque', 'ĠMG', 'Ġreaja', '?', 'ĠRe', 'agir', 'Ġcontra', 'Ġo', 'Ġgovernador', 'Ġqueridinho', 'Ġdos', 'Ġeleitores', 'Ġmineiros', '?', 'Ġhttps', '://', 't', '.', 'co', '/', 'm', 'MC', 'w', '5', 'w', 'yy', 'L', '</s>']}
tokens
{'logits': [-3.73759388923645, 6.195363998413086, -2.2650222778320312], 'probs': [4.853536302107386e-05, 0.9997398257255554, 0.0002116352115990594], 'token_ids': [0, 357, 7934, 30, 47073, 484, 584, 8, 4071, 16, 28, 2405,

tokens
{'logits': [6.961755275726318, -2.352102041244507, -3.270143508911133], 'probs': [0.9998738765716553, 9.015472460305318e-05, 3.599878618842922e-05], 'token_ids': [0, 17317, 16, 8665, 59998, 8356, 16, 42017, 262, 10671, 356, 99610, 225, 341, 329, 88, 18, 265, 19, 13232, 59, 9919, 85, 22, 84, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Positivo', 'tokens': ['<s>', 'ĠdenÃºncia', ',', 'ĠPM', 'Ġapreende', 'Ġdrogas', ',', 'ĠmuniÃ§Ã£o', 'Ġe', 'Ġarmas', 'Ġem', 'ĠAraxÃ¡', 'Ġ', 'Ġhttps', '://', 't', '.', 'co', '/', 'Ox', 'W', 'HQ', 'q', '2', 'p', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-2.7311792373657227, -2.2480459213256836, 4.862624168395996], 'probs': [0.0005028984160162508, 0.000815271632745862, 0.9986818432807922], 'token_ids': [0, 357, 7934, 30, 2384, 48985, 20869, 356, 8543, 30, 30904, 12113, 410, 6057, 384, 2851, 584, 8, 532, 16, 22, 3774, 341, 329, 88, 18, 265, 19, 25, 69, 26, 1

tokens
{'logits': [-2.4507484436035156, -3.1436004638671875, 5.665140628814697], 'probs': [0.0002986206382047385, 0.00014935439685359597, 0.9995520710945129], 'token_ids': [0, 357, 298, 67, 392, 30, 2779, 511, 5, 4305, 263, 5510, 348, 6843, 7172, 323, 2755, 274, 7103, 274, 733, 16, 13078, 274, 6166, 18, 5289, 410, 339, 341, 329, 88, 18, 265, 19, 7873, 15930, 4314, 9452, 53, 341, 329, 2], 'label': 'Neutro', 'tokens': ['<s>', 'Ġ@', 'em', '_', 'com', ':', 'ĠBom', 'Ġdia', '!', 'ĠVeja', 'Ġa', 'Ġcapa', 'Ġda', 'ĠediÃ§Ã£o', 'Ġdigital', 'Ġdo', 'ĠEstado', 'Ġde', 'ĠMinas', 'Ġde', 'Ġhoje', ',', 'Ġ09', 'Ġde', 'Ġjaneiro', '.', 'ĠLeia', 'Ġmais', 'Ġno', 'Ġhttps', '://', 't', '.', 'co', '/', 'rap', 'Yr', 'fs', 'nz', 'Q', 'Ġhttps', '://', '</s>']}
tokens
{'logits': [-3.397783041000366, 6.318267822265625, -2.6040573120117188], 'probs': [6.029599535395391e-05, 0.9998063445091248, 0.00013335193216335028], 'token_ids': [0, 357, 20775, 48516, 30, 1996, 12433, 12341, 16, 1265, 274, 8543, 2656, 1507, 305, 1449

tokens
{'logits': [-3.4884257316589355, -2.5471584796905518, 5.983168125152588], 'probs': [7.698743866058066e-05, 0.00019733629596885294, 0.9997256398200989], 'token_ids': [0, 303, 7482, 9049, 348, 8365, 9002, 263, 4071, 16, 27, 7153, 274, 15260, 356, 10290, 836, 2755, 274, 7103, 341, 329, 88, 18, 265, 19, 5419, 26175, 24, 16521, 15288, 2, 1, 1, 1], 'label': 'Neutro', 'tokens': ['<s>', 'Ã¡', 'vit', 'Ġcomercial', 'Ġda', 'ĠAlemanha', 'Ġsobe', 'Ġa', 'Ġ21', ',', '7', 'ĠbilhÃµes', 'Ġde', 'Ġeuros', 'Ġem', 'Ġnovembro', 'Ġ-', 'ĠEstado', 'Ġde', 'ĠMinas', 'Ġhttps', '://', 't', '.', 'co', '/', 'cm', 'zD', '4', 'Hp', 'Cf', '</s>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-2.0905210971832275, -3.0997376441955566, 5.235811233520508], 'probs': [0.0006573923164978623, 0.00023962247360032052, 0.9991029500961304], 'token_ids': [0, 3515, 4812, 960, 63879, 274, 5931, 18253, 16, 308, 3137, 1059, 341, 329, 88, 18, 265, 19, 73, 9311, 9571, 55388, 45, 29, 341, 329, 88, 18, 265, 19, 85, 1409, 33224, 5850

tokens
{'logits': [6.8851494789123535, -2.0588603019714355, -3.630154609680176], 'probs': [0.9998424053192139, 0.0001304960751440376, 2.711395063670352e-05], 'token_ids': [0, 357, 8663, 34921, 30, 1200, 3437, 997, 16, 40200, 10952, 356, 7103, 11296, 324, 1827, 20631, 5378, 18, 366, 662, 16, 9192, 35, 341, 329, 88, 18, 265, 19, 29832, 7115, 61894, 48, 2, 1, 1], 'label': 'Positivo', 'tokens': ['<s>', 'Ġ@', 'felipe', 'fonte', ':', 'ĠCom', 'ĠtrÃªs', 'Ġanos', ',', 'ĠpresÃŃdio', 'Ġprivado', 'Ġem', 'ĠMinas', 'ĠGerais', 'ĠnÃ£o', 'Ġteve', 'Ġrebel', 'iÃµes', '.', 'ĠE', 'Ġagora', ',', 'ĠJosÃ©', '?', 'Ġhttps', '://', 't', '.', 'co', '/', 'zO', 'Gi', '877', 'L', '</s>', '<pad>', '<pad>']}
tokens
{'logits': [-3.7122135162353516, -1.3880335092544556, 5.070651531219482], 'probs': [0.00015307473950088024, 0.0015641641803085804, 0.998282790184021], 'token_ids': [0, 1922, 13162, 94822, 274, 31635, 356, 21754, 274, 6355, 274, 9333, 836, 13313, 836, 2755, 274, 7103, 341, 329, 88, 18, 265, 19, 75, 6457, 135

tokens
{'logits': [6.882328033447266, -2.9724655151367188, -2.5696325302124023], 'probs': [0.9998689889907837, 5.24880742887035e-05, 7.85251468187198e-05], 'token_ids': [0, 694, 29925, 350, 2875, 23340, 328, 5461, 10697, 274, 5734, 15551, 262, 64240, 356, 36628, 225, 341, 329, 88, 18, 265, 19, 89, 10702, 86, 22, 72, 7377, 24920, 350, 45191, 1966, 54745, 38198, 350, 33217, 112457, 2, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Positivo', 'tokens': ['<s>', 'tim', 'beta', 'Ġ#', 'globo', 'ĠJovem', 'ĠÃ©', 'Ġpreso', 'Ġsuspeito', 'Ġde', 'ĠviolÃªncia', 'ĠdomÃ©stica', 'Ġe', 'ĠcÃ¡rcere', 'Ġem', 'ĠUberlÃ¢ndia', 'Ġ', 'Ġhttps', '://', 't', '.', 'co', '/', 'u', 'NW', 'r', '2', 'd', 'DB', 'ZK', 'Ġ#', 'Opera', 'cao', 'Beta', 'Lab', 'Ġ#', 'Sigo', 'devol', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [6.586040019989014, -1.7618839740753174, -3.7140748500823975], 'probs': [0.999729573726654, 0.00023682366008870304, 3.3620144677115604e-05], 'token_ids': [0, 466

tokens
{'logits': [-3.4789512157440186, 6.289176940917969, -2.3598225116729736], 'probs': [5.723408321500756e-05, 0.9997674822807312, 0.00017526144802104682], 'token_ids': [0, 357, 7934, 30, 5300, 10395, 308, 2610, 274, 42550, 30, 3749, 274, 7103, 14078, 12875, 274, 584, 8, 3498, 16, 28, 3896, 308, 305, 144922, 341, 329, 88, 18, 265, 19, 1583, 85, 9987, 899, 2], 'label': 'Negativo', 'tokens': ['<s>', 'Ġ@', 'UOLNoticias', ':', 'ĠPol', 'Ãªmica', 'Ġcom', 'Ġfilho', 'Ġde', 'ĠPimentel', ':', 'ĠGoverno', 'Ġde', 'ĠMinas', 'ĠprevÃª', 'Ġgasto', 'Ġde', 'ĠR', '$', 'Ġ50', ',', '8', 'Ġmi', 'Ġcom', 'Ġh', 'elicÃ³pteros', 'Ġhttps', '://', 't', '.', 'co', '/', 'uu', 'q', 'DL', 'AA', '</s>']}
tokens
{'logits': [0.027622992172837257, -4.341817378997803, 4.448256015777588], 'probs': [0.011881910264492035, 0.0001504050160292536, 0.9879676699638367], 'token_ids': [0, 28176, 20635, 9326, 328, 12576, 836, 2755, 274, 7103, 836, 28176, 20635, 9326, 328, 12576, 2755, 274, 62634, 481, 341, 329, 88, 18, 265, 19, 13

tokens
{'logits': [-4.0288872718811035, 5.710243225097656, -1.6190474033355713], 'probs': [5.888967280043289e-05, 0.9992855191230774, 0.0006555701838806272], 'token_ids': [0, 357, 2501, 51016, 602, 30102, 30, 19757, 6421, 9091, 296, 1265, 274, 8543, 324, 408, 5253, 356, 961, 20463, 341, 329, 88, 18, 265, 19, 6285, 4983, 2964, 5805, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Negativo', 'tokens': ['<s>', 'Ġ@', 'lex', 'lut', 'hor', '2015', ':', 'ĠBB', 'Ġavisa', 'ĠJustiÃ§a', 'Ġque', 'Ġgoverno', 'Ġde', 'ĠMG', 'ĠnÃ£o', 'Ġtem', 'Ġrecursos', 'Ġem', 'Ġconta', 'Ġjudicial', 'Ġhttps', '://', 't', '.', 'co', '/', 'QP', 'cb', 'GA', 'PG', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-2.92389178276062, -2.8458518981933594, 5.626180648803711], 'probs': [0.00019345313194207847, 0.00020915493951179087, 0.9995974898338318], 'token_ids': [0, 865, 75139, 60658, 263, 3143, 348, 3300, 323, 122760, 1048, 1046, 136808, 2323, 58662, 1924, 

tokens
{'logits': [-2.6693613529205322, -3.1254658699035645, 5.939489364624023], 'probs': [0.00018242915393784642, 0.00011561407154658809, 0.9997019171714783], 'token_ids': [0, 357, 19641, 18834, 30, 584, 362, 3975, 93, 513, 7270, 6666, 323, 16855, 16, 14079, 16, 28948, 24601, 17, 4607, 16, 7795, 262, 26345, 203, 203, 4160, 434, 42759, 408, 296, 2594, 274, 2, 1, 1, 1, 1, 1], 'label': 'Neutro', 'tokens': ['<s>', 'Ġ@', 'ware', 'porter', ':', 'ĠR', 'it', 'hel', 'y', 'ĠjÃ¡', 'Ġrecebeu', 'Ġpropostas', 'Ġdo', 'ĠPalmeiras', ',', 'ĠCorinthians', ',', 'ĠAt', 'letico', '-', 'MG', ',', 'ĠInter', 'Ġe', 'ĠFluminense', 'Ċ', 'Ċ', 'Pra', 'Ġser', 'Ġconvocado', 'Ġtem', 'Ġque', 'Ġmudar', 'Ġde', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-2.6693613529205322, -3.1254658699035645, 5.939489364624023], 'probs': [0.00018242915393784642, 0.00011561407154658809, 0.9997019171714783], 'token_ids': [0, 357, 19641, 18834, 30, 584, 362, 3975, 93, 513, 7270, 6666, 323, 16855, 16, 14079, 1

tokens
{'logits': [6.873697757720947, -2.0573842525482178, -3.6488990783691406], 'probs': [0.9998408555984497, 0.00013219389074947685, 2.6916919523500837e-05], 'token_ids': [0, 357, 18013, 10169, 30, 1200, 3437, 997, 16, 40200, 10952, 356, 7103, 11296, 324, 1827, 20631, 5378, 30, 341, 329, 88, 18, 265, 19, 22, 84, 21, 69, 25, 3529, 56, 2], 'label': 'Positivo', 'tokens': ['<s>', 'Ġ@', 'Blogdo', 'BG', ':', 'ĠCom', 'ĠtrÃªs', 'Ġanos', ',', 'ĠpresÃŃdio', 'Ġprivado', 'Ġem', 'ĠMinas', 'ĠGerais', 'ĠnÃ£o', 'Ġteve', 'Ġrebel', 'iÃµes', ':', 'Ġhttps', '://', 't', '.', 'co', '/', '2', 'p', '1', 'a', '5', 'Vc', 'T', '</s>']}
tokens
{'logits': [6.589495658874512, -2.382939577102661, -3.210195779800415], 'probs': [0.9998177886009216, 0.0001268357882509008, 5.545861495193094e-05], 'token_ids': [0, 7103, 11296, 2554, 3749, 274, 7103, 11296, 67737, 22226, 9364, 744, 3027, 779, 440, 122914, 341, 329, 88, 18, 265, 19, 20995, 11609, 60, 9001, 72, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Positivo', 'to

tokens
{'logits': [6.7884521484375, -2.69478178024292, -2.7186548709869385], 'probs': [0.9998495578765869, 7.610589091200382e-05, 7.431058475049213e-05], 'token_ids': [0, 694, 29925, 350, 2875, 22245, 6803, 760, 89473, 2408, 18445, 356, 4891, 274, 17307, 356, 55792, 341, 329, 88, 18, 265, 19, 8587, 20, 54, 9247, 28377, 3736, 350, 45191, 1966, 54745, 38198, 350, 33217, 310, 2], 'label': 'Positivo', 'tokens': ['<s>', 'tim', 'beta', 'Ġ#', 'globo', 'ĠTrÃªs', 'Ġhomens', 'ĠsÃ£o', 'Ġdetidos', 'ĠapÃ³s', 'Ġroubo', 'Ġem', 'Ġloja', 'Ġde', 'Ġcelulares', 'Ġem', 'ĠIpatinga', 'Ġhttps', '://', 't', '.', 'co', '/', 'UC', '0', 'R', 'GN', 'cM', '65', 'Ġ#', 'Opera', 'cao', 'Beta', 'Lab', 'Ġ#', 'Sigo', 'de', '</s>']}
tokens
{'logits': [6.865391731262207, -2.345245122909546, -3.2647204399108887], 'probs': [0.9998601675033569, 9.995634900406003e-05, 3.9855447539594024e-05], 'token_ids': [0, 357, 80992, 75, 25041, 30, 357, 52, 118823, 25041, 836, 69570, 377, 23119, 274, 8356, 356, 390, 41667, 341, 329, 88, 18

tokens
{'logits': [-3.316335678100586, 6.2996039390563965, -2.5855460166931152], 'probs': [6.664403190370649e-05, 0.9997950196266174, 0.0001384010392939672], 'token_ids': [0, 1128, 731, 1822, 16, 1265, 274, 8543, 2656, 1507, 305, 144922, 274, 584, 8, 4071, 16, 28, 2405, 341, 329, 88, 18, 265, 19, 61, 8324, 1650, 55, 2261, 26, 2, 1, 1, 1, 1, 1, 1, 1], 'label': 'Negativo', 'tokens': ['<s>', 'Ġmeio', 'ĠÃł', 'Ġcrise', ',', 'Ġgoverno', 'Ġde', 'ĠMG', 'Ġcompra', 'Ġdois', 'Ġh', 'elicÃ³pteros', 'Ġde', 'ĠR', '$', 'Ġ21', ',', '8', 'ĠmilhÃµes', 'Ġhttps', '://', 't', '.', 'co', '/', 'Y', 'MT', 'En', 'S', 'll', '6', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [4.586225986480713, -3.092975378036499, -1.401479721069336], 'probs': [0.9970371723175049, 0.00046097420272417367, 0.0025019797030836344], 'token_ids': [0, 55973, 348, 8665, 12, 4607, 13, 136092, 290, 3508, 10986, 457, 92524, 341, 329, 88, 18, 265, 19, 70, 17362, 10660, 4961, 28, 6683, 341, 329, 88,

tokens
{'logits': [-2.0915591716766357, -2.457427978515625, 4.173025608062744], 'probs': [0.0018963934853672981, 0.0013153264299035072, 0.9967883825302124], 'token_ids': [0, 6630, 16, 1207, 274, 16170, 356, 9166, 323, 130039, 763, 491, 14436, 111682, 274, 80664, 836, 11296, 836, 2755, 274, 7103, 341, 329, 88, 18, 265, 19, 84, 6792, 62, 29, 16303, 3187, 2, 1, 1], 'label': 'Neutro', 'tokens': ['<s>', 'ĠPF', ',', 'Ġfalta', 'Ġde', 'ĠfiscalizaÃ§Ã£o', 'Ġem', 'Ġobra', 'Ġdo', 'ĠMove', 'Ġpode', 'Ġter', 'Ġcausado', 'Ġdesabamento', 'Ġde', 'Ġviaduto', 'Ġ-', 'ĠGerais', 'Ġ-', 'ĠEstado', 'Ġde', 'ĠMinas', 'Ġhttps', '://', 't', '.', 'co', '/', 'p', 'dn', 'Z', '9', 'Cd', 'Jo', '</s>', '<pad>', '<pad>']}
tokens
{'logits': [-1.451767086982727, -3.180058002471924, 4.768916606903076], 'probs': [0.0019832432735711336, 0.0003521994804032147, 0.9976645708084106], 'token_ids': [0, 357, 315, 392, 92, 30, 52530, 274, 108708, 333, 7541, 2881, 965, 52809, 419, 965, 58247, 4050, 350, 53302, 20958, 58155, 9966, 274, 

tokens
{'logits': [-1.6293361186981201, 4.126284599304199, -2.555342197418213], 'probs': [0.003151017241179943, 0.995600700378418, 0.0012482211459428072], 'token_ids': [0, 274, 7103, 328, 5933, 274, 11778, 262, 14271, 356, 7619, 274, 1736, 2212, 341, 329, 88, 18, 265, 19, 81, 29, 22770, 342, 2900, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Negativo', 'tokens': ['<s>', 'Ġde', 'ĠMinas', 'ĠÃ©', 'Ġchamado', 'Ġde', 'Ġvagabundo', 'Ġe', 'ĠladrÃ£o', 'Ġem', 'Ġshopping', 'Ġde', 'ĠSÃ£o', 'ĠPaulo', 'Ġhttps', '://', 't', '.', 'co', '/', 'm', '9', 'tA', 'ho', 'fu', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-2.7275753021240234, 6.279728412628174, -2.910098075866699], 'probs': [0.0001224842999363318, 0.9997754693031311, 0.00010204959835391492], 'token_ids': [0, 357, 48253, 67, 107892, 30, 3749, 274, 8543, 12918, 356, 22531, 2514, 687, 14353, 2

tokens
{'logits': [-3.3668549060821533, 6.254929542541504, -2.5381128787994385], 'probs': [6.62548336549662e-05, 0.9997820258140564, 0.00015175240696407855], 'token_ids': [0, 357, 90816, 3065, 2084, 30, 5525, 356, 12433, 12341, 16, 1265, 274, 8543, 2656, 410, 1507, 305, 144922, 18, 35048, 376, 32076, 125421, 341, 329, 88, 18, 265, 19, 15027, 3389, 86, 25, 42, 26, 62, 23, 1204, 2], 'label': 'Negativo', 'tokens': ['<s>', 'Ġ@', 'adilson', 'jr', '11', ':', 'ĠMesmo', 'Ġem', 'Ġcalamidade', 'Ġfinanceira', ',', 'Ġgoverno', 'Ġde', 'ĠMG', 'Ġcompra', 'Ġmais', 'Ġdois', 'Ġh', 'elicÃ³pteros', '.', 'ĠPapai', 'ĠP', 'ilan', 'trel', 'Ġhttps', '://', 't', '.', 'co', '/', 'Nd', '57', 'r', '5', 'F', '6', 'Z', '3', 'Ġvia', '</s>']}
tokens
{'logits': [-3.781372308731079, 5.898703098297119, -1.8586609363555908], 'probs': [6.24861495452933e-05, 0.9995101690292358, 0.00042737266630865633], 'token_ids': [0, 357, 90358, 30, 19757, 6421, 9091, 296, 1265, 274, 8543, 324, 408, 5253, 356, 961, 20463, 341, 329, 88, 18

tokens
{'logits': [6.493953704833984, -3.0069754123687744, -2.5437471866607666], 'probs': [0.9998064637184143, 7.476786413462833e-05, 0.0001188207752420567], 'token_ids': [0, 4660, 7357, 20604, 356, 3437, 6750, 11428, 323, 28484, 54469, 341, 329, 88, 18, 265, 19, 27096, 15983, 8976, 79, 24, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Positivo', 'tokens': ['<s>', 'Ġanuncia', 'Ġensino', 'Ġintegral', 'Ġem', 'ĠtrÃªs', 'Ġescolas', 'Ġestaduais', 'Ġdo', 'ĠTri', 'Ã¢ngulo', 'Ġhttps', '://', 't', '.', 'co', '/', 'hV', 'UK', 'yz', 'k', '4', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [6.975962162017822, -2.647507429122925, -2.907437562942505], 'probs': [0.9998828172683716, 6.614994345000014e-05, 5.1008555601583794e-05], 'token_ids': [0, 274, 4388, 731, 5931, 18253, 408, 2640, 356, 131704, 33122, 11142, 30, 26490, 18447, 21851, 262

tokens
{'logits': [-3.0752058029174805, 6.355701446533203, -2.9388160705566406], 'probs': [8.019259985303506e-05, 0.9998278617858887, 9.191101707983762e-05], 'token_ids': [0, 357, 30810, 30, 3217, 693, 305, 144922, 451, 37, 809, 274, 3469, 262, 263, 22563, 668, 465, 2877, 760, 4985, 435, 263, 12433, 12341, 435, 5033, 290, 1991, 18, 203, 203, 83924, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Negativo', 'tokens': ['<s>', 'Ġ@', 'AnaPaulaVolei', ':', 'ĠMais', 'Ġ2', 'Ġh', 'elicÃ³pteros', '!!', 'A', 'Ġcara', 'Ġde', 'Ġpau', 'Ġe', 'Ġa', 'Ġcanalhice', 'Ġainda', 'ĠsÃ³', 'ĠÃ±', 'ĠsÃ£o', 'Ġmaiores', 'Ġq', 'Ġa', 'Ġcalamidade', 'Ġfinanceira', 'Ġq', 'Ġvive', 'Ġo', 'Ġestado', '.', 'Ċ', 'Ċ', 'Ġhtt', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [6.409249305725098, -3.0193934440612793, -2.482489585876465], 'probs': [0.999782145023346, 8.037075895117596e-05, 0.00013749036588706076], 'token_ids': [0

tokens
{'logits': [7.013844013214111, -2.636826515197754, -3.129685163497925], 'probs': [0.9998962879180908, 6.437567935790867e-05, 3.932568506570533e-05], 'token_ids': [0, 357, 35364, 30, 7112, 5893, 356, 7103, 23314, 911, 5827, 262, 3093, 8615, 5129, 9, 622, 8585, 18, 203, 891, 329, 88, 18, 265, 19, 4873, 2887, 92, 7497, 11461, 29, 341, 329, 88, 18, 265, 19, 21, 315, 5620, 29, 60, 2498, 2], 'label': 'Positivo', 'tokens': ['<s>', 'Ġ@', 'OGloboPolitica', ':', 'ĠPres', 'ÃŃdio', 'Ġem', 'ĠMinas', 'Ġadota', 'Ġnovo', 'Ġmodelo', 'Ġe', 'Ġconsegue', 'Ġrecuperar', 'Ġ60', '%', 'Ġdos', 'Ġpresos', '.', 'Ċ', 'https', '://', 't', '.', 'co', '/', '82', 'ty', 'x', 'Ir', 'CW', '9', 'Ġhttps', '://', 't', '.', 'co', '/', '1', 'im', 'uf', '9', 'X', '85', '</s>']}
tokens
{'logits': [-2.8157010078430176, -3.142913818359375, 5.969962120056152], 'probs': [0.0001528693683212623, 0.00011020820966223255, 0.9997368454933167], 'token_ids': [0, 109398, 962, 14905, 66878, 101977, 279, 5648, 106353, 117967, 37722, 22

tokens
{'logits': [-3.1644082069396973, -2.9537854194641113, 5.848018646240234], 'probs': [0.00012185257946839556, 0.00015042049926705658, 0.9997277855873108], 'token_ids': [0, 357, 4080, 81, 5418, 5420, 30, 8335, 2325, 356, 15788, 16, 1991, 274, 12893, 262, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Neutro', 'tokens': ['<s>', 'Ġ@', 'oficial', 'm', 'char', 'iel', ':', 'ĠFui', 'Ġparar', 'Ġem', 'Ġcontagem', ',', 'Ġestado', 'Ġde', 'Ġminas', 'Ġe', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-2.5986387729644775, 6.224883556365967, -3.1247594356536865], 'probs': [0.00014719435421284288, 0.9997658133506775, 8.697612065589055e-05], 'token_ids': [0, 

tokens
{'logits': [6.601619720458984, -1.9237773418426514, -3.539724111557007], 'probs': [0.999762237071991, 0.00019831872486975044, 3.940641909139231e-05], 'token_ids': [0, 357, 148856, 1461, 22, 30, 50578, 26533, 21085, 321, 2684, 377, 23119, 7772, 274, 8356, 308, 1598, 6358, 274, 19559, 356, 12774, 274, 8543, 341, 329, 88, 18, 265, 19, 28, 59, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Positivo', 'tokens': ['<s>', 'Ġ@', 'Caco', 'Brasil', '2', ':', 'ĠPOLÃįCIA', 'ĠFEDERAL', 'Ġprende', 'Ġum', 'Ġhomem', 'Ġpor', 'ĠtrÃ¡fico', 'Ġinternacional', 'Ġde', 'Ġdrogas', 'Ġcom', 'Ġ10', 'kg', 'Ġde', 'ĠcocaÃŃna', 'Ġem', 'Ġaeroporto', 'Ġde', 'ĠMG', 'Ġhttps', '://', 't', '.', 'co', '/', '8', 'W', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [6.930508136749268, -2.3912909030914307, -3.1513609886169434], 'probs': [0.9998687505722046, 8.944107685238123e-05, 4.1825664084171876e-05], 'token_ids': [0, 357, 43, 21, 11276, 201

tokens
{'logits': [-2.901075601577759, -2.983099937438965, 5.882585525512695], 'probs': [0.00015317107317969203, 0.00014110874326433986, 0.9997057318687439], 'token_ids': [0, 434, 51277, 16, 1688, 34037, 709, 2684, 356, 7481, 96411, 341, 329, 88, 18, 265, 19, 81, 3192, 5779, 3742, 89, 10717, 341, 329, 88, 18, 265, 19, 1332, 2915, 15533, 4607, 2, 1, 1, 1, 1, 1, 1], 'label': 'Neutro', 'tokens': ['<s>', 'Ġser', 'Ġagredida', ',', 'Ġmulher', 'Ġatira', 'Ġcontra', 'Ġhomem', 'Ġem', 'ĠGovernador', 'ĠValadares', 'Ġhttps', '://', 't', '.', 'co', '/', 'm', 'IR', 'tn', '32', 'u', 'GM', 'Ġhttps', '://', 't', '.', 'co', '/', 'ed', 'PA', 'JT', 'MG', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [-3.6507411003112793, 6.248327255249023, -2.3047478199005127], 'probs': [5.0209215260110795e-05, 0.9997568726539612, 0.00019290397176519036], 'token_ids': [0, 357, 50, 67, 73181, 30, 75253, 16, 8906, 17138, 16, 4790, 11662, 2673, 5508, 8222, 35, 203, 203, 1219, 12433, 12341, 1

tokens
{'logits': [-3.6855061054229736, -2.3502635955810547, 5.864651203155518], 'probs': [7.116572669474408e-05, 0.0002704952785279602, 0.9996583461761475], 'token_ids': [0, 357, 7159, 67, 47092, 67, 5839, 30, 3749, 16007, 584, 8, 693, 16, 24, 7153, 263, 146404, 51630, 348, 20675, 17, 39139, 836, 57088, 836, 2755, 274, 7103, 341, 329, 88, 18, 265, 19, 87, 5547, 2140, 2137, 57, 2], 'label': 'Neutro', 'tokens': ['<s>', 'Ġ@', 'Cris', '_', 'duh', '_', '123', ':', 'ĠGoverno', 'Ġpagou', 'ĠR', '$', 'Ġ2', ',', '4', 'ĠbilhÃµes', 'Ġa', 'Ġempreiteiras', 'Ġalvos', 'Ġda', 'ĠLava', '-', 'Jato', 'Ġ-', 'ĠPolitica', 'Ġ-', 'ĠEstado', 'Ġde', 'ĠMinas', 'Ġhttps', '://', 't', '.', 'co', '/', 's', 'DS', 'zz', '77', 'U', '</s>']}
tokens
{'logits': [6.963513374328613, -2.538334846496582, -2.9475560188293457], 'probs': [0.9998756647109985, 7.470432319678366e-05, 4.96161483169999e-05], 'token_ids': [0, 274, 4388, 731, 5931, 18253, 408, 2640, 356, 131704, 33122, 11142, 341, 329, 88, 18, 265, 19, 31659, 27, 7161,

tokens
{'logits': [-2.0626277923583984, -3.4894726276397705, 5.670960426330566], 'probs': [0.0004376324941404164, 0.0001050603314070031, 0.9994572997093201], 'token_ids': [0, 328, 6118, 263, 19956, 2408, 1727, 40200, 356, 7481, 96411, 341, 329, 88, 18, 265, 19, 13761, 1872, 10669, 716, 7580, 350, 38810, 32973, 350, 18543, 3414, 57681, 350, 6952, 93, 18543, 3414, 57681, 3336, 45, 2133, 2], 'label': 'Neutro', 'tokens': ['<s>', 'ĠÃ©', 'Ġmorto', 'Ġa', 'Ġtiros', 'ĠapÃ³s', 'Ġdeixar', 'ĠpresÃŃdio', 'Ġem', 'ĠGovernador', 'ĠValadares', 'Ġhttps', '://', 't', '.', 'co', '/', 'vk', 'SO', 'AU', 'vo', 'kz', 'Ġ#', 'SomosTodos', 'Dilma', 'Ġ#', 'Ray', 'nn', 'iere', 'Ġ#', 'Bu', 'y', 'Ray', 'nn', 'iere', 'On', 'I', 'tun', '</s>']}
tokens
{'logits': [-1.6449451446533203, -2.665390729904175, 4.34192419052124], 'probs': [0.0025029622483998537, 0.0009021537262015045, 0.9965948462486267], 'token_ids': [0, 597, 283, 274, 14837, 17724, 25170, 532, 16, 2533, 857, 7875, 836, 12231, 836, 2755, 274, 7103, 341, 329,

tokens
{'logits': [-3.496389389038086, 6.36854887008667, -2.5350778102874756], 'probs': [5.1955310482298955e-05, 0.9998121857643127, 0.00013586970453616232], 'token_ids': [0, 357, 12832, 30, 1996, 12433, 12341, 16, 1265, 274, 8543, 2656, 1507, 305, 144922, 341, 329, 88, 18, 265, 19, 85, 2956, 31619, 1043, 60, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 'Negativo', 'tokens': ['<s>', 'Ġ@', 'VEJA', ':', 'ĠEm', 'Ġcalamidade', 'Ġfinanceira', ',', 'Ġgoverno', 'Ġde', 'ĠMG', 'Ġcompra', 'Ġdois', 'Ġh', 'elicÃ³pteros', 'Ġhttps', '://', 't', '.', 'co', '/', 'q', 'af', 'iU', 'ip', 'X', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']}
tokens
{'logits': [6.492689609527588, -2.2986764907836914, -3.1829965114593506], 'probs': [0.9997852444648743, 0.00015200738562271, 6.277833017520607e-05], 'token_ids': [0, 274, 7103, 5990, 101034, 15814, 384, 8428, 13815, 356, 128537, 6475, 836, 341, 329, 88, 18, 265, 19, 9769, 57, 846, 4518, 39, 26, 2, 1, 1, 1, 1, 1], 

tokens
{'logits': [-3.2739901542663574, -2.992088556289673, 6.165467739105225], 'probs': [7.95088053564541e-05, 0.00010540069342823699, 0.9998151659965515], 'token_ids': [0, 15072, 5134, 18780, 3217, 389, 7481, 96411, 16, 7103, 11296, 1341, 19, 357, 51474, 67, 13756, 1970, 357, 113686, 570, 818, 75, 357, 15714, 38446, 89041, 357, 15325, 20966, 4915, 341, 329, 88, 18, 265, 19, 6890, 24, 61, 23, 75, 21, 2], 'label': 'Neutro', 'tokens': ['<s>', "'m", 'Ġat', 'ĠBig', 'ĠMais', 'Ġin', 'ĠGovernador', 'ĠValadares', ',', 'ĠMinas', 'ĠGerais', 'Ġw', '/', 'Ġ@', 'tue', '_', 'nunes', '07', 'Ġ@', 'comba', 'ten', 'tem', 'g', 'Ġ@', 'cle', 'verson', '0505', 'Ġ@', 'new', 'rand', 'ley', 'Ġhttps', '://', 't', '.', 'co', '/', 'sr', '4', 'Y', '3', 'g', '1', '</s>']}
tokens
{'logits': [6.97915506362915, -2.450558662414551, -3.0967235565185547], 'probs': [0.9998775720596313, 8.029231685213745e-05, 4.207736492389813e-05], 'token_ids': [0, 52, 118823, 25041, 836, 51253, 348, 3998, 5085, 36543, 54232, 29706, 274, 

In [7]:
predict(document_path='test_submission.json', model_path='snli-roberta-parameters-full/model.tar.gz', out_path='submissions_snli_roberta_parameters_full.csv')

Loading model from snli-roberta-parameters-full/model.tar.gz
Loading batches from test_submission.json for prediction
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_snli_roberta_parameters_full.csv


In [2]:
stopwords = [word.strip() for word in open('stopwords-pt.txt', mode='r', encoding='utf8')]

def load_dataset(filename, train=True):
    raw_df = pd.read_csv(filename)
    if train:
        raw_df = raw_df.rename(columns={'Classificacao': 'label'})
    return raw_df

def describe_dataset(dataframe):
    print(dataframe.label.unique())
    print(dataframe.label.describe())
    print(dataframe.groupby('label')['Text'].count())

def write_predictions(dataframe, predictions, out_path):
    count = 0

    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions to %s' % out_path)
        out_file.write('Id,Category\n')
        idx = 0
        for result, (idx, row) in zip(predictions, dataframe.iterrows()):
            count += 1
            out_file.write('{},{}\n'.format(row['Id'], result))
            idx += 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    out_file.close()
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())
    
def train_svm_model(train_df, test_df = None, submission_df = None, submission_name: str = None):
    processed_clf_svm = Pipeline([('vect', CountVectorizer(stop_words=stopwords)), ('tfidf', TfidfTransformer()),
                                  ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42))])

    processed_clf_svm = processed_clf_svm.fit(train_df['Text'], train_df['label'])
    if test_df is not None:
        predicted_svm = processed_clf_svm.predict(test_df['Text'])
        print(balanced_accuracy_score(test_df['label'], predicted_svm))
    if submission_name is not None:
        predictions = processed_clf_svm.predict(submission_df['Text'])
        write_predictions(submission_df, predictions, 'submissions_' + submission_name + '.csv')
    return processed_clf_svm, predictions

In [3]:
train_pd = load_dataset('train.csv')
test_pd = load_dataset('test.csv')

In [4]:
train_pd

Unnamed: 0,Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,label,Observação,Id
0,Mon Jan 09 15:27:43 +0000 2017,Dois são detidos ao tentar jogar celulares e d...,,,,Michele #beta #sdv,michelexmbeta,0,Positivo,,6272
1,Sun Jan 08 02:14:34 +0000 2017,me matan esas minas q cambian 554 veces su fot...,,,Núñez - C.A.B.A.,Gaby Messina,gabymessina36,0,Neutro,,1644
2,Sat Feb 11 09:49:11 +0000 2017,Líderes de motim em presídio de Minas Gerais s...,,,"Hollywood, CA",Wendie Rower,Wendie_Rower,0,Positivo,,7956
3,Thu Jan 05 14:43:03 +0000 2017,#Mídia: Press Release from Business Wire : Di...,,,SP,Marcello Binder,binderbr,0,Neutro,,85
4,Wed Feb 08 22:52:10 +0000 2017,Vacinação contra febre amarela é intensificada...,,,,fodido,eufodeu,0,Positivo,,6006
...,...,...,...,...,...,...,...,...,...,...,...
6554,Thu Jan 26 14:31:45 +0000 2017,Rio faz bloqueio contra febre amarela em munic...,,,Goiás - Brasil,Altair Tavares,altairtavares,0,Positivo,,5735
6555,Fri Feb 10 18:13:01 +0000 2017,Governador Fernando Pimentel entrega 401 veícu...,,,Santos Dumont - MG -Brasil,POSSANTE ON LINE,possanteonline,0,Positivo,,5192
6556,Thu Jan 05 17:19:20 +0000 2017,Secretaria de Educação faz reformulações para ...,,,Minas Gerais - Brasil,Uberlândia,PrefeituraUdia,0,Positivo,,5391
6557,Thu Jan 05 14:15:02 +0000 2017,E governo ainda quer indenizar a família dos b...,,,,Graça Azeredo,azeredo_mg,0,Neutro,,861


In [5]:
describe_dataset(train_pd)

['Positivo' 'Neutro' 'Negativo']
count         6559
unique           3
top       Positivo
freq          2639
Name: label, dtype: object
label
Negativo    1970
Neutro      1950
Positivo    2639
Name: Text, dtype: int64


In [6]:
train_split_pd, test_split_pd = train_test_split(train_pd, stratify=train_pd['label'], test_size=0.2)

In [12]:
# train_split_pd[['Text', 'label']].rename(columns={'Text': 'text'}).to_json('train.json', orient='records', lines=True)
# test_split_pd[['Text', 'label']].rename(columns={'Text': 'text'}).to_json('test.json', orient='records', lines=True)
train_pd[['Text', 'label']].rename(columns={'Text': 'text'}).to_json('train_full.json', orient='records', lines=True)

In [29]:
describe_dataset(train_split_pd)

['Neutro' 'Positivo' 'Negativo']
count         5247
unique           3
top       Positivo
freq          2111
Name: label, dtype: object
label
Negativo    1576
Neutro      1560
Positivo    2111
Name: Text, dtype: int64


In [30]:
describe_dataset(test_split_pd)

['Negativo' 'Positivo' 'Neutro']
count         1312
unique           3
top       Positivo
freq           528
Name: label, dtype: object
label
Negativo    394
Neutro      390
Positivo    528
Name: Text, dtype: int64


In [55]:
model, predictions = train_svm_model(train_split_pd, test_split_pd, test_pd, 'stopwords_svm_tfidf')

0.9515849632600902
Saving predictions to submissions_stopwords_svm_tfidf.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_stopwords_svm_tfidf.csv


In [59]:
model, predictions = train_svm_model(train_df=train_pd, submission_df=test_pd, submission_name='stopwords_svm_tfidf_full')

Saving predictions to submissions_stopwords_svm_tfidf_full.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_stopwords_svm_tfidf_full.csv


In [5]:
test_pd

Unnamed: 0,Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,Observação,Id
0,Thu Jan 05 12:00:34 +0000 2017,RT @JDanieldf: Pedindo para que MG reaja? Reag...,,,Balneário Camboriú - SC,Mirela Franz,MiLick74,27,,3568
1,Fri Jan 06 11:54:50 +0000 2017,Homem que matou ex-mulher e jogou corpo em cis...,,,Belo Horizonte MG Brasil,Ricardo Carlini,carlinibh,1,,1323
2,Sat Feb 11 15:51:14 +0000 2017,"New post: ""Três adolescentes são apreendidos p...",,,,Camila Maciel Serrão,CamilaMacielSer,0,,7976
3,Wed Jan 04 18:08:43 +0000 2017,RT @AnaPaulaVolei: Mais 2 helicópteros!!A cara...,,,Sao Paulo,CLAUDIA DELAFIORI,cdelafiori,444,,2408
4,Wed Jan 04 18:12:12 +0000 2017,"RT @UOLNoticias: Custaram R$ 21,8 milhões: Mes...",,,Jaboatão dos Guararapes,Rodrigo Calabria,CalabriaRodrigo,141,,4435
...,...,...,...,...,...,...,...,...,...,...
1635,Sat Jan 07 12:51:37 +0000 2017,RT @ivo123zarate3: Me hace mal ver en instagra...,,,M e r c e d e s (Ctes),Ere Esse♡,RamoohSilvero,12,,3536
1636,Wed Jan 25 14:18:35 +0000 2017,@PMMG190 - Militares da 22ª Cia prendem autore...,,,Minas Gerais,POLÍCIA MILITAR MG,pmmg190,0,,6881
1637,Mon Jan 09 11:18:11 +0000 2017,Cadeia em Manaus tem 4 mortos; Estados pedem a...,,,,Lenilda Miranda,nilda_ap,0,,627
1638,Fri Jan 06 13:02:26 +0000 2017,Reforma da Previdência será feita no primeiro ...,,,Brasil,marli silvera dziadz,marlidzdz59,0,,2165
