In [20]:
import os

import collections

import matplotlib
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from unidecode import unidecode
import gensim


%matplotlib inline

matplotlib.rcParams['figure.figsize'] = (20.0, 20.0)

In [15]:
# base vars
datasets_path = '../datasets/'
csv_path = datasets_path +'csv/'
outputs_path = '../outputs/'

In [16]:
#show project dirs
p_dirs = [d for d in os.listdir(datasets_path) if os.path.isdir(os.path.join(datasets_path, d)) and not d.startswith('.')]
p_dirs

['csv', 'tweets']

In [17]:
#show corpus files
corpus_path = datasets_path+'{0}/'.format(p_dirs[1])
cur_files = [e for e in os.listdir(corpus_path) if os.path.isfile(os.path.join(corpus_path, e)) and not e.startswith('.')]
cur_files

['2019-11-06 21:50:52.267798.txt']

In [18]:
#get file info
cur_file_path = corpus_path+cur_files[0]
separator = '||'

tmp_text = '';
with open(cur_file_path, 'r') as file:
    tmp_text = file.read()
    file.close()

#text preparation
tmp_text = unidecode(tmp_text.lower()) #tirando  acentuação e colocando em letras minúsculas
tmp_text = ' '.join(tmp_text.splitlines()) #removendo \n

##tweets txt
tmp_text = tmp_text.split('||')

##gerando matriz esparsa
nltk.download('punkt')
tmp_text = [[f.lower() for f in word_tokenize(e)] for e in tmp_text]

tmp_text

[nltk_data] Downloading package punkt to /home/albrigs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['@',
  'arcenio_igor',
  '@',
  'immari_bitch',
  'pior',
  'que',
  'tem',
  'notebook',
  'mesmo',
  'mkkkkkkkk'],
 ['rt',
  '@',
  'bullshhhit',
  ':',
  'mexer',
  'no',
  'notebook',
  'dentro',
  'de',
  'casa',
  'com',
  'uma',
  'arma',
  'do',
  'lado',
  'e',
  'o',
  'maior',
  'conceito',
  'de',
  'micropenis',
  'ja',
  'visto',
  'https',
  ':',
  '//t.co/fphq0lvnv4'],
 ['o',
  'unico',
  'dia',
  'que',
  'eu',
  'to',
  'em',
  'casa',
  'e',
  'preciso',
  'do',
  'notebook',
  'pra',
  'estudar',
  ',',
  'minha',
  'irma',
  'usa'],
 ['aaaaaaaaaaah', 'meu', 'notebook', 'ta', 'funcionando', '!', '!', '!', '!'],
 ['na',
  'porra',
  'do',
  'celular',
  'nem',
  'adianta',
  'jogar',
  'pq',
  'vai',
  'fuder',
  'ele',
  'mais',
  'ainda',
  ',',
  'e',
  'o',
  'notebook',
  'nao',
  'aguenta',
  'nem',
  'rodar',
  '50',
  'de',
  'fps'],
 ['preciso',
  'comprar',
  'um',
  'notebook',
  'ate',
  '2000',
  'reais',
  'se',
  'alguem',
  'tiver',
  'alguma',
  's

In [28]:
#most common words

words = []
for e in tmp_text:
    words.extend(e)

common_words = collections.Counter(words).most_common()

print(len(words))
print('---------')
print(common_words[:30])

17539
---------
[(':', 542), ('@', 540), ('e', 524), ('de', 437), (',', 414), ('o', 398), ('a', 354), ('que', 329), ('eu', 320), ('rt', 301), ('do', 277), ('...', 221), ('um', 220), ('camera', 197), ('pra', 174), ('no', 171), ('nao', 168), ('?', 168), ('.', 166), ('https', 158), ('com', 150), ('celular', 142), ('meu', 141), ('na', 138), ('uma', 125), ('em', 116), ('da', 110), ('iphone', 109), ('notebook', 101), ('memoria', 97)]


In [29]:
#building dictionary
vocab_size = 10_000

w_vocab = common_words[:(vocab_size-1)]

w_vocab

[(':', 542),
 ('@', 540),
 ('e', 524),
 ('de', 437),
 (',', 414),
 ('o', 398),
 ('a', 354),
 ('que', 329),
 ('eu', 320),
 ('rt', 301),
 ('do', 277),
 ('...', 221),
 ('um', 220),
 ('camera', 197),
 ('pra', 174),
 ('no', 171),
 ('nao', 168),
 ('?', 168),
 ('.', 166),
 ('https', 158),
 ('com', 150),
 ('celular', 142),
 ('meu', 141),
 ('na', 138),
 ('uma', 125),
 ('em', 116),
 ('da', 110),
 ('iphone', 109),
 ('notebook', 101),
 ('memoria', 97),
 ('!', 94),
 ('se', 94),
 ('aplicativo', 91),
 ('ventilador', 88),
 ('tem', 84),
 ('q', 77),
 ('mas', 72),
 ('ja', 68),
 ('minha', 67),
 ('me', 67),
 ('so', 67),
 ('ta', 58),
 ('ele', 58),
 ('``', 58),
 ('por', 57),
 ('mais', 56),
 ('#', 56),
 ("''", 53),
 ('todo', 50),
 ('esta', 49),
 ('vcs', 49),
 ('to', 47),
 ('gente', 46),
 ('para', 44),
 ('ou', 43),
 ('ai', 42),
 ('quem', 42),
 ('cabeca', 41),
 ('ate', 40),
 ('tudo', 37),
 ('como', 37),
 ('dia', 36),
 ('sem', 36),
 ('as', 36),
 ('ver', 36),
 ('pq', 35),
 ('tava', 35),
 ('esse', 32),
 ('vai', 31