# Functions

In [None]:
import csv
import re
import nltk
import string
import unicodedata
import sys
nltk.download('stopwords')
from datetime import datetime
import time
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
from collections import Counter

def strip_accents(text):
    """Return *text* with accents removed and non-ASCII characters dropped.

    Args:
        text: A ``str``, or UTF-8 encoded ``bytes``.

    Returns:
        A ``str`` containing only ASCII characters. Accented letters are
        reduced to their base letter; characters that have no ASCII base
        form are removed entirely.
    """
    # The previous implementation relied on the Python-2-only ``unicode``
    # builtin, so byte input was never decoded under Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # NFD splits each accented character into base letter + combining mark;
    # encoding to ASCII with 'ignore' then discards the combining marks.
    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    return str(ascii_only)
# Module-level lists, both starting empty. They are not referenced by any
# code visible in this file — NOTE(review): confirm whether they are still used.
docs, labels = [], []

def remove_hashtag(text):
    """Remove every whitespace-separated token that starts with ``#``.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces (runs of whitespace in
        the input are therefore collapsed).
    """
    # Build a new list instead of removing while iterating: the in-place
    # removal skipped the token following each hashtag, so consecutive
    # hashtags were left behind.
    kept = [word for word in text.split() if not word.startswith('#')]
    return ' '.join(kept)


def remove_at(text):
    """Drop every whitespace-separated token that starts with ``@``.

    The surviving tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    survivors = [token for token in text.split() if not token.startswith('@')]
    return ' '.join(survivors)


def remove_link(text):
    """Delete every ``http://`` or ``https://`` URL from *text*.

    A URL is taken to run from the scheme up to the next whitespace
    character. Surrounding whitespace is left untouched.

    Args:
        text: Input string.

    Returns:
        *text* with each URL replaced by the empty string.
    """
    # Substitute each match in place. The earlier findall + str.replace
    # approach damaged a longer URL whenever a shorter matched URL was a
    # prefix of it (e.g. "http://a" inside "http://ab" left a stray "b").
    return re.sub(r'https?://[^\s]+', '', text)


def remove_stopwords(txt):
    """Filter Portuguese stopwords from *txt* and normalise what remains.

    Processing per whitespace-separated word, in this order:
      1. drop the word if its lower-cased form is in NLTK's Portuguese
         stopword list (the comparison uses the raw token, i.e. before
         punctuation or accents are stripped);
      2. delete ``string.punctuation`` characters;
      3. replace each run of digits with a single space;
      4. strip accents (NFKD decomposition, combining marks removed);
      5. replace every remaining character that is not an ASCII letter,
         digit, space or backslash with a space;
      6. lower-case and trim; keep only results longer than two characters
         that are not purely digits.

    Args:
        txt: Input string.

    Returns:
        The kept tokens joined by single spaces. A token may itself contain
        interior spaces introduced by steps 3 and 5.
    """
    # A set gives constant-time membership tests for the stopword filter.
    stop = set(nltk.corpus.stopwords.words('portuguese'))
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    tokens = []
    for word in txt.split():
        if word.lower() in stop:
            continue

        word = punctuation.sub('', word)
        # Raw strings keep the regex escapes valid Python string literals.
        word = re.sub(r"\d+", " ", word)

        nfkd = unicodedata.normalize('NFKD', word)
        palavra = u''.join(c for c in nfkd if not unicodedata.combining(c))
        q = re.sub(r'[^a-zA-Z0-9 \\]', ' ', palavra)
        if len(q) < 3:
            continue

        token = q.lower().strip()
        # NFKD can turn compatibility characters into plain digits, so the
        # digit-only check is still needed after step 3.
        if len(token) > 2 and not token.isdigit():
            tokens.append(token)

    return ' '.join(tokens)

def clean(text):
    """Run *text* through the cleaning steps and return the result.

    Steps, in order: remove_at, remove_link, remove_stopwords,
    remove_hashtag.
    """
    # NOTE(review): remove_stopwords strips string.punctuation, which
    # includes '#', before remove_hashtag runs — so the final step appears
    # to have nothing left to remove. Confirm the intended order.
    pipeline = (remove_at, remove_link, remove_stopwords, remove_hashtag)
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

def load(file):
    """Read the text file at *file* and return its lines as a list.

    Every newline character is removed from each line; blank lines are kept
    as empty strings.
    """
    with open(file) as handle:
        return [row.replace("\n", "") for row in handle]

# Extracting nouns

In [None]:
#ref https://github.com/fmaruki/Nltk-Tagger-Portuguese

import pickle
import nltk

def topWords(df, number, useStop, stopFile):
    """Return the most frequent nouns among the matching lines of ``df.text``.

    A line is selected when it contains, as a substring, any entry of the
    module-level ``words`` list. Selected lines are passed through
    ``clean``, joined, sentence-split, POS-tagged with the tagger stored in
    ``tagger.pkl``, and the tokens tagged ``'NOUN'`` are counted. Tokens
    starting with ``"kk"`` are ignored.

    Args:
        df: Object exposing an iterable ``text`` attribute of strings
            (presumably a pandas DataFrame — verify against caller).
        number: How many (noun, count) pairs to return.
        useStop: When true, nouns listed in *stopFile* are excluded.
        stopFile: Path of a file with one excluded word per line; only read
            when *useStop* is true.

    Returns:
        A list of ``(noun, count)`` tuples, most frequent first, as produced
        by ``Counter.most_common(number)``. Empty when no line matches.
    """
    selected = [clean(line) for line in df.text
                if any(ele in line for ele in words)]
    text = ' '.join(selected)

    # NOTE(review): unpickling executes arbitrary code; only load a
    # tagger.pkl that comes from a trusted source.
    with open("tagger.pkl", 'rb') as tagger_file:
        tagger = pickle.load(tagger_file)
    portuguese_sent_tokenizer = nltk.data.load("tokenizers/punkt/portuguese.pickle")

    # Walk every sentence: indexing only the first one raised IndexError on
    # empty text and ignored any further sentences.
    nouns = []
    for sentence in portuguese_sent_tokenizer.tokenize(text):
        for token, tag in tagger.tag(nltk.word_tokenize(sentence)):
            if tag == 'NOUN' and not token.startswith("kk"):
                nouns.append(token)

    if useStop:
        stops = set(load(stopFile))
        nouns = [noun for noun in nouns if noun not in stops]

    return Counter(nouns).most_common(number)

In [None]:
import os.path
import pandas as pd

import datetime
now = datetime.datetime.now()

# Two-digit year and month fragments used to build the daily file names.
years = ["20", "21"]
months = ["%02d" % month_number for month_number in range(1, 13)]
# NOTE(review): path and filename are concatenated without a separator
# below, so this value must end with one if it names a directory.
path = "my-path"

# Number of top words requested per day.
n = 100

# Substrings a tweet must contain to be considered by topWords.
words = ["corona", "covid"]

# One CSV-style row per day that has an input file.
top_list = []

for year in years:
    for month in months:
        # Every month is probed for days 1..31; dates without a file are skipped.
        for day in range(1, 32):
            day_str = str(day).zfill(2)
            filename = "tweets_" + day_str + month + year + ".txt"

            if not os.path.isfile(path + filename):
                continue

            df = pd.read_csv(path + filename, sep=",", header=0, quotechar="'")
            top = topWords(df[df["verified"] == True], n, True, 'stopVerbs')
            line = "".join(",'" + pair[0] + "'," + str(pair[1]) for pair in top)
            top_list.append(day_str + month + year + line)
                
                


In [None]:
# Create (or truncate) the output file and write its header row:
# "dateDDMMYY" followed by a word/count column pair for each of the n ranks.
topFilename = f"top{n}words_verified.txt"
fields = "".join(f",word{rank},count{rank}" for rank in range(1, n + 1))

with open(topFilename, "w") as topWordsFile:
    topWordsFile.write("dateDDMMYY" + fields + "\n")

In [None]:
# Append one row per collected day, with any double quotes stripped out.
with open(topFilename, "a") as topWordsFile:
    topWordsFile.writelines(
        entry.replace('"', '') + "\n" for entry in top_list
    )

In [None]:
# Report the wall-clock time at which the notebook finished.
now = datetime.datetime.now()
print("Finish: ", now, sep="\n")