%%capture 
!pip install import-ipynb 
!pip install textblob 
!pip install spacy 
!pip install nltk 
!pip install -U sklearn 
!pip install googletrans==3.1.0a0

In [1]:
import pandas as pd
import numpy as np
import spacy
import os
import re
import nltk
import unicodedata
import glob

from leia import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from sklearn.metrics import accuracy_score
from googletrans import Translator
from nltk.corpus import stopwords

In [2]:
#Execute de line below to download de 'pt_core_news_sm'
!python -m spacy download pt_core_news_sm

translator = Translator(service_urls=['translate.googleapis.com'])
#nltk.download('punkt')
nlp = spacy.load('pt_core_news_sm')
#nltk.download('stopwords')
sw = set(stopwords.words('portuguese'))

Collecting pt-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.2.0/pt_core_news_sm-3.2.0-py3-none-any.whl (22.2 MB)
[+] Download and installation successful
You can now load the package via spacy.load('pt_core_news_sm')


## Preprocessing Functions

In [3]:
def remove_tt_username(text):
    """Delete Twitter-style mentions from ``text``.

    The input is converted with ``str()``; every ``@`` together with the run
    of non-whitespace characters following it is removed (nothing is put in
    its place, so surrounding spaces are kept as they were).
    """
    return re.sub(r'\@\S+', '', str(text))

def identify_emoticons(text):
    """Replace common ASCII emoticons with a Portuguese textual description.

    The input is converted with ``str()``. Replacements (each padded with
    spaces):

    * ``>:(`` / ``>:-(``            -> ``' cara brava '``
    * ``:)`` / ``:-)``              -> ``' cara feliz '``
    * ``:D`` / ``:P`` (any case)    -> ``' cara feliz '``
    * ``:(`` / ``:-(`` / ``:'(``    -> ``' cara triste '``

    Returns the rewritten string.
    """
    text = str(text)
    replacements = (
        # The angry face must be handled first: it contains the sad face
        # ':(' as a suffix, so running the sad pattern earlier would consume
        # it and leave a stray '>' behind.
        (r'\>\:\-?\(+', ' cara brava '),
        (r'\:\-?\)+', ' cara feliz '),
        (r'\:\-?[dDpP]+', ' cara feliz '),
        (r'\:\-?\'?\(+', ' cara triste '),
    )
    for pattern, description in replacements:
        text = re.sub(pattern, description, text)
    return text

def remove_hashtags(text):
    """Delete hashtags from ``text``.

    After ``str()`` conversion, each ``#`` plus the non-whitespace characters
    that follow it is removed without leaving a replacement.
    """
    return re.sub(r'\#\S+', '', str(text))

def remove_phone(text):
    """Replace phone-number-like digit groups in ``text`` with one space.

    The pattern accepts an optional 2-3 digit prefix (optionally wrapped in
    parentheses), an optional space, 4-5 digits, an optional hyphen, an
    optional space and finally 4 digits.
    """
    phone_pattern = r'(\(?(\d{2,3})\)?)?\ ?\d{4,5}\-?\ ?\d{4}'
    return re.sub(phone_pattern, ' ', str(text))

def remove_url(text):
    """Strip links from ``text`` in three passes.

    1. ``http://`` / ``https://`` URLs are replaced by a single space.
    2. Tokens starting with ``www.`` are deleted outright.
    3. Remaining ``<name>.com`` / ``<name>.com.br`` domains become
       ``' link '``. Inside the character class the ``|`` is a literal pipe,
       so pipes adjacent to the domain are consumed as well.
    """
    cleaned = str(text)
    for pattern, replacement in (
        (r'https?\:\/\/\S+', ' '),
        (r'www\.\S+', ''),
        (r'[a-zA-Z|.]+\.com(\.br)?', ' link '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def remove_date(text):
    """Replace date-like fragments in ``text`` with a single space.

    Three patterns are applied in order:

    * slash-separated numbers such as ``23/04`` or ``23/04/2022``;
    * hyphen-separated numbers such as ``23-04`` or ``23-04-22``;
    * written-out forms such as ``23 de abril`` optionally followed by
      ``de 2022``.
    """
    date_patterns = (
        r'((\d{1,2}\/)(\d{1,2}\/?)(\d{2,4})?)',
        r'((\d{1,2}\-)(\d{1,2}\-?)(\d{2,4})?)',
        r'((\d+(\s+[deDE]+\s+)[aA-zZ|ç|Ç]+((\s+[deDE]+\s+)\d+)?))',
    )
    result = str(text)
    for date_pattern in date_patterns:
        result = re.sub(date_pattern, ' ', result)
    return result

def remove_hour(text):
    """Rewrite time-of-day expressions found in ``text``.

    * ``HH:MM`` forms that carry an ``r``/``s`` suffix (for example
      ``10:30hs``) are replaced by a single space. A bare ``10:30`` does not
      match this pattern and is left untouched.
    * ``10h30``-style forms are replaced by the placeholder ``' < hora > '``.
    """
    without_suffixed = re.sub(
        r'(\d+)\:(\d+)[hH]?(\:\d+)?[hH]?[rsRS]\w?', ' ', str(text)
    )
    return re.sub(r'(\d+)[hH](\d+)', ' < hora > ', without_suffixed)

def remove_number(text):
    """Return ``str(text)`` with every ASCII digit (0-9) deleted."""
    return re.sub(r'[0-9]', '', str(text))

def lowercase(text):
    """Return the lower-cased string form of ``text``."""
    return str(text).lower()

def remove_oneword(text):
    """Keep only texts made of more than one whitespace-separated token.

    Returns ``str(text)`` when it splits into at least two tokens, otherwise
    ``None`` (this includes empty strings and single words).
    """
    as_string = str(text)
    token_count = len(as_string.split())
    return as_string if token_count > 1 else None

def remove_stopword(text):
    """Drop stop words from ``text``.

    The string form of ``text`` is split on whitespace, tokens present in the
    module-level set ``sw`` are discarded, and the remaining tokens are joined
    back with single spaces. Matching is exact (case- and accent-sensitive).
    """
    kept_tokens = filter(lambda token: token not in sw, str(text).split())
    return ' '.join(kept_tokens)

def remove_accent(text):
    """Strip diacritics from ``text`` and drop any non-ASCII character.

    The string is NFKD-normalised so accented letters split into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    discards the marks (and anything else outside ASCII).
    """
    decomposed = unicodedata.normalize('NFKD', str(text))
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode("utf-8")

def remove_emoji(text):
    """Delete runs of characters that fall inside the hard-coded code-point ranges.

    The ranges are meant to cover emoji, but they are wide: for instance
    U+24C2-U+1F251 and U+1F600-U+E007F also span many non-emoji characters.
    Characters below U+24C2, such as accented Latin letters, are not affected.
    """
    code_point_ranges = (
        u"\U0001F600-\U000E007F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    emoji_pattern = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', str(text))

def remove_punction(text):
    """Replace each punctuation character in ``text`` with a space.

    The character class lists the ASCII punctuation marks plus ``º``. A
    literal backslash is not part of the class and is therefore kept.
    """
    punctuation_class = r'[!"#$%&\'()*+,-.º<>/:;=?@[/\/\]^_`{|}~]'
    return re.sub(punctuation_class, ' ', str(text))

def preprocessing(data):
    """Run the full text-cleaning pipeline over a collection of tweets.

    ``data`` is wrapped in a ``pd.Series`` and every cleaning step is applied
    element-wise, in the fixed order listed below. The last step,
    ``remove_oneword``, returns ``None`` for texts with fewer than two tokens,
    so the resulting Series can contain missing values.

    Returns the cleaned ``pd.Series``.
    """
    # Order matters: each step receives the output of the previous one.
    cleaning_steps = (
        remove_tt_username,
        remove_hashtags,
        identify_emoticons,
        remove_url,
        remove_phone,
        remove_hour,
        remove_date,
        remove_number,
        remove_emoji,
        lowercase,
        remove_stopword,
        remove_accent,
        remove_punction,
        remove_oneword,
    )
    cleaned = pd.Series(data)
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    return cleaned

## Models

### LeIA

In [4]:
analyzer = SentimentIntensityAnalyzer()
def leia(text):
    """Label ``text`` using the module-level LeIA ``analyzer``.

    The string form of ``text`` is scored with ``analyzer.polarity_scores``
    and the ``'compound'`` value decides the label:
    ``>= 0.05`` -> ``'positivo'``, ``<= -0.05`` -> ``'negativo'``,
    anything in between -> ``'neutro'``.
    """
    scores = analyzer.polarity_scores(str(text))

    # classify the sentence using the compound score
    compound = scores['compound']
    if compound >= 0.05:
        return 'positivo'
    if compound <= -0.05:
        return 'negativo'
    return 'neutro'

In [5]:
# NOTE(review): looks like a scratch cell checking how double quotes are
# printed; it defines nothing and can probably be removed.
print('teste "aspas" teste')

teste "aspas" teste


### TextBlob + ReLi

In [6]:
def textblob(sentence):
    """Label a Portuguese sentence by translating it to English and scoring it with TextBlob.

    The string form of ``sentence`` is translated from ``'pt'`` to ``'en'``
    with the module-level ``translator``; the polarity of the translated text
    decides the label.

    Returns:
        ``'positivo'`` when polarity > 0, ``'negativo'`` when polarity < 0,
        ``'neutro'`` when it is exactly 0, and ``None`` when translating or
        scoring fails (a message is printed in that case).
    """
    sentence = str(sentence)

    # translate the text to english
    try:
        translated = translator.translate(sentence, src='pt', dest='en').text
        polarity = TextBlob(translated).sentiment.polarity
    except Exception:
        # Best effort: one failed request must not abort a long run over the
        # whole dataset. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit working so the cell can be stopped.
        print('error to translate "'+sentence+'"')
        return None

    if polarity > 0:
        return 'positivo'
    if polarity < 0:
        return 'negativo'
    return 'neutro'

### OpLexicon

In [7]:
with open('lexico_v3.0.txt', 'r') as f:
    lines = f.readlines()

lines = [str(x.strip()) for x in lines]
pol_dict = {}

for line in lines:
    word, _, pol, _ = line.split(',')
    
    if word not in pol_dict.keys():
        pol_dict[word] = pol

In [8]:
def oplexion(text):
    text = str(text)
    doc = nlp(text)
    pol = 0
    
    for token in doc:
        if token.text in pol_dict.keys():
            if token.pos_ == 'VERB':
                if token.lemma_ in pol_dict.keys():
                    pol += int(pol_dict[str(token.lemma_)])
                else:
                    pol += int(pol_dict[str(token.text)])
            else:
                pol += int(pol_dict[str(token.text)])
        else:
            pol += 0
        
    if pol > 0:
        return 'positivo'
    elif pol < 0:
        return 'negativo'
    else:
        return 'neutro'

### SentiLex

In [9]:
# Read the SentiLex file and build `pol_dict` (word -> polarity string).
# NOTE(review): this rebinds the module-level name `pol_dict`; any function
# that reads `pol_dict` at call time sees the SentiLex data from here on.
with open('SentiLex-lem-PT01.txt', 'r') as f:
    lines = [raw_line.strip() for raw_line in f.readlines()]

pol_dict = {}

for line in lines:
    # everything before the '.' is the word; the rest is a ';'-separated
    # list of attributes
    word, infos = line.split('.')
    # the polarity is read from the fourth attribute, dropping its first
    # four characters
    pol = infos.split(';')[3][4:]

    # keep the first polarity seen for a word
    pol_dict.setdefault(word, pol)

In [10]:
def sentilex(text):
    """Label ``text`` by summing the ``pol_dict`` polarities of its tokens.

    The string form of ``text`` is parsed with the module-level spaCy
    pipeline ``nlp``. Tokens tagged ``VERB`` are looked up by lemma, all
    other tokens by surface form; tokens missing from ``pol_dict`` add
    nothing to the score.

    Returns ``'positivo'`` for a positive sum, ``'negativo'`` for a negative
    sum and ``'neutro'`` when the sum is zero.
    """
    score = 0

    for token in nlp(str(text)):
        key = token.lemma_ if token.pos_ == 'VERB' else token.text
        if key in pol_dict:
            score += int(pol_dict[key])

    if score > 0:
        return 'positivo'
    if score < 0:
        return 'negativo'
    return 'neutro'

In [11]:
# Load every raw CSV whose name contains the collection date 2022-04-23 and
# keep only the columns used by the analysis.
# os.path.join keeps the pattern portable (the hard-coded '\\' separator only
# works on Windows); sorting makes the row order independent of the order in
# which the file system happens to list the files.
raw_files = sorted(glob.glob(os.path.join('raw_data', '*2022-04-23*.csv')))
data = pd.concat(map(pd.read_csv, raw_files))[['Search', 'Tweet_Date', 'Original_Tweet']].reset_index(drop=True)
data.groupby('Search').count()

Unnamed: 0_level_0,Tweet_Date,Original_Tweet
Search,Unnamed: 1_level_1,Unnamed: 2_level_1
Bolsonaro,1000,1000
Ciro Gomes,1000,1000
Doria,1000,1000
Lula,1000,1000


In [12]:
# Sanity check: (rows, columns) right after loading the raw files.
data.shape

(4000, 3)

In [13]:
%%time
# Add the cleaned version of each tweet next to the original text.
# Rows whose cleaned text has fewer than two tokens end up as missing values,
# because the last pipeline step (remove_oneword) returns None for them.
data['Cleaned_Text'] = preprocessing(data['Original_Tweet'])
data.head(10)

Wall time: 217 ms


Unnamed: 0,Search,Tweet_Date,Original_Tweet,Cleaned_Text
0,Bolsonaro,2022-04-23,@ArthurWilliam_ @LulaOficial @ricardostuckert ...,antes o ter sido presidente ter saqueado naca...
1,Bolsonaro,2022-04-23,@CarlosBolsonaro @TerraBrasilnot Se ele trocas...,trocasse insinuacao bolsonaro ministros suprem...
2,Bolsonaro,2022-04-23,@brussel_ive Infelizmente a grande massa de ap...,infelizmente grande massa apoiadores bolsonaro...
3,Bolsonaro,2022-04-23,BOLSONARO REELEITO 22 🇧🇷 https://t.co/oCDy6E0h0e,bolsonaro reeleito
4,Bolsonaro,2022-04-23,Veja o que o PRESIDENTE BOLSONARO ACABOU de RE...,veja presidente bolsonaro acabou realizar bras...
5,Bolsonaro,2022-04-23,A chefona da OMC..o mundo precisa do Brasil..o...,chefona omc o mundo precisa brasil o brasil ...
6,Bolsonaro,2022-04-23,@Aisha_com_vida números que eu estudei pra sab...,numeros estudei pra saber ne ja voce fica def...
7,Bolsonaro,2022-04-23,"@GlauciaNatali @ericlinsg Glaucia, eu tenho ce...",glaucia certeza militares lado bolsonaro ga...
8,Bolsonaro,2022-04-23,"@lovefefoneto Assim, é até que ""legal"" aq, mas...",assim legal aq triste msm morar msm pais b...
9,Bolsonaro,2022-04-23,@IsmaelxLucas Excelente! Tá bom demais. \nUm p...,excelente ta bom demais povinho elegeu bolso...


In [14]:
# Sanity check: one extra column (Cleaned_Text), same number of rows.
data.shape

(4000, 4)

In [15]:
%%time
# Label every cleaned tweet with the LeIA classifier.
# Missing cleaned texts are still scored, since leia() converts its input with str().
data['leia'] = data['Cleaned_Text'].apply(leia)

Wall time: 739 ms


In [16]:
%%time
# Label every cleaned tweet with the OpLexicon-based classifier.
data['oplexion'] = data['Cleaned_Text'].apply(oplexion)

Wall time: 18.4 s


In [17]:
%%time
# Label every cleaned tweet with the SentiLex-based classifier.
data['sentilex'] = data['Cleaned_Text'].apply(sentilex)

Wall time: 17.5 s


In [18]:
%%time
# Using the cleaned text makes the accuracy much lower,
# because the translation API cannot translate it properly into English,
# so the original tweet is passed instead.
data['textblob'] = data['Original_Tweet'].apply(textblob)

error to translate "@taoquei1 @Rodrigo15121143 Boa noite Rodrigo e Barbara (Te Atualizei), eu achei Máxima essa jogada do Bolsonaro e tenho certeza que irá entrar para a história... 👏👏👏👏👏👏👏"
error to translate "@o_antagonista DORIA PRESIDENTEEEEEEE"
Wall time: 20min 22s


In [19]:
# Rows for which textblob() produced no label (it returns None on failure).
data[data['textblob'].isnull()].head(10)

Unnamed: 0,Search,Tweet_Date,Original_Tweet,Cleaned_Text,leia,oplexion,sentilex,textblob
76,Bolsonaro,2022-04-23,@taoquei1 @Rodrigo15121143 Boa noite Rodrigo e...,boa noite rodrigo barbara te atualizei ache...,positivo,neutro,neutro,
2440,Doria,2022-04-23,@o_antagonista DORIA PRESIDENTEEEEEEE,doria presidenteeeeeee,neutro,neutro,neutro,


In [20]:
# Column overview; the non-null counts show how many rows lack a cleaned text
# or a textblob label.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Search          4000 non-null   object
 1   Tweet_Date      4000 non-null   object
 2   Original_Tweet  4000 non-null   object
 3   Cleaned_Text    3866 non-null   object
 4   leia            4000 non-null   object
 5   oplexion        4000 non-null   object
 6   sentilex        4000 non-null   object
 7   textblob        3998 non-null   object
dtypes: object(8)
memory usage: 250.1+ KB


In [21]:
# Sanity check: four label columns were added, row count unchanged.
data.shape

(4000, 8)

In [22]:
# Tweets that all four classifiers label as 'neutro', counted per date and
# search term. Requiring every label column to equal 'neutro' is the same as
# requiring the labels to agree with each other and be 'neutro'.
data[
    (data[['leia', 'oplexion', 'sentilex', 'textblob']] == 'neutro').all(axis=1)
].groupby(['Tweet_Date', 'Search']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Original_Tweet,Cleaned_Text,leia,oplexion,sentilex,textblob
Tweet_Date,Search,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-04-23,Bolsonaro,215,161,215,215,215,215
2022-04-23,Ciro Gomes,163,150,163,163,163,163
2022-04-23,Doria,246,207,246,246,246,246
2022-04-23,Lula,205,180,205,205,205,205


In [23]:
# Write one labelled CSV per (search term, tweet date) pair.
# - groupby visits each existing pair once instead of re-filtering the whole
#   frame for every combination (pairs with no rows are no longer written out
#   as header-only files);
# - os.path.join keeps the output path portable (the hard-coded '\\' only
#   works on Windows);
# - the f-string also works when the date column is not stored as str.
output_dir = 'clean_labeled_data'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders

for (search, date), group in data.groupby(['Search', 'Tweet_Date']):
    group.to_csv(os.path.join(output_dir, f'{search}_{date}_clean_labeled.csv'))