%%capture 
!pip install import-ipynb 
!pip install textblob 
!pip install spacy 
!pip install nltk 
!pip install -U sklearn 
!pip install googletrans==3.1.0a0

In [1]:
import pandas as pd
import numpy as np
import spacy
import os
import re
import nltk
import unicodedata
import glob

from leia import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from sklearn.metrics import accuracy_score
from googletrans import Translator
from nltk.corpus import stopwords

In [2]:
#Execute de line below to download de 'pt_core_news_sm'
!python -m spacy download pt_core_news_sm

translator = Translator(service_urls=['translate.googleapis.com'])
nltk.download('punkt')
nlp = spacy.load('pt_core_news_sm')
nltk.download('stopwords')
sw = set(stopwords.words('portuguese'))

Collecting pt-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.2.0/pt_core_news_sm-3.2.0-py3-none-any.whl (22.2 MB)
[+] Download and installation successful
You can now load the package via spacy.load('pt_core_news_sm')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rodox\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rodox\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Prepeocessing Functions

In [3]:
def remove_tt_username(text):
    text = str(text)
    no_tt_username = re.sub(r'\@\S+', '', text)
    return no_tt_username

def identify_emoticons(text):
    text = str(text)
    text = re.sub(r'\:\-?\)+', ' cara feliz ', text)
    text = re.sub(r'\:\-?[dDpP]+', ' cara feliz ', text)
    text = re.sub(r'\:\-?\'?\(+', ' cara triste ', text)
    text = re.sub(r'\>\:\-?\(+', ' cara brava ', text)
    return text

def remove_hashtags(text):
    text = str(text)
    no_hashtags = re.sub(r'\#\S+', '', text)
    return no_hashtags

def remove_phone(text):
    text = str(text)
    text = re.sub(r'(\(?(\d{2,3})\)?)?\ ?\d{4,5}\-?\ ?\d{4}', ' ', text)
    return text

def remove_url(text):
    text = str(text)
    text = re.sub(r'https?\:\/\/\S+', ' ', text)
    text = re.sub(r'www\.\S+', '', text)
    text = re.sub(r'[a-zA-Z|.]+\.com(\.br)?', ' link ', text)
    return text

def remove_date(text):
    text = str(text)
    text = re.sub(r'((\d{1,2}\/)(\d{1,2}\/?)(\d{2,4})?)', ' ', text)
    text = re.sub(r'((\d{1,2}\-)(\d{1,2}\-?)(\d{2,4})?)', ' ', text)
    text = re.sub(r'((\d+(\s+[deDE]+\s+)[aA-zZ|ç|Ç]+((\s+[deDE]+\s+)\d+)?))', ' ', text)
    return text

def remove_hour(text):
    text = str(text)
    text = re.sub(r'(\d+)\:(\d+)[hH]?(\:\d+)?[hH]?[rsRS]\w?', ' ', text)
    text = re.sub(r'(\d+)[hH](\d+)', ' < hora > ', text)
    return text

def remove_number(text): 
    text = str(text)
    text = re.sub(r'[0-9]', '', text)
    return text

def lowercase(text):
    text = str(text)
    text = text.lower()
    return text

def remove_oneword(text):
    text = str(text)
    if len(text.split()) > 1:        
        return text
    return

def remove_stopword(text):
    text = str(text) 
    text = [word for word in text.split() if word not in sw]
    text = ' '.join(text)
    return text

def remove_accent(text):
    text = str(text) 
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode("utf-8")
    return text

def remove_emoji(text):
    text = str(text)
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U000E007F"  
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF" 
                           u"\U0001F1E0-\U0001F1FF"  
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

def remove_punction(text): 
    text = str(text) 
    text = re.sub(r'[!"#$%&\'()*+,-.º<>/:;=?@[/\/\]^_`{|}~]', ' ', text)
    return text

def preprocessing(data):
    data = pd.Series(data)
    data = data.apply(remove_tt_username)
    data = data.apply(remove_hashtags)
    data = data.apply(identify_emoticons)
    data = data.apply(remove_url)
    data = data.apply(remove_phone)
    data = data.apply(remove_hour)
    data = data.apply(remove_date)
    data = data.apply(remove_number)
    data = data.apply(remove_emoji)
    data = data.apply(lowercase)
    data = data.apply(remove_stopword)
    data = data.apply(remove_accent)
    data = data.apply(remove_punction)
    data = data.apply(remove_oneword)
    return data

## Models

### LeIA

In [4]:
analyzer = SentimentIntensityAnalyzer()
def leia(text):
    text = str(text)
    result = analyzer.polarity_scores(text)
    
    #analisa a frase utilizando o compound
    if result['compound'] >= 0.05:
        return 'positivo'
    elif result['compound'] <= -0.05:
        return 'negativo'
    else:
        return 'neutro'

### TextBlob + ReLi

In [5]:
def textblob(sentence):
    sentence = str(sentence)
    blob = TextBlob(sentence)
    result = 0
    
    #translate the text to english
    translation = translator.translate(sentence, src='pt', dest='en')
    translation = translation.text
    translation = TextBlob(translation)
    result = translation.sentiment.polarity
        
    if result > 0:
        return 'positivo'
    elif result < 0:
        return 'negativo'
    else:
        return 'neutro'

### OpLexicon

In [6]:
with open('lexico_v3.0.txt', 'r') as f:
    lines = f.readlines()

lines = [str(x.strip()) for x in lines]
pol_dict = {}

for line in lines:
    word, _, pol, _ = line.split(',')
    
    if word not in pol_dict.keys():
        pol_dict[word] = pol

In [7]:
def oplexion(text):
    text = str(text)
    doc = nlp(text)
    pol = 0
    
    for token in doc:
        if token.text in pol_dict.keys():
            if token.pos_ == 'VERB':
                if token.lemma_ in pol_dict.keys():
                    pol += int(pol_dict[str(token.lemma_)])
                else:
                    pol += int(pol_dict[str(token.text)])
            else:
                pol += int(pol_dict[str(token.text)])
        else:
            pol += 0
        
    if pol > 0:
        return 'positivo'
    elif pol < 0:
        return 'negativo'
    else:
        return 'neutro'

### SentiLex

In [8]:
with open('SentiLex-lem-PT01.txt', 'r') as f:
    lines = f.readlines()

lines = [str(x.strip()) for x in lines]
pol_dict = {}

for line in lines:
    word, infos = line.split('.')
    pol = infos.split(';')
    pol = pol[3]
    pol = pol[4:]
    
    if word not in pol_dict.keys():
        pol_dict[word] = pol

In [9]:
def sentilex(text):
    text = str(text)
    doc = nlp(text)
    pol = 0
    
    for token in doc:
        try:
            if token.pos_ == 'VERB':
                pol += int(pol_dict[str(token.lemma_)])
            else:
                pol += int(pol_dict[str(token.text)])
        except KeyError:
            pol += 0
        
    if pol > 0:
        return 'positivo'
    elif pol < 0:
        return 'negativo'
    else:
        return 'neutro'

In [10]:
#data = pd.concat(map(pd.read_csv, glob.glob('raw_data\\*.csv')))[['Search', 'Tweet_Date', 'Original_Tweet']].reset_index(drop=True)
data.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu já dei na época :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu só queria conseguir comer alguma coisa pra ...,Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [11]:
data.shape

(785814, 5)

In [12]:
data = data.drop(columns=['id', 'tweet_date', 'query_used'])
data = data.rename({'tweet_text': 'text', 'sentiment': 'target'}, axis=1)
data['target'] = data['target'].map(lambda x: x.lower())
data.head()

Unnamed: 0,text,target
0,@Tixaa23 14 para eu ir :),positivo
1,@drexalvarez O meu like eu já dei na época :),positivo
2,Eu só queria conseguir comer alguma coisa pra ...,positivo
3,:D que lindo dia !,positivo
4,"@Primo_Resmungao Pq da pr jeito!!é uma ""oferta...",positivo


In [13]:
%%time
data = data.sample(500)
data = data.reset_index()
data = data.drop(columns=['index'])
data['cleaned_text'] = preprocessing(data['text'])
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data.head(10)

Wall time: 51.7 ms


Unnamed: 0,text,target,cleaned_text
0,ai quando a gente se conhecer eu vou ficar tão...,negativo,ai gente conhecer vou ficar tao feliz aaaaaa v...
1,@_iamSHUKRI Esp at 4:30 am :))),positivo,esp at am cara feliz
2,@eujulianaramosv Muito triste cara :(,negativo,triste cara cara triste
3,Poxa não tem uma negra :( https://t.co/S8U4DGyACx,negativo,poxa negra cara triste
4,@passarindemarte olha a carinha de assustada d...,negativo,olha carinha assustada ninha cara triste
5,@ddianatasa Espensif :(,negativo,espensif cara triste
6,só falta esse set pra eu ter todos os sets do ...,negativo,falta set pra ter todos sets north cara triste
7,@ischargro De esta forma salimos adelante como...,negativo,forma salimos adelante pais cara triste
8,eu falo que vou dormir cedo e meu projeto vai ...,negativo,falo vou dormir cedo projeto vai agua abaixo c...
9,@gabcrj Nao tem mais e nem vai ter :(. Se eu e...,negativo,nao vai ter cara triste encontrar alguma ant...


In [14]:
data.shape

(500, 3)

In [15]:
%%time
data['leia'] = data['cleaned_text'].apply(leia)

Wall time: 76.8 ms


In [24]:
%%time
#se usar o texto limpo a acurácia fica muito baixa
#isso acontece por causa da api não conseguir direito para inglês
data['textblob'] = data['text'].apply(textblob)

Wall time: 2min 38s


In [17]:
%%time
data['oplexion'] = data['cleaned_text'].apply(oplexion)

Wall time: 2.04 s


In [18]:
%%time
data['sentilex'] = data['cleaned_text'].apply(sentilex)

Wall time: 1.97 s


In [25]:
print("LeIA: " + str(accuracy_score(data['target'], data['leia'])))
print("TextBlob: " + str(accuracy_score(data['target'], data['textblob'])))
print("OpLexion: " + str(accuracy_score(data['target'], data['oplexion'])))
print("SentiLex: " + str(accuracy_score(data['target'], data['sentilex'])))

LeIA: 0.934
TextBlob: 0.924
OpLexion: 0.888
SentiLex: 0.904


In [None]:
data.head(10)

In [None]:
data['leia'].value_counts()