# <p style="background-color:#66ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Sentiment Analysis Portuguese</p>

In [None]:
!pip install swifter -q

In [None]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
sns.set(rc={'figure.figsize':(10,6)})
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

import re
import string
import swifter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter
from functools import partial

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Embedding, BatchNormalization, Dense, TimeDistributed, Dropout, Bidirectional, Flatten, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam


import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw labelled tweets from the Kaggle input directory.
DATA_PATH = '../input/portuguese-sentiment-analysis/tw_pt.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the frame as loaded.
data.head()

In [None]:
# Keep only the tweet text and its sentiment label, the two columns every later
# cell refers to by name.  Selecting by name instead of the positional slice
# iloc[:, 1:3] keeps this cell idempotent: re-running it no longer drops a
# further column.  copy() detaches the result from the loaded frame.
data = data[['Text', 'Classificacao']].copy()
data.head()

In [None]:
# Column dtypes and non-null counts of the selected columns.
data.info()

# <p style="background-color:#66ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Preprocessing</p>

In [None]:
# Bar chart with the number of tweets in each sentiment class.
title_cfg = {'text': "Percentage of Type", 'y': 0.9, 'x': 0.45,
             'xanchor': 'center', 'yanchor': 'top'}
font_cfg = dict(size=18, color='white', family="Courier New, monospace")
layout = go.Layout(
    template="plotly_dark",
    title=title_cfg,
    font=font_cfg,
    xaxis=dict(title='Type'),
    yaxis=dict(title='Count'),
)

fig = px.histogram(data, x='Classificacao')
fig.update_layout(layout)

fig.show()

In [None]:
# Number of rows per sentiment class, most frequent first.
data['Classificacao'].value_counts()

In [None]:
# Pie chart with the share of each sentiment class.
# value_counts() orders classes by frequency, so the labels are taken from its
# index: every slice is then paired with its own count, whereas a hard-coded
# label list mislabels slices whenever the frequency order differs from it.
class_counts = data['Classificacao'].value_counts()
labels = class_counts.index.tolist()
values = class_counts.values
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_layout(template="plotly_dark",title={'text': "Percentage of Type",'y':0.9,
                                                'x':0.45,'xanchor': 'center','yanchor': 'top'},
                  font=dict(size=18, color='white', family="Courier New, monospace"))
fig.show()

In [None]:
def limpa_texto(data):
    """Clean a Series of raw tweets before tokenisation.

    Each element is converted with ``str`` and then processed in this order:

    1. URLs (``http...``) are removed.
    2. ``@mentions`` and ``#hashtags`` are removed. This must happen before the
       character filter, otherwise the ``@``/``#`` markers are gone and the
       handle/hashtag words stay in the text.
    3. The standalone retweet marker ``rt`` (any case) is removed. Word
       boundaries keep words that merely contain "rt" (e.g. "parte") intact.
    4. Every whitespace character (newline, tab, ...) becomes a plain space so
       that words separated by a line break are not glued together.
    5. Every character that is not an ASCII letter or digit, a space, or one of
       the accented Portuguese letters listed in the pattern is dropped.
    6. Runs of spaces are collapsed into a single space.

    Parameters
    ----------
    data : pandas.Series
        Raw tweet texts.

    Returns
    -------
    pandas.Series
        Cleaned texts with the same index as ``data``. Leading and trailing
        spaces are intentionally not stripped: a tweet reduced to whitespace
        comes back as ``' '``, the value the blank-row filter in this notebook
        compares against.
    """
    url_re = re.compile(r'http\S+')
    handle_re = re.compile(r'[@#]\w+')
    retweet_re = re.compile(r'\brt\b', flags=re.IGNORECASE)
    whitespace_re = re.compile(r'\s')
    not_allowed_re = re.compile(r'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕàÀçÇ ]')
    spaces_re = re.compile(r' +')

    def _limpa(valor):
        # Apply the cleaning steps to a single tweet.
        texto = url_re.sub('', str(valor))
        texto = handle_re.sub('', texto)
        texto = retweet_re.sub('', texto)
        texto = whitespace_re.sub(' ', texto)
        texto = not_allowed_re.sub('', texto)
        return spaces_re.sub(' ', texto)

    return data.map(_limpa)

In [None]:
# Replace the raw tweets with their cleaned version and preview the result.
texto_limpo = limpa_texto(data['Text'])
data['Text'] = texto_limpo
data.head()

<div style="color:black; background-color:#f5f7b0; border-radius:10px; padding:20px;">
After cleaning, the dataset contains 4 rows whose text is blank.
</div>

In [None]:
# Drop the rows whose text became blank after cleaning.
# Comparing against the single string ' ' misses empty strings and longer
# whitespace-only strings, so the stripped text is tested instead.
print('Before:',len(data))
has_text = data['Text'].str.strip() != ''
data = data.loc[has_text].copy()  # copy: later cells add a column to this frame
print('After:', len(data))

In [None]:
# Remove exact duplicate rows, then summarise the remaining frame.
data = data.drop_duplicates()
data.info()

In [None]:
# Pie chart with the share of each sentiment class after de-duplication.
# Labels are read from the value_counts() index because removing duplicates can
# change the frequency order of the classes; a fixed label list would then be
# attached to the wrong slices.
class_counts = data['Classificacao'].value_counts()
labels = class_counts.index.tolist()
values = class_counts.values
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_layout(template="plotly_dark",title={'text': "Percentage of Type",'y':0.9,
                                                'x':0.45,'xanchor': 'center','yanchor': 'top'},
                  font=dict(size=18, color='white', family="Courier New, monospace"))
fig.show()

<div style="color:black; background-color:#f5f7b0; border-radius:10px; padding:20px;">
As we can see, the dataset has many duplicate rows: it starts with 8199 rows, and after removing the duplicates 3626 rows remain. In addition, the dataset is unbalanced.
</div>

In [None]:
# Number of words per tweet.
# split() without an argument ignores leading, trailing and repeated spaces,
# whereas split(' ') counts the empty strings they produce as extra "words".
data['message_len'] = data['Text'].str.split().str.len()
data.head()

In [None]:
# Distribution of the per-tweet word counts.
fig = px.histogram(data, x='message_len')
fig.update_layout(
    template="plotly_dark",
    title=dict(text="Phrase Length", y=0.9, x=0.45, xanchor='center', yanchor='top'),
    font={'size': 18, 'color': 'white', 'family': "Courier New, monospace"},
)
fig.show()

# <p style="background-color:#66ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">WordCloud</p>

In [None]:
# Word cloud of the tweets labelled 'Positivo', drawn inside the mask image.
# The mask is passed to WordCloud exactly as loaded.  The helper that remapped
# pixel values operated on an uninitialised np.ndarray whose result was never
# given to WordCloud, so it only added run time and has been dropped.
tw = np.array(Image.open('../input/mascaras/tw_img.png'))

positive_text = ' '.join(data.loc[data['Classificacao'] == 'Positivo', 'Text'])

wc = WordCloud(background_color="white", mask=tw, contour_width=3, contour_color='#1DA1F2')
wc.generate(positive_text)
# Class-specific file name: a shared name would be overwritten by the other
# word-cloud cells.
wc.to_file("wordcloud_positivo.png")
plt.figure(figsize=[20,10])
plt.title('Top words Positive', fontdict={'size': 22,  'verticalalignment': 'bottom', 'color':'#1DA1F2'})
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



In [None]:
# Word cloud of the tweets labelled 'Negativo', drawn inside the mask image.
# The mask is passed to WordCloud exactly as loaded.  The helper that remapped
# pixel values operated on an uninitialised np.ndarray whose result was never
# given to WordCloud, so it only added run time and has been dropped.
tw = np.array(Image.open('../input/mascaras/tw_img.png'))

negative_text = ' '.join(data.loc[data['Classificacao'] == 'Negativo', 'Text'])

wc = WordCloud(background_color="white", mask=tw, contour_width=3, contour_color='#1DA1F2')
wc.generate(negative_text)
# Class-specific file name: a shared name would be overwritten by the other
# word-cloud cells.
wc.to_file("wordcloud_negativo.png")
plt.figure(figsize=[20,10])
plt.title('Top words Negative', fontdict={'size': 22,  'verticalalignment': 'bottom', 'color':'#1DA1F2'})
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



In [None]:
# Word cloud of the tweets labelled 'Neutro', drawn inside the mask image.
# The mask is passed to WordCloud exactly as loaded.  The helper that remapped
# pixel values operated on an uninitialised np.ndarray whose result was never
# given to WordCloud, so it only added run time and has been dropped.
tw = np.array(Image.open('../input/mascaras/tw_img.png'))

neutral_text = ' '.join(data.loc[data['Classificacao'] == 'Neutro', 'Text'])

wc = WordCloud(background_color="white", max_words=1000, mask=tw, contour_width=3, contour_color='#1DA1F2')
wc.generate(neutral_text)
# Class-specific file name: a shared name would be overwritten by the other
# word-cloud cells.
wc.to_file("wordcloud_neutro.png")
plt.figure(figsize=[20,10])
plt.title('Top words Neutral', fontdict={'size': 22,  'verticalalignment': 'bottom', 'color':'#1DA1F2'})
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



# <p style="background-color:#66ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">LSTM</p>

# <p style="background-color:#e4e7eb; font-family:newtimeroman; font-size:120%; text-align:center; color:#0091ff; border-radius: 20px 20px; padding-top:8px; padding-bottom:8px;">Token</p>

In [None]:
# Map each tweet to a sequence of integer word ids.
# num_words bounds the ids emitted by texts_to_sequences (ids < num_words), so
# it must not exceed the Embedding input size used in the Model section
# (max_palavra = 100).  The former value of 32 kept only the 31 most frequent
# words and left most of the embedding table unused.
VOCAB_SIZE = 100
tk = Tokenizer(num_words=VOCAB_SIZE)
tk.fit_on_texts(data['Text'])
x = tk.texts_to_sequences(data['Text'])

# <p style="background-color:#e4e7eb; font-family:newtimeroman; font-size:120%; text-align:center; color:#0091ff; border-radius: 20px 20px; padding-top:8px; padding-bottom:8px;">Padding</p>

In [None]:
# Right-pad every id sequence with zeros up to the length of the longest one.
x = pad_sequences(sequences=x, padding='post')


# <p style="background-color:#e4e7eb; font-family:newtimeroman; font-size:120%; text-align:center; color:#0091ff; border-radius: 20px 20px; padding-top:8px; padding-bottom:8px;">Encoder</p>

In [None]:
# Encode the string class labels as consecutive integers (fit and transform in one step).
le = LabelEncoder()
y = le.fit_transform(data['Classificacao'])


# <p style="background-color:#e4e7eb; font-family:newtimeroman; font-size:120%; text-align:center; color:#0091ff; border-radius: 20px 20px; padding-top:8px; padding-bottom:8px;">Train and Test</p>

In [None]:
# Stratified hold-out split (scikit-learn's default test share applies).
# A fixed random_state makes the split, and therefore every metric reported
# below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)

# <p style="background-color:#e4e7eb; font-family:newtimeroman; font-size:120%; text-align:center; color:#0091ff; border-radius: 20px 20px; padding-top:8px; padding-bottom:8px;">Model</p>

In [None]:
# Embedding hyper-parameters: number of rows in the embedding table and vector size.
max_palavra, embedding_dim = 100, 30

In [None]:
# Bidirectional-LSTM classifier over the padded word-id sequences.

# One output unit per sentiment class found by the label encoder.
n_classes = len(le.classes_)

# `learning_rate` replaces the deprecated `lr` alias.  `decay=0.0` was a no-op
# and is rejected by recent Keras optimizers, so it is omitted.
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model = Sequential()
model.add(Embedding(max_palavra, embedding_dim, input_length=len(x[0])))
model.add(Bidirectional(LSTM(32, return_sequences=True, recurrent_dropout=0.2)))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.25))
# Softmax over a single unit always outputs 1.0, which makes the loss constant
# and the model unable to learn; the output layer needs one unit per class.
model.add(Dense(n_classes, activation='softmax'))
# The targets are integer class ids (LabelEncoder output), not one-hot vectors,
# so the sparse variant of categorical cross-entropy is required.
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# <p style="background-color:#e4e7eb; font-family:newtimeroman; font-size:120%; text-align:center; color:#0091ff; border-radius: 20px 20px; padding-top:8px; padding-bottom:8px;">Callback</p>

In [None]:
# Training callbacks: keep the best model, lower the learning rate on plateaus,
# and stop early when the validation loss no longer improves.

# `monitor` was misspelled as `monito`, so the argument never reached the callback.
# NOTE(review): the 'model.hr' path is kept unchanged; it looks like a typo for
# 'model.h5' -- confirm before relying on the saved file format.
checkpoint = ModelCheckpoint('model.hr', monitor='val_loss', verbose=1, save_best_only=True)
# min_lr has to be below the optimizer's initial learning rate (0.001 in the
# Model section); when it equals that rate the learning rate can never be reduced.
reduce = ReduceLROnPlateau(monitor='val_loss', factor=.5, verbose=1, patience=5, min_lr=1e-5)
stoped = EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)

In [None]:
# Train the model; the held-out split is used as validation data for the callbacks.
fit_options = dict(
    epochs=30,
    batch_size=32,
    validation_data=(x_test, y_test),
    verbose=1,
    callbacks=[checkpoint, reduce, stoped],
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Training curves: accuracy on the left panel, loss on the right panel,
# each showing the training series followed by the validation series.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
panels = [
    ('accuracy', 'val_accuracy', 'Acurácia', ['Acurácia em Treino','Acurácia em Teste']),
    ('loss', 'val_loss', 'Erro', ['Erro em Treino','Erro em Teste']),
]
for ax, (train_key, val_key, y_label, legend_labels) in zip(axes, panels):
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend(legend_labels)
    ax.grid(True)

<div style="color:black; background-color:#f5f7b0; border-radius:10px; padding:20px;">
<b>Conclusion</b><br/>
The model can't learn to predict the classes.<br/>
</div>

# <p style="background-color:#66ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Continuing with BERT</p>