In [None]:
!pip install swifter

In [None]:
import re
import string
import swifter
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
sns.set(rc={'figure.figsize':(10,6)})
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

from tqdm import tqdm
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM, Embedding, BatchNormalization, Dense, TimeDistributed, 
                          Dropout, Bidirectional, Flatten,  GlobalMaxPool1D)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

from sklearn.metrics import accuracy_score, plot_confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training set. The relative path presumably matches the Kaggle layout
# where the "nlp-getting-started" dataset is mounted under ../input — TODO confirm
# when running outside Kaggle.
train = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
train.head()

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">EDA</p>

In [None]:
# Print the column summary of the training frame (dtypes and non-null counts).
train.info()

In [None]:
# Discard the columns that the rest of the notebook never reads.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the columns were already removed by a previous execution.
train = train.drop(columns=['id','keyword','location'], errors='ignore')

In [None]:
# Number of space-separated pieces in each tweet (split on a single ' ').
train['text_len'] = train['text'].map(lambda tweet: len(tweet.split(' ')))

In [None]:
# Class balance of the target column.
# value_counts() orders by frequency, so hard-coded labels would only be right
# while class 0 happens to be the majority. Sorting by class label and deriving
# each slice label from the index keeps labels and counts paired correctly.
class_counts = train['target'].value_counts().sort_index()
values = class_counts.values
fig = go.Figure(data=[go.Pie(labels=[f'Count {cls}' for cls in class_counts.index],
                             values=values)])
fig.update_layout(template="plotly_dark",title={'text': "Count of Type",'y':0.9,
                                                'x':0.45,'xanchor': 'center','yanchor': 'top'},
                  font=dict(size=18, color='white', family="Courier New, monospace"))
fig.show()

In [None]:
# Distribution of tweet lengths (in space-separated pieces) before filtering.
fig = px.histogram(train, x='text_len').update_layout(
    template="plotly_dark",
    title=dict(text="Phrase Length", y=0.9, x=0.45, xanchor='center', yanchor='top'),
    font=dict(size=18, color='white', family="Courier New, monospace"),
)
fig.show()

In [None]:
# Keep only tweets with at most MAX_TEXT_LEN space-separated pieces
# (the cut-off was presumably read off the histogram above — TODO confirm).
# .copy() makes the result an independent frame: later cells assign into
# train['text'], which on a boolean-mask slice is a chained assignment that
# pandas warns about and does not guarantee to behave consistently.
MAX_TEXT_LEN = 31
train = train.loc[train['text_len'] <= MAX_TEXT_LEN].copy()

In [None]:
# Same length histogram, redrawn after the length filter was applied.
fig = px.histogram(train, x='text_len').update_layout(
    template="plotly_dark",
    title=dict(text="Phrase Length", y=0.9, x=0.45, xanchor='center', yanchor='top'),
    font=dict(size=18, color='white', family="Courier New, monospace"),
)
fig.show()

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Processing</p>

In [None]:
def text_clear(data):
    """Normalise a Series of raw tweets to lowercase alphanumeric text.

    Steps, in order:
      1. missing values become '' and everything is lower-cased;
      2. URLs (``http...``) are removed;
      3. hashtags and @mentions are removed *while the '#'/'@' markers still
         exist* (stripping punctuation first would turn these steps into no-ops);
      4. the stand-alone retweet marker ``rt`` is removed, using word boundaries
         so that words merely containing "rt" (e.g. "earthquake", "report")
         are left intact;
      5. newlines/tabs are turned into spaces so adjacent words are not glued
         together when non-alphanumeric characters are dropped;
      6. every character that is not a-z, 0-9, a listed accented letter or a
         space is dropped (this covers all punctuation);
      7. runs of spaces are collapsed and the ends are trimmed.

    Parameters
    ----------
    data : pandas.Series
        Raw tweet texts.

    Returns
    -------
    pandas.Series
        Cleaned texts, with the same index as ``data``.
    """
    steps = (
        (r'http\S+', ''),                    # URLs
        (r'[#@]\w+', ''),                    # hashtags and @user mentions
        (r'\brt\b', ''),                     # stand-alone "rt" only
        (r'\s+', ' '),                       # any whitespace run -> one space
        (r'[^a-z0-9áéíóúâêîôãõç ]', ''),     # drop punctuation / other symbols
        (r' +', ' '),                        # collapse gaps left by the removals
    )
    tx = data.fillna('').astype(str).str.lower()
    for pattern, replacement in steps:
        tx = tx.str.replace(pattern, replacement, regex=True)
    return tx.str.strip()

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords');
# Do not rebind the imported `stopwords` corpus reader: overwriting it with the
# word list makes this cell fail on re-run ('list' object has no attribute 'words').
english_stopwords = stopwords.words('english')
# Extra informal tokens to treat as stop words.
more_stopwords = ['u', 'im', 'c']
stop_words = english_stopwords + more_stopwords

def remove_stopwords(text):
    """Return `text` with every space-separated token found in `stop_words` removed.

    Tokens are obtained by splitting on a single space and the survivors are
    re-joined with single spaces.
    """
    kept = [token for token in text.split(' ') if token not in stop_words]
    return ' '.join(kept)


# Shared English Snowball stemmer used by stemm_text.
stemmer = nltk.SnowballStemmer("english")
def stemm_text(text):
    """Stem every space-separated token of `text` and re-join them with spaces."""
    stems = [stemmer.stem(token) for token in text.split(' ')]
    return ' '.join(stems)

In [None]:
# Clean -> drop stop words -> stem, written as one chained pipeline.
train['text'] = (
    text_clear(train['text'])
    .apply(remove_stopwords)
    .apply(stemm_text)
)

train.head()

In [None]:
# Inputs (cleaned tweets) and labels used for the rest of the notebook.
text = train['text']
target = train['target']

# Build the word -> integer-index vocabulary from the cleaned tweets.
# NOTE(review): the tokenizer is fitted on all rows, i.e. before the
# train/test split performed further down — confirm this is intended.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(text)

# Number of distinct words plus one; presumably the extra slot accounts for
# index 0, which is used as the padding value — TODO confirm.
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

In [None]:
# "max_palavra" (Portuguese for "max word"): passed as the first argument of
# the Embedding layer in modelo(), i.e. the vocabulary size.
max_palavra = vocab_length 
# Used in modelo() as the embedding size and also as the LSTM / Dense unit count.
embedding_dim = 32

In [None]:
# Turn every cleaned tweet into its sequence of integer word indices.
x = word_tokenizer.texts_to_sequences(text)
# Pad at the end of each sequence (padding='post'); no maxlen is given, so the
# common length is presumably that of the longest sequence — TODO confirm.
x = pad_sequences(x, padding='post')

In [None]:
# Stratified hold-out split. A fixed random_state makes the split (and every
# metric computed on it below) reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, target, stratify=target,
                                                    random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

# <p style="background-color:#80ccff; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  80px 5px; padding-top:8px; padding-bottom:8px;">Model</p>

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier.
nb = MultinomialNB()

# NOTE(review): x_train / x_test are the padded word-index sequences built
# above, so each feature value is a vocabulary index at a given position, not a
# count. MultinomialNB is documented for count-like features — consider feeding
# it bag-of-words counts instead; verify before trusting this score.
nb.fit(x_train, y_train)
pred = nb.predict(x_test)
acc = accuracy_score(y_test, pred)
print('Acuracy:', acc*100)
# NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
# releases — confirm the installed version provides it.
plot_confusion_matrix(nb, x_test, y_test);
plt.grid(False)
plt.title('Result');

## Keras

In [None]:
def modelo():
    """Build and compile the bidirectional-LSTM binary classifier.

    Reads the notebook-level names `max_palavra` (embedding vocabulary size),
    `embedding_dim` (embedding size, LSTM units and Dense width) and `x`
    (padded sequences; the length of its first row is the input length).

    Returns
    -------
    A compiled `Sequential` model with a single sigmoid output, trained with
    binary cross-entropy and the 'rmsprop' optimizer, tracking accuracy.
    """
    stack = [
        # Token indices -> dense vectors.
        Embedding(max_palavra, embedding_dim, input_length=len(x[0])),
        # Per-timestep outputs are kept so they can be max-pooled over time.
        Bidirectional(LSTM(embedding_dim, return_sequences=True, recurrent_dropout=0.2)),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        # Two identical Dense + Dropout stages.
        Dense(embedding_dim, activation = "relu"),
        Dropout(0.5),
        Dense(embedding_dim, activation = "relu"),
        Dropout(0.5),
        # Single probability output.
        Dense(1, activation = 'sigmoid'),
    ]
    network = Sequential(stack)
    network.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [None]:
# Instantiate the compiled network and print its layer summary.
model = modelo()
model.summary()

In [None]:
# Save the weights of the epoch with the lowest validation loss to model.h5.
checkpoint = ModelCheckpoint('model.h5', monitor = 'val_loss', verbose=1, save_best_only = True)
# Multiply the learning rate by 0.2 after 3 epochs without val_loss improvement.
# The model is compiled with optimizer='rmsprop', whose default learning rate
# is 0.001; a floor of min_lr=0.001 therefore prevented any reduction at all.
# The floor is lowered so the schedule can actually take effect.
reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.2, verbose=1, patience=3, min_lr=1e-5)
# Stop training after 5 epochs without a val_loss improvement of at least 1e-4.
stoped = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.0001)

In [None]:
# Train for up to 30 epochs with the callbacks defined above.
# NOTE(review): the hold-out split (x_test, y_test) is passed as
# validation_data, so the val_loss-driven callbacks (checkpoint, LR reduction,
# early stopping) are tuned on the same data that is reported as "test".
history = model.fit(x_train, y_train, epochs=30, batch_size=32, validation_data=(x_test, y_test),
                    verbose=1, callbacks=[reduce, checkpoint, stoped])

In [None]:
# Accuracy per epoch: training curve vs. validation curve.
plt.plot(history.history['accuracy'], label='Acurácia em Treino')
plt.plot(history.history['val_accuracy'], label='Acurácia em Teste')
plt.xlabel('Epochs')
plt.ylabel('Acurácia')
plt.legend()
plt.show()

In [None]:
# Loss per epoch: training curve vs. validation curve.
plt.plot(history.history['loss'], label='Erro em Treino')
plt.plot(history.history['val_loss'], label='Erro em Teste')
plt.xlabel('Epochs')
plt.ylabel('Erro')
plt.legend()
plt.show()