In [1]:
%%capture
!pip install pandas
!pip install tensorflow
!pip install -U scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install numpy
!pip install nltk
!pip install unidecode

from google.colab import files
uploaded = files.upload()

# Bibliotecas

In [2]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import re
import unidecode
import warnings
import seaborn as sns
import time
import pickle
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

sw = set(stopwords.words('english'))
es = EarlyStopping(patience = 3)
np.random.seed(23)
sns.set_style('dark')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing

In [3]:
with open(r"glove_300d.pickle", "rb") as input_file:
  glove = pickle.load(input_file)

In [4]:
with open(r"glove_300d_preprocessing.pickle", "rb") as input_file:
  glove_preprocessing = pickle.load(input_file)

In [5]:
def remove_username(text):
  text = re.sub(r'\@[^\s]+', ' ', text)
  return text

def remove_newline(text):
  text = text.replace('\n', ' ')
  return text

def only_letters(text):
  text = re.sub(r'[^A-Za-z]+', ' ', text)
  return text

def remove_link(text):
  text = re.sub(r'www\.?[^\s]+', ' ', text)
  return text

def remove_hyperlink(text):
  text = re.sub(r'\<.?\>', ' ', text)
  return text

def remove_accent(text):
  text = unidecode.unidecode(text)
  return text

def adjustment_text(text):
  text = re.sub(r'\s+', ' ', text)
  text = text.strip()
  return text

def remove_stopwords(text):
  text = [word for word in text.split() if word not in sw]
  text = ' '.join(text)
  return text

def remove_spam(text):
  text = re.sub(r'\&amp', ' ', text)
  text = re.sub(r'\&lt', ' ', text)
  text = re.sub(r'\&gt', ' ', text)
  text = re.sub(r'\#follow|\#followme|\#like|\#f4f|\#photooftheday', ' ', text)
  return text

def remove_slangs(text):
  text = re.sub(r' b4 ', ' before ', text)
  text = re.sub(r' 2b ', ' to be ', text)
  text = re.sub(r' 2morrow ', ' tomorrow ', text)
  text = re.sub(r' rn ', ' right now ', text)
  text = re.sub(r' brb ', ' be right back ', text)
  text = re.sub(r' mb ', ' my bad ', text)
  text = re.sub(r' luv ', ' love ', text)
  text = re.sub(r' b ', ' be ', text)
  text = re.sub(r' r ', ' are ', text)
  text = re.sub(r' u ', ' you ', text)
  text = re.sub(r' y ', ' why ', text)
  text = re.sub(r' ur ', ' your ', text)
  text = re.sub(r' hbd ', ' happy birthday ', text)
  text = re.sub(r' bday ', ' birthday ', text)
  text = re.sub(r' bihday ', ' birthday ', text)
  text = re.sub(r' omg ', ' oh my god ', text)
  text = re.sub(r' lol ', ' laughing out loud ', text)
  return text

def remove_abbreviations(text):
  text = re.sub(r" can't ", " can not ", text)
  text = re.sub(r" i'm ", " i am ", text)
  text = re.sub(r" i'll ", " i will ", text)
  text = re.sub(r" i'd ", " i would ", text)
  text = re.sub(r" i've ", " i have ", text)
  text = re.sub(r" ain't ", " am not ", text)
  text = re.sub(r" haven't ", " have not ", text)
  text = re.sub(r" hasn't ", " has not ", text)
  text = re.sub(r" can't ", " can not ", text)
  text = re.sub(r" won't ", " will not ", text)
  text = re.sub(r" you're ", " you are ", text)
  text = re.sub(r" we're ", " we are ", text)
  text = re.sub(r" they're ", " they are ", text)
  text = re.sub(r" he's ", " he is ", text)
  text = re.sub(r" she's ", " she is ", text)
  text = re.sub(r" it's ", " it is ", text)
  text = re.sub(r" don't ", " do not ", text)
  text = re.sub(r" doesn't ", " does not ", text)
  text = re.sub(r" wouldn't ", " would not ", text)
  text = re.sub(r" couldn't ", " could not ", text)
  text = re.sub(r" shouldn't ", " should not ", text)
  text = re.sub(r" no-one ", " noone ", text)
  return text

def remove_one_len_word(text):
  text = re.sub(r'\b[a-z]\b', ' ', text)
  return text

def preprocessing(data):
  data['cleaned_tweet'] = data['tweet'].apply(str)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(lambda x: x.lower())
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_newline)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_hyperlink)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_spam)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_link)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_username)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_accent)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_abbreviations)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(only_letters)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_slangs)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_stopwords)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(remove_one_len_word)
  data['cleaned_tweet'] = data['cleaned_tweet'].apply(adjustment_text)
  return data

# Modelos

## Modelo 1

In [6]:
def modelo1(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocessing):
  model1 = Sequential()

  if embedding == 'glove':

    if preprocessing:
      model1.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove_preprocessing),
                      trainable = False))
    else:
      model1.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove),
                      trainable = False))
  else: 
    model1.add(Embedding(input_dim = vocab_size,
                        output_dim = embedding_dim,
                        input_length = length_size))
    
  model1.add(Conv1D(filters = 32,
                   kernel_size = 3,
                   padding = 'same',
                   activation = 'relu'))
  model1.add(MaxPooling1D())
  model1.add(Flatten())
  model1.add(Dropout(rate = 0.2))
  model1.add(Dense(units = 1,
                  activation = 'sigmoid'))
  
  #model.summary()
  model1.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  
  history_model1 = model1.fit(x = X_train,
                            y = y_train,
                            validation_data = (X_validation, y_validation),
                            batch_size = 100,
                            epochs = 20,
                            callbacks = [es],
                            verbose = 0)
  
  predicted = (model1.predict(X_validation) > 0.5).astype("int32")
  score1 = f1_score(predicted, y_validation, average='weighted')
  return model1, score1

## Modelo 2

In [7]:
def modelo2(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess):
  model2 = Sequential()

  if embedding == 'glove':

    if preprocessing:
      model2.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove_preprocessing),
                      trainable = False))
    else:
      model2.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove),
                      trainable = False))
  else: 
    model2.add(Embedding(input_dim = vocab_size,
                        output_dim = embedding_dim,
                        input_length = length_size))
    
  model2.add(Conv1D(filters = 32,
                   kernel_size = 3,
                   padding = 'same',
                   activation = 'relu'))
  model2.add(MaxPooling1D())
  model2.add(Conv1D(filters = 64,
                   kernel_size = 5,
                   padding = 'same',
                   activation = 'relu'))
  model2.add(MaxPooling1D())
  model2.add(Flatten())
  model2.add(Dropout(rate = 0.2))
  model2.add(Dense(units = 1,
                  activation = 'sigmoid'))
  
  #model.summary()
  model2.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  
  history_model2 = model2.fit(x = X_train,
                            y = y_train,
                            validation_data = (X_validation, y_validation),
                            batch_size = 100,
                            epochs = 20,
                            callbacks = [es],
                            verbose = 0)
  
  predicted = (model2.predict(X_validation) > 0.5).astype("int32")
  score2 = f1_score(predicted, y_validation, average='weighted')
  return model2, score2

## Modelo 3

In [8]:
def modelo3(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess):
  model3 = Sequential()
  
  if embedding == 'glove':

    if preprocessing:
      model3.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove_preprocessing),
                      trainable=False))
    else:
      model3.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove),
                      trainable=False))
  else: 
    model3.add(Embedding(input_dim = vocab_size,
                        output_dim = embedding_dim,
                        input_length = length_size))
    
  model3.add(Conv1D(filters = 32,
                   kernel_size = 3,
                   padding = 'same',
                   activation = 'relu'))
  model3.add(MaxPooling1D())
  model3.add(Conv1D(filters = 64,
                   kernel_size = 5,
                   padding = 'same',
                   activation = 'relu'))
  model3.add(MaxPooling1D())
  model3.add(Conv1D(filters = 128,
                   kernel_size = 7,
                   padding = 'same',
                   activation = 'relu'))
  model3.add(MaxPooling1D())
  model3.add(Flatten())
  model3.add(Dropout(rate = 0.2))
  model3.add(Dense(units = 1,
                  activation = 'sigmoid'))
  
  #model.summary()
  model3.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  
  history_model3 = model3.fit(x = X_train,
                            y = y_train,
                            validation_data = (X_validation, y_validation),
                            batch_size = 100,
                            epochs = 20,
                            callbacks = [es],
                            verbose = 0)
  
  predicted = (model3.predict(X_validation) > 0.5).astype("int32")
  score3 = f1_score(predicted, y_validation, average='weighted')
  return model3, score3

## Modelo 4

In [9]:
def modelo4(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess):
  model4 = Sequential()
  
  if embedding == 'glove':

    if preprocessing:
      model4.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove_preprocessing),
                      trainable=False))
    else:
      model4.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove),
                      trainable=False))
  else: 
    model4.add(Embedding(input_dim = vocab_size,
                        output_dim = embedding_dim,
                        input_length = length_size))
    
  model4.add(Conv1D(filters = 32,
                   kernel_size = 3,
                   padding = 'same',
                   activation = 'relu'))
  model4.add(MaxPooling1D())
  model4.add(Conv1D(filters = 64,
                   kernel_size = 5,
                   padding = 'same',
                   activation = 'relu'))
  model4.add(MaxPooling1D())
  model4.add(Flatten())
  model4.add(Dropout(rate = 0.2))
  model4.add(Dense(units = 64,
                  activation = 'relu'))
  model4.add(Dense(units = 1,
                  activation = 'sigmoid'))
  
  #model.summary()
  model4.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  
  history_model4 = model4.fit(x = X_train,
                            y = y_train,
                            validation_data = (X_validation, y_validation),
                            batch_size = 100,
                            epochs = 20,
                            callbacks = [es],
                            verbose = 0)
  
  predicted = (model4.predict(X_validation) > 0.5).astype("int32")
  score4 = f1_score(predicted, y_validation, average='weighted')
  return model4, score4

## Modelo 5

In [10]:
def modelo5(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess):
  model5 = Sequential()
  
  if embedding == 'glove':

    if preprocessing:
      model5.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove_preprocessing),
                      trainable=False,))
    else:
      model5.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove),
                      trainable=False))
  else: 
    model5.add(Embedding(input_dim = vocab_size,
                        output_dim = embedding_dim,
                        input_length = length_size))
    
  model5.add(Conv1D(filters = 32,
                   kernel_size = 3,
                   padding = 'same',
                   activation = 'relu'))
  model5.add(MaxPooling1D())
  model5.add(Flatten())
  model5.add(Dropout(rate = 0.2))
  model5.add(Dense(units = 64,
                  activation = 'relu'))
  model5.add(Dense(units = 1,
                  activation = 'sigmoid'))
  
  #model.summary()
  model5.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  
  history_model5 = model5.fit(x = X_train,
                            y = y_train,
                            validation_data = (X_validation, y_validation),
                            batch_size = 100,
                            epochs = 20,
                            callbacks = [es],
                            verbose = 0)
  
  predicted = (model5.predict(X_validation) > 0.5).astype("int32")
  score5 = f1_score(predicted, y_validation, average='weighted')
  return model5, score5

## Modelo 6

In [11]:
def modelo6(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess):
  model6 = Sequential()
  
  if embedding == 'glove':

    if preprocessing:
      model6.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove_preprocessing),
                      trainable=False))
    else:
      model6.add(Embedding(input_dim = vocab_size,
                      output_dim = embedding_dim,
                      input_length = length_size,
                      embeddings_initializer = keras.initializers.Constant(glove),
                      trainable=False))
  else: 
    model6.add(Embedding(input_dim = vocab_size,
                        output_dim = embedding_dim,
                        input_length = length_size))
    
  model6.add(Conv1D(filters = 32,
                   kernel_size = 3,
                   padding = 'same',
                   activation = 'relu'))
  model6.add(MaxPooling1D())
  model6.add(Conv1D(filters = 64,
                   kernel_size = 5,
                   padding = 'same',
                   activation = 'relu'))
  model6.add(MaxPooling1D())
  model6.add(Conv1D(filters = 128,
                   kernel_size = 7,
                   padding = 'same',
                   activation = 'relu'))
  model6.add(MaxPooling1D())
  model6.add(Flatten())
  model6.add(Dropout(rate = 0.2))
  model6.add(Dense(units = 64,
                  activation = 'relu'))
  model6.add(Dense(units = 1,
                  activation = 'sigmoid'))
  
  #model.summary()
  model6.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])
  
  history_model6 = model6.fit(x = X_train,
                            y = y_train,
                            validation_data = (X_validation, y_validation),
                            batch_size = 100,
                            epochs = 20,
                            callbacks = [es],
                            verbose = 0)
  
  predicted = (model6.predict(X_validation) > 0.5).astype("int32")
  score6 = f1_score(predicted, y_validation, average='weighted')
  return model6, score6

## Predict

In [12]:
def predict(model, nome, use_preprocessing, tokenizer, embedding):
  test = pd.read_csv('Data/test.csv')

  if use_preprocessing:
    test = preprocessing(test)
    test['tokenized'] = tokenizer.texts_to_sequences(test['cleaned_tweet'])
  else:
    test['tokenized'] = tokenizer.texts_to_sequences(test['tweet'])

  X_test = pad_sequences(sequences = test['tokenized'],
                         maxlen = length_size,
                         padding = 'post')

  predicted = (model.predict(X_test) > 0.5).astype("int32")
  prediction = pd.DataFrame()
  prediction['id'] = test['id']
  prediction['label'] = predicted

  if use_preprocessing:

    if embedding == 'glove':
      prediction.to_csv('Submission/' + nome + '_preprocessamento_glove.csv', index=False)
    else:
      prediction.to_csv('Submission/' + nome + '_preprocessamento_normal.csv', index=False)
  else:

    if embedding == 'glove':
      prediction.to_csv('Submission/' + nome + '_glove.csv', index=False)
    else:
      prediction.to_csv('Submission/' + nome + '_normal.csv', index=False)

# Main

In [13]:
def preprocessing_step(use_preprocessing, test_size, data):
  tokenizer = Tokenizer()

  if use_preprocessing:
    tokenizer.fit_on_texts(data['cleaned_tweet'])
    data['tokenized'] = tokenizer.texts_to_sequences(data['cleaned_tweet'])
  
  else:
    tokenizer.fit_on_texts(data['tweet'])
    data['tokenized'] = tokenizer.texts_to_sequences(data['tweet'])

  vocab_size = len(tokenizer.word_index) + 1

  X = pad_sequences(sequences = data['tokenized'],
                  maxlen = length_size,
                  padding = 'post')

  y = data['label']

  X_train, X_validation, y_train, y_validation = train_test_split(X,
                                                                  y, 
                                                                  test_size = test_size,
                                                                  random_state = 23)
  
  return X_train, X_validation, y_train, y_validation, vocab_size, tokenizer

In [14]:
test_size = 0.15
length_size = 15
embedding_dim = 300

In [15]:
data = pd.read_csv('Data/train.csv')
data = data.drop(columns=['id'])
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [16]:
data = preprocessing(data)
data = data.dropna()
data.head(15)

Unnamed: 0,label,tweet,cleaned_tweet
0,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,0,@user @user thanks for #lyft credit i can't us...,thanks lyft credit use cause offer wheelchair ...
2,0,bihday your majesty,birthday majesty
3,0,#model i love u take with u all the time in ...,model love take time urd ddddd
4,0,factsguide: society now #motivation,factsguide society motivation
5,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talking leave chaos pay disp...
6,0,@user camping tomorrow @user @user @user @use...,camping tomorrow dannya
7,0,the next school year is the year for exams.ð...,next school year year exams think school exams...
8,0,we won!!! love the land!!! #allin #cavs #champ...,love land allin cavs champions cleveland cleve...
9,0,@user @user welcome here ! i'm it's so #gr...,welcome gr


In [17]:
data.shape

(31962, 3)

In [18]:
modelos = ['modelo1']
embeddings = ['normal', 'glove']
using_preprocessing = [False]
use_preprocessing = []
modelos_usados = []
embeddings_usados = []
tempos = []
resultados = pd.DataFrame()
scores = []

print("Treinamento iniciado!\n")

for modelo in modelos:

  for preprocess in using_preprocessing:

    for embedding in embeddings:

      X_train, X_validation, y_train, y_validation, vocab_size, tokenizer = preprocessing_step(preprocess, test_size, data)

      print("------ \nTreinando o modelo: {}\nUsando pré-processamento: {}\nUsando o embedding: {}\n------".format(modelo, preprocess, embedding))

      ini = time.time()

      if modelo == 'modelo1':
        m, score = modelo1(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess)

      elif modelo == 'modelo2':
        m, score = modelo2(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess)
        
      elif modelo == 'modelo3':
        m, score = modelo3(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess)
        
      elif modelo == 'modelo4':
        m, score = modelo4(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess)
        
      elif modelo == 'modelo5':
        m, score = modelo5(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess)
        
      elif modelo == 'modelo6':
        m, score = modelo6(X_train, X_validation, y_train, y_validation, length_size, embedding_dim, vocab_size, embedding, preprocess)
      
      fim = time.time()
      tempo = round(fim - ini, 3)
      tempos.append(tempo)
      print("\nTreinamento finalizado em {} segundos!".format(tempo))
      print()

      embeddings_usados.append(embedding)
      scores.append(score)
      modelos_usados.append(modelo)
      use_preprocessing.append(preprocess)
      predict(m, modelo, preprocess, tokenizer, embedding)

Treinamento iniciado!

------ 
Treinando o modelo: modelo1
Usando pré-processamento: False
Usando o embedding: normal
------

Treinamento finalizado em 217.21 segundos!

------ 
Treinando o modelo: modelo1
Usando pré-processamento: False
Usando o embedding: glove
------

Treinamento finalizado em 17.68 segundos!



In [19]:
resultados['modelo'] = modelos_usados
resultados['tempo (s)'] = tempos
resultados['f1_score'] = scores
resultados['preprocessamento'] = use_preprocessing
resultados['embedding'] = embeddings_usados
resultados.head(6)

Unnamed: 0,modelo,tempo (s),f1_score,preprocessamento,embedding
0,modelo1,217.21,0.964925,False,normal
1,modelo1,17.68,0.960894,False,glove


In [20]:
resultados.to_csv('teste_embeddings.csv', index=False)