In [1]:
%%capture
!pip install pandas
!pip install numpy
!pip install tensorflow
!pip install keras
!pip install sklearn
!pip install matplotlib
!pip install seaborn
!pip install unidecode
!pip install -U imbalanced-learn
!pip3 install pickle5

In [2]:
import tensorflow as tf
import pandas as pd
import warnings
import unidecode
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle5 as pickle
import random
import os
import re
import time
import nltk
nltk.download('stopwords')

from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Reshape, Dense, Dropout, Input, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, confusion_matrix

# Global configuration: stop-word set, warning filter and RNG seeds.
SEED = 23  # single seed value shared by every generator seeded below

# English stop words consumed by remove_stopwords().
sw = set(stopwords.words('english'))

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes -- confirm the intent.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Preprocessing

In [3]:
def remove_username(text):
  """Replace each ``@`` followed by a run of non-whitespace characters with one space."""
  mention = re.compile(r'\@[^\s]+')
  return mention.sub(' ', text)

def remove_newline(text):
  """Return ``text`` with every line-feed character turned into a space."""
  return ' '.join(text.split('\n'))

def only_letters(text):
  """Replace each run of characters outside the allowed set with one space.

  The allowed set is lowercase ``a-z``, the accented vowels listed in the
  pattern, and whitespace; digits, punctuation and uppercase letters are
  therefore removed.
  """
  disallowed_run = re.compile(r'[^a-záâàãéêèẽíìîĩóòõôúùũû\s]+')
  return disallowed_run.sub(' ', text)

def remove_link(text):
  """Replace URLs in ``text`` with a single space.

  A URL is a run of non-whitespace characters that starts with ``http://``,
  ``https://`` or ``www`` (optionally followed by a dot). The previous pattern
  only recognised the ``www`` form, so ``http(s)://`` links were left in place
  and later broken into junk tokens by the letter-only filter.

  Args:
    text: input string.

  Returns:
    The string with every matched URL replaced by ``' '``.
  """
  return re.sub(r'(?:https?://|www\.?)[^\s]+', ' ', text, flags=re.IGNORECASE)

def remove_hyperlink(text):
  """Replace HTML-style tags in ``text`` with a single space.

  The previous pattern (``<.?>``) matched only tags with zero or one character
  between the angle brackets, so ordinary tags such as ``<a href="...">`` or
  ``</a>`` were never removed. The pattern now also matches any tag that
  starts with a letter (optionally preceded by ``/``) and runs to the next
  ``>``. Everything the old pattern matched is still matched. Requiring a
  leading letter keeps text such as ``<3`` untouched.

  Args:
    text: input string.

  Returns:
    The string with every matched tag replaced by ``' '``.
  """
  return re.sub(r'<(?:/?[a-zA-Z][^<>]*|.?)>', ' ', text)

def remove_accent(text):
  """Return ``text`` after passing it through ``unidecode.unidecode``."""
  return unidecode.unidecode(text)

def adjustment_text(text):
  """Collapse every whitespace run to one space and trim both ends."""
  return re.sub(r'\s+', ' ', text).strip()

def remove_stopwords(text):
  """Drop every whitespace-separated token found in the global ``sw`` set.

  The surviving tokens are re-joined with single spaces.
  """
  return ' '.join(filter(lambda token: token not in sw, text.split()))

def remove_spam(text):
  """Replace escaped HTML entities and a fixed list of spam hashtags with spaces.

  Fixes relative to the previous version:
    * ``&amp`` / ``&lt`` / ``&gt`` now also consume the optional trailing ``;``
      instead of leaving it behind.
    * ``#followme`` is tried before ``#follow``; previously the shorter
      alternative always won and left a stray ``me``.
    * Hashtags must end at a word boundary, so a longer tag such as
      ``#followers`` is no longer cut down to ``ers``.

  Args:
    text: input string.

  Returns:
    The string with each matched entity or hashtag replaced by ``' '``.
  """
  text = re.sub(r'&(?:amp|lt|gt);?', ' ', text)
  text = re.sub(r'#(?:followme|follow|like|f4f|photooftheday)\b', ' ', text)
  return text

# Slang token -> expansion. Every key is matched as a whole whitespace-delimited token.
_SLANG_EXPANSIONS = {
  'b4': 'before',
  '2b': 'to be',
  '2morrow': 'tomorrow',
  'rn': 'right now',
  'brb': 'be right back',
  'mb': 'my bad',
  'luv': 'love',
  'b': 'be',
  'r': 'are',
  'u': 'you',
  'y': 'why',
  'ur': 'your',
  'hbd': 'happy birthday',
  'bday': 'birthday',
  'bihday': 'birthday',
  'omg': 'oh my god',
  'lol': 'laughing out loud',
}

# The lookarounds require whitespace or a string edge on both sides without consuming it.
_SLANG_PATTERN = re.compile(
  r'(?<!\S)(?:' + '|'.join(map(re.escape, _SLANG_EXPANSIONS)) + r')(?!\S)'
)


def remove_slangs(text):
  """Expand common chat slang (``u`` -> ``you``, ``b4`` -> ``before``, ...).

  The previous version required a literal space on both sides of each token
  and consumed those spaces. Slang at the very start or end of the text was
  therefore never expanded, and of two adjacent identical tokens (``u u``)
  only the first was. Tokens are now recognised whenever they are delimited
  by whitespace or a string edge. Tokens touching other characters (for
  example ``y'all``) are left alone, as before.

  Args:
    text: input string; matching is case-sensitive (lowercase keys).

  Returns:
    The string with every slang token replaced by its expansion.
  """
  return _SLANG_PATTERN.sub(lambda match: _SLANG_EXPANSIONS[match.group(0)], text)

# Contraction -> expanded form (lowercase only, straight apostrophe).
_CONTRACTIONS = {
  "can't": "can not",
  "i'm": "i am",
  "i'll": "i will",
  "i'd": "i would",
  "i've": "i have",
  "ain't": "am not",
  "haven't": "have not",
  "hasn't": "has not",
  "won't": "will not",
  "you're": "you are",
  "we're": "we are",
  "they're": "they are",
  "he's": "he is",
  "she's": "she is",
  "it's": "it is",
  "don't": "do not",
  "doesn't": "does not",
  "wouldn't": "would not",
  "couldn't": "could not",
  "shouldn't": "should not",
}

# Word boundaries keep "he's" from matching inside "she's" while still allowing
# punctuation or a string edge next to the contraction.
_CONTRACTION_PATTERN = re.compile(
  r"\b(?:" + "|".join(map(re.escape, _CONTRACTIONS)) + r")\b"
)


def remove_abbreviations(text):
  """Expand English contractions (``can't`` -> ``can not``, ``i'm`` -> ``i am``, ...).

  The previous version only matched a contraction with a literal space on
  both sides, so one at the start or end of the text, or next to punctuation
  (``don't,``), was left unexpanded. It also listed ``can't`` twice. Matching
  now uses word boundaries and a single lookup table.

  Args:
    text: input string; matching is case-sensitive (lowercase keys).

  Returns:
    The string with every listed contraction replaced by its expansion.
  """
  return _CONTRACTION_PATTERN.sub(lambda match: _CONTRACTIONS[match.group(0)], text)

def remove_one_len_word(text):
  """Replace every stand-alone lowercase ASCII letter with one space."""
  lone_letter = re.compile(r'\b[a-z]\b')
  return lone_letter.sub(' ', text)

def preprocessing(data):
  """Add a ``cleaned_tweet`` column holding the normalised text of ``data['tweet']``.

  Each tweet is cast to ``str`` and passed through the cleaning steps listed
  below, in order. ``data`` is modified in place (the new column is attached
  to it) and also returned.

  ``remove_slangs`` runs twice. The first pass happens before ``only_letters``
  strips digits, because otherwise the digit-bearing rules (``b4``, ``2b``,
  ``2morrow``) can never match. The second pass keeps the previous behaviour
  of expanding slang that only becomes space-delimited once punctuation has
  been removed.

  Args:
    data: DataFrame with a ``tweet`` column.

  Returns:
    The same DataFrame with ``cleaned_tweet`` added or overwritten.
  """
  steps = (
    str.lower,
    remove_newline,
    remove_hyperlink,
    remove_spam,
    remove_link,
    remove_username,
    remove_abbreviations,
    remove_slangs,        # early pass: digits are still present here
    only_letters,
    remove_accent,
    remove_slangs,        # late pass: punctuation has been stripped by now
    remove_stopwords,
    remove_one_len_word,
    adjustment_text,
  )

  cleaned = data['tweet'].apply(str)
  for step in steps:
    cleaned = cleaned.apply(step)

  data['cleaned_tweet'] = cleaned
  return data

# Colab-only: file-upload helper from google.colab; the returned value is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

# Shell command: unpack the tokenizer archive into the working directory.
!unzip tokenizer_RNN_seed23

# Load a previously saved Keras model from the working directory and print its layer summary.
model = keras.models.load_model('model_0.3_dropout.h5')
model.summary()

# Confusion matrix of the loaded model on the validation split.
# NOTE(review): X_validation / y_validation are first assigned further down in the
# notebook (train_test_split in the Main section), so these lines fail on a fresh
# top-to-bottom run -- confirm the intended cell order.
plt.figure(figsize = (10, 7))
# Threshold the model output at 0.5 to obtain 0/1 labels.
predicted = (model.predict(X_validation) > 0.5).astype("int32")
matrix = confusion_matrix(y_validation, predicted, labels=[0, 1])
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, cmap="Blues", fmt='d', annot_kws={"size": 16})
plt.xlabel('Classe prevista')
plt.ylabel('Classe real')
plt.savefig('matriz_confusao.jpg')

In [4]:
# Raw training set: read the CSV and discard the `id` column right away.
normal_data = pd.read_csv('Data/train.csv').drop(columns=['id'])
normal_data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [5]:
normal_data.shape

(31962, 2)

In [6]:
# Cleaned corpus, built from a copy so `normal_data` keeps the raw tweets.
# Chain: clean -> turn 'None' cells into NA -> drop NA rows -> drop duplicate rows
# (while both raw and cleaned columns are still present) -> keep only the cleaned text
# under the column name `tweet`.
preprocessed_data = (
    preprocessing(normal_data.copy())
    .replace('None', pd.NA)
    .dropna()
    .drop_duplicates()
    .drop(columns=['tweet'])
    .rename(columns={'cleaned_tweet': 'tweet'})
)
preprocessed_data.head()

Unnamed: 0,label,tweet
0,0,father dysfunctional selfish drags kids dysfun...
1,0,thanks lyft credit use cause offer wheelchair ...
2,0,birthday majesty
3,0,model love take time
4,0,factsguide society motivation


In [7]:
preprocessed_data.shape

(29530, 2)

In [8]:
# Random over-sampling of the cleaned corpus with sampling_strategy='minority';
# random_state=23 fixes which rows get repeated.
ros = RandomOverSampler(random_state=23, sampling_strategy='minority')
# The sampler receives a one-column frame ([['tweet']]) as X and the labels as y.
X_resampled, y_resampled = ros.fit_resample(preprocessed_data[['tweet']], preprocessed_data['label'])
# Re-assemble text and label side by side into a single frame.
data_preprocessing_augmentation = pd.concat([X_resampled, y_resampled], axis=1)
data_preprocessing_augmentation.head()

Unnamed: 0,tweet,label
0,father dysfunctional selfish drags kids dysfun...,0
1,thanks lyft credit use cause offer wheelchair ...,0
2,birthday majesty,0
3,model love take time,0
4,factsguide society motivation,0


In [9]:
# Same over-sampling as for the cleaned corpus, applied to the raw tweets in `normal_data`.
# NOTE(review): `data_augmentation` is split into train/validation later (Main section),
# i.e. after duplication, so copies of one tweet can presumably end up on both sides of
# the split -- confirm this is acceptable for the reported validation scores.
ros = RandomOverSampler(random_state=23, sampling_strategy='minority')
X_resampled, y_resampled = ros.fit_resample(normal_data[['tweet']], normal_data['label'])
data_augmentation = pd.concat([X_resampled, y_resampled], axis=1)
data_augmentation.head()

Unnamed: 0,tweet,label
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [10]:
data_augmentation.shape

(59440, 2)

# Load a pickled tokenizer into `tokenizer_lstm` (compared against a freshly fitted one below).
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
# The handle is named `output_file` although the file is opened for reading.
with open(r"content/Tokenizer/tokenizer.pickle", "rb") as output_file:
    tokenizer_lstm = pickle.load(output_file)

In [11]:
# Load the pickled object bound to `glove`; lstm_normal() passes it as the initial
# weights of its Embedding layer.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the file must
# be placed in the working directory before running. Unpickling executes arbitrary code --
# only load trusted files.
with open(r"glove_total.pickle", "rb") as output_file:
    glove = pickle.load(output_file)

FileNotFoundError: ignored

In [None]:
# Stack all four variants of the corpus (cleaned + oversampled, cleaned, raw,
# raw + oversampled) row-wise so the tokenizer below sees every `tweet` text.
all_messages = pd.concat([data_preprocessing_augmentation,
                          preprocessed_data,
                          normal_data,
                          data_augmentation], axis=0)

# Concatenation keeps each frame's own index; replace it with a fresh 0..n-1 range.
all_messages = all_messages.reset_index(drop=True)

# Fit a Keras Tokenizer with default settings on the pooled texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_messages['tweet'].values)

teste = pd.DataFrame({'tweet': ["the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams   #hate #imagine #actorslife #revolutionschool #girl",
                                "we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â¦ ",
                                "it was a hard monday due to cloudy weather. disabling oxygen production for today. #goodnight #badmonday "]})

# Encode the sample tweets with the freshly fitted tokenizer and with the pickled one.
teste['tokenized'] = tokenizer.texts_to_sequences(teste['tweet'].values)
teste['tokenized_lstm'] = tokenizer_lstm.texts_to_sequences(teste['tweet'].values)
teste.head()

# True only if both tokenizers produced identical id sequences for every sample tweet.
teste['tokenized_lstm'].equals(teste['tokenized'])

## Tokenizer

# Rebind `tokenizer` to a pickled tokenizer; from here on this object replaces any
# tokenizer fitted earlier in the session.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
with open(r"tokenizer_rand_augmentantion.pickle", "rb") as output_file:
    tokenizer = pickle.load(output_file)

## Parameters

In [None]:
# Global hyper-parameters read by the model-building and prediction functions below.
# Not referenced anywhere in the visible notebook code.
non_linearity_function = 'relu'
# Units of the LSTM layer in lstm_normal(); overwritten by the units sweep loop.
lstm_units = 50
# Dropout rate applied right after the embedding layer.
first_dropout_rate = 0.25
# Dropout rate applied after the LSTM layer.
last_dropout_rate = 0.5
# Training epochs and mini-batch size passed to model.fit().
epochs = 10
batch_size = 100
# Output dimension of the Embedding layer.
embedding_dim = 200
# Sequence length every tokenized tweet is padded/truncated to.
length_size = 30

## Predict

In [None]:
def predict(model, parameter, value):
  """Score ``Data/test.csv`` with ``model`` and write a submission CSV.

  The raw ``tweet`` column is tokenized with the global ``tokenizer`` (no
  ``preprocessing()`` call) and pre-padded to the global ``length_size``.
  Model outputs above 0.5 become label 1, the rest 0.

  Changes relative to the previous version:
    * ``Submission/`` is created if missing; ``to_csv`` does not create
      parent directories, so a fresh runtime failed here.
    * Predictions are flattened before being stored, so a ``(n, 1)`` model
      output becomes a plain 1-D label column.

  Args:
    model: object exposing ``predict(X)``.
    parameter: name of the tuned hyper-parameter; used in the file name.
    value: value of that hyper-parameter; used in the file name.

  Returns:
    None. Writes ``Submission/<parameter>_<value>.csv`` with columns
    ``id`` and ``label``.
  """
  test = pd.read_csv('Data/test.csv')
  sequences = tokenizer.texts_to_sequences(test['tweet'].values)

  X_test = pad_sequences(sequences = sequences,
                         maxlen = length_size,
                         padding = 'pre')

  predicted = (model.predict(X_test) > 0.5).astype("int32")
  prediction = pd.DataFrame({'id': test['id'], 'label': predicted.ravel()})

  os.makedirs('Submission', exist_ok=True)
  prediction.to_csv('Submission/{}_{}.csv'.format(parameter, value), index=False)

In [None]:
def generate_cm(model, X_validation, y_validation, parametro, valor):
  """Plot and save the validation confusion matrix of ``model``.

  Predictions are thresholded at 0.5. The heat-map is saved as
  ``Images/matriz_confusao_<parametro>_<valor>.jpg``. The ``Images``
  directory is now created when missing, because ``savefig`` does not create
  parent directories and a fresh runtime failed here.

  Args:
    model: object exposing ``predict(X)``.
    X_validation: validation inputs passed to ``model.predict``.
    y_validation: true validation labels.
    parametro: name of the tuned hyper-parameter (a string; used in the file name).
    valor: value of that hyper-parameter (converted with ``str``).

  Returns:
    None.
  """
  plt.figure(figsize = (10, 7))
  predicted_validation = (model.predict(X_validation) > 0.5).astype("int32")
  matrix = confusion_matrix(y_validation, predicted_validation, labels=[0, 1])
  # sns.set changes the global plotting style, not just this figure.
  sns.set(font_scale=1.4)
  sns.heatmap(matrix, annot=True, cmap="Blues", fmt='d', annot_kws={"size": 16})
  plt.xlabel('Classe prevista')
  plt.ylabel('Classe real')

  os.makedirs('Images', exist_ok=True)
  plt.savefig('Images/matriz_confusao_' + parametro + '_' + str(valor) + '.jpg')

## Model

In [None]:
def lstm_normal(vocab_size, tokenizer, X_train, X_validation, y_train, y_validation):
  """Build, train and evaluate the Embedding -> Dropout -> LSTM -> Dropout -> Dense model.

  Layer sizes and training settings come from module-level globals
  (``embedding_dim``, ``length_size``, ``glove``, ``first_dropout_rate``,
  ``lstm_units``, ``last_dropout_rate``, ``batch_size``, ``epochs``).
  After training, the function saves a confusion matrix, the model itself
  and a test-set submission, all tagged with ``'units'`` / ``lstm_units``.

  Args:
    vocab_size: ``input_dim`` of the embedding layer.
    tokenizer: accepted for interface compatibility; not used in the body.
    X_train, y_train: training inputs and labels.
    X_validation, y_validation: validation inputs and labels.

  Returns:
    Weighted F1 score on the validation set, rounded to 4 decimals.
  """
  network = Sequential([
    # Embedding initialised from the global `glove` object and fine-tuned during training.
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              input_length=length_size,
              weights=[glove],
              trainable=True,
              name='embedding'),
    Dropout(rate=first_dropout_rate),
    LSTM(units=lstm_units),
    Dropout(rate=last_dropout_rate),
    # Single sigmoid unit: binary classification.
    Dense(units=1, activation='sigmoid'),
  ])

  network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  network.fit(X_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_validation, y_validation))

  # Threshold the sigmoid output at 0.5 to obtain 0/1 labels.
  validation_labels = (network.predict(X_validation) > 0.5).astype("int32")
  score = round(f1_score(y_validation, validation_labels, average='weighted'), 4)

  generate_cm(network, X_validation, y_validation, 'units', lstm_units)
  save_model(network, lstm_units)
  predict(network, 'units', lstm_units)

  return score

In [None]:
from keras.models import model_from_json

def lstm(dropout):
  """Retrain a copy of the global ``model`` architecture with a different dropout rate.

  The rate of ``model.layers[3]`` is overwritten in place on the global
  ``model``. A new model is then rebuilt from the JSON description, compiled,
  trained on the global ``X_train`` / ``y_train`` and evaluated on
  ``X_validation`` / ``y_validation``. A confusion matrix, the model file and
  a submission are written, tagged with ``'dropout'`` / ``dropout``.

  NOTE(review): ``layers[3]`` is presumably the dropout after the LSTM, and
  ``model_from_json`` presumably restores the architecture only (weights
  re-initialised) -- confirm both.

  Args:
    dropout: dropout rate to assign to ``model.layers[3]``.

  Returns:
    Weighted F1 score on the validation set, rounded to 4 decimals.
  """
  model.layers[3].rate = dropout

  candidate = model_from_json(model.to_json())
  candidate.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  candidate.fit(X_train,
                y_train,
                batch_size=batch_size,
                epochs=epochs,
                validation_data=(X_validation, y_validation))

  # Threshold the sigmoid output at 0.5 to obtain 0/1 labels.
  validation_labels = (candidate.predict(X_validation) > 0.5).astype("int32")
  score = round(f1_score(y_validation, validation_labels, average='weighted'), 4)

  generate_cm(candidate, X_validation, y_validation, 'dropout', dropout)
  save_model(candidate, dropout)
  predict(candidate, 'dropout', dropout)

  return score

In [None]:
def save_model(modelo, epoch):
  """Save ``modelo`` as ``Model/model_<epoch>_dropout.h5``.

  ``epoch`` is only used as a tag in the file name (converted with ``str``).
  The ``_dropout`` suffix is fixed regardless of which hyper-parameter the
  tag stands for.
  """
  file_name = ''.join(['model_', str(epoch), '_dropout.h5'])
  modelo.save('Model/' + file_name)

## Main

In [None]:
# Disabled alternative: fit a fresh tokenizer on the oversampled raw tweets instead of
# using the `tokenizer` object already bound earlier in the notebook.
#tokenizer = Tokenizer()
#tokenizer.fit_on_texts(data_augmentation['tweet'].values)
# Encode the oversampled raw tweets as integer sequences (adds a column to the frame in place).
data_augmentation['tokenized'] = tokenizer.texts_to_sequences(data_augmentation['tweet'].values)
# +1 on top of the tokenizer's word_index size; used as the embedding input_dim.
vocab_size = len(tokenizer.word_index) + 1

# Left-pad (padding='pre') every sequence to `length_size` tokens.
X = pad_sequences(sequences = data_augmentation['tokenized'],
                  maxlen = length_size,
                  padding = 'pre')

y = data_augmentation['label']

# 85/15 train/validation split with a fixed seed.
# NOTE(review): the split happens after over-sampling, so duplicated tweets can presumably
# appear in both partitions -- confirm this is intended.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.15, random_state=23)

# Ad-hoc scoring of the test file with the global `model`.
# NOTE(review): the test tweets are cleaned with preprocessing() here, whereas X_train above
# is built from the raw `data_augmentation` tweets and predict() also tokenizes raw text --
# confirm which representation the loaded model expects.
test = pd.read_csv('Data/test.csv')
test = preprocessing(test)
test['tokenized'] = tokenizer.texts_to_sequences(test['cleaned_tweet'].values)

# Pad on the left, matching the training data and predict(); this cell previously used
# 'post', which feeds the model sequences laid out differently from those it was trained on.
X_test = pad_sequences(sequences = test['tokenized'],
                       maxlen = length_size,
                       padding = 'pre')

# Threshold the model output at 0.5 to obtain 0/1 labels.
predicted = (model.predict(X_test) > 0.5).astype("int32")
prediction = pd.DataFrame()
prediction['id'] = test['id']
# Flatten a (n, 1) prediction array into a plain 1-D label column.
prediction['label'] = predicted.ravel()
# to_csv does not create parent directories.
os.makedirs('Submission', exist_ok=True)
prediction.to_csv('Submission/aaaaaaaaaa.csv', index=False)

In [None]:
# Sweep over LSTM sizes: train one model per value and record wall-clock time and validation F1.
units = [25, 50, 75]
tempos = []   # elapsed seconds per run
scores = []   # validation score per run

for unit in units:
  print("Testando com {} units".format(unit))
  ini = time.time()
  # score_validation = lstm(dropout)
  # lstm_normal() reads the module-level `lstm_units`, so the loop value is passed via that global.
  lstm_units = unit
  score_validation = lstm_normal(vocab_size, tokenizer, X_train, X_validation, y_train, y_validation)
  fim = time.time()
  tempo = fim - ini

  tempos.append(tempo)
  scores.append(score_validation)
  print()

In [None]:
# One row per swept value: LSTM units, training time in seconds, validation score.
results = pd.DataFrame({
    'unidade': units,
    'tempo': tempos,
    'score_validação': scores 
})

# Persist the table in the working directory.
results.to_csv('Tempos_Units_RNN.csv', index=False)

In [None]:
results.head(20)

In [None]:
!zip -r /content/Model_Units.zip /content/Model

In [None]:
!zip -r /content/Submission_Units.zip /content/Submission

In [None]:
!zip -r /content/Images.zip /content/Images

In [None]:
# Block this cell forever (presumably to keep the hosted runtime from going idle -- confirm).
# Sleeping inside the loop avoids pinning a CPU core, which the previous bare `pass` did.
while True:
  time.sleep(60)

In [None]:
1+1