<a href="https://colab.research.google.com/github/rafael-carvalho/brazilian-literature/blob/master/Rafa_Literatura_Brasileira_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install required libraries and download the kaggle dataset

In [1]:

! pip install kaggle
! mkdir ~/.kaggle/
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d rtatman/brazilian-portuguese-literature-corpus
! unzip brazilian-portuguese-literature-corpus.zip
! pip install nltk
! pip install gensim


Downloading brazilian-portuguese-literature-corpus.zip to /content
 52% 9.00M/17.5M [00:00<00:00, 18.1MB/s]
100% 17.5M/17.5M [00:00<00:00, 32.5MB/s]
Archive:  brazilian-portuguese-literature-corpus.zip
  inflating: Brazilian_Portugese_Corpus/A Alma do Lazaro.txt  
  inflating: Brazilian_Portugese_Corpus/A Condessa Vesper.txt  
  inflating: Brazilian_Portugese_Corpus/A Danca dos Ossos.txt  
  inflating: Brazilian_Portugese_Corpus/A Escrava Isaura.txt  
  inflating: Brazilian_Portugese_Corpus/A Mao e a Luva.txt  
  inflating: Brazilian_Portugese_Corpus/A Moreninha.txt  
  inflating: Brazilian_Portugese_Corpus/A Mortalha de Alzira.txt  
  inflating: Brazilian_Portugese_Corpus/A Normalista.txt  
  inflating: Brazilian_Portugese_Corpus/A Pata da Gazela.txt  
  inflating: Brazilian_Portugese_Corpus/A Viuvinha.txt  
  inflating: Brazilian_Portugese_Corpus/Adolfo Caminha/A Normalista.txt  
  inflating: Brazilian_Portugese_Corpus/Adolfo Caminha/Bom Crioulo.txt  
  inflating: Brazilian_Portugese

### Imports the libraries and downloads some text processing tools

In [2]:
import pandas as pd
import nltk
import gensim

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words("portuguese")
stop_words.append('www.nead.unama.br')


csv_path = '/content/guideToDocuments.csv'
df = pd.read_csv(csv_path)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Pre Processing the files. Read the books in the appropriate encoding and processes the text to remove stopwords and break books into multiple X items

In [3]:
def load_book(path, path_prepend):
  output = []
  path = path.replace(",", "").replace("'", "")
  with open(f'{path_prepend}/{path}', "r", encoding='ISO8859_1', errors='ignore') as f:
    output = f.readlines()
  return output

def read_book(book):
  sentences = list()
  for line in book:
      clean_text = preprocess_text(line)
      if clean_text:
        sentences.append(clean_text)
  
  return sentences

def group_lines_into_sequences(book_text, lines_per_sequence=10):
  X = list()
  sequence = list()
  for i in range(len(book_text)):
    sequence += book_text[i]
    if i % lines_per_sequence == 0:
      sequence = list()
      X.append(sequence)
  
  output = list() 
  for sequence in X:
    s = " "
    output.append(s.join(sequence).strip())

  return output


def preprocess_text(input):
  new_line = input.replace('www.nead.unama.br', " ")
  clean_text = gensim.utils.simple_preprocess(new_line)
  clean_text = [i for i in clean_text if i not in stop_words]
  return clean_text


In [5]:
data = list()
for author, work in zip(df['Author'], df['Work']):
  book = load_book(work, '/content/Brazilian_Portugese_Corpus')
  lines = read_book(book)
  sequences = group_lines_into_sequences(lines)
  
  for s in sequences:
    data.append({
        'author': author,
        'work': work.replace('.txt', ""),
        'sequence': s
    })    


dataframe = pd.DataFrame(data)
dataframe.head()
  

Unnamed: 0,author,work,sequence
0,Adolfo Caminha,A Normalista,normalista nead núcleo educação distância av a...
1,Adolfo Caminha,A Normalista,joão maciel mata gadelha conhecido fortaleza j...
2,Adolfo Caminha,A Normalista,havia silêncio morno concentrado destacava rol...
3,Adolfo Caminha,A Normalista,risadinhas explodiam espaços gostosas indiscre...
4,Adolfo Caminha,A Normalista,podem conferir disse erguendo risonho segunda ...


### One hot encoding for the author names. 



In [6]:

unique_values = dataframe['author'].unique()

mapping = dict()
for i, x in enumerate(unique_values):
  mapping[x] = i

print(mapping)
dataframe['author_label'] = dataframe['author'].map(mapping)
Y = dataframe['author_label'].values

{'Adolfo Caminha': 0, 'Aluisio Azevedo': 1, 'Bernardo Guimaraes': 2, 'Joaquim Manuel de Macedo': 3, 'Jose de Alencar': 4, 'Machado de Assis': 5, 'Manuel Antonio de Almeida': 6}


### Splits the data into train and test datasets

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test, Ytrain, Ytest = train_test_split(dataframe['sequence'], Y, test_size=0.33)

### Imports all the Machine Learning libraries

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Model
import numpy as np

### Text preprocessing to get the data ready for the training. We use only the test data for the tokenization process

In [9]:
# Convert sentences to sequences
MAX_VOCAB_SIZE = 20000
embedding_dim = 16
max_length = 200
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df_train)
sequences_train = tokenizer.texts_to_sequences(df_train)
sequences_test = tokenizer.texts_to_sequences(df_test)

# get word -> integer mapping
word2idx = tokenizer.word_index
V = len(word2idx)
print('Found %s unique tokens.' % V)

Found 73566 unique tokens.


### Pad the sentences to the fixed dimension

In [10]:
# pad sequences so that we get a N x T matrix
data_train = pad_sequences(sequences_train, padding=padding_type, truncating=trunc_type)
print('Shape of data train tensor:', data_train.shape)

# get sequence length
T = data_train.shape[1]
data_test = pad_sequences(sequences_test, maxlen=T)
print('Shape of data test tensor:', data_test.shape)

Shape of data train tensor: (22507, 116)
Shape of data test tensor: (11087, 116)


### Creates the actual model

In [11]:
# Create the model

# We get to choose embedding dimensionality
D = 20

# Hidden state dimensionality
M = 20

# Note: we actually want to the size of the embedding to (V + 1) x D,
# because the first index starts from 1 and not 0.
# Thus, if the final index of the embedding matrix is V,
# then it actually must have size V + 1.

i = Input(shape=(T,))
x = Embedding(V + 1, D)(i)
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(rate=0.3)(x)
x = Dense(len(unique_values), activation='softmax')(x)

model = Model(i, x)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 116)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 116, 20)           1471340   
_________________________________________________________________
lstm (LSTM)                  (None, 116, 20)           3280      
_________________________________________________________________
global_max_pooling1d (Global (None, 20)                0         
_________________________________________________________________
dropout (Dropout)            (None, 20)                0         
_________________________________________________________________
dense (Dense)                (None, 7)                 147       
Total params: 1,474,767
Trainable params: 1,474,767
Non-trainable params: 0
___________________________________________________

### Compiles the model and trains it

In [14]:
# Compile and fit
model.compile(
  loss='sparse_categorical_crossentropy',
  optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
  metrics=['accuracy']
)


print('Training model...')
r = model.fit(
  data_train,
  Ytrain,
  epochs=25,
  validation_data=(data_test, Ytest),
  callbacks=[]
)



Training model...
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


### Picks a random sample on the test set and verifies if the predicted value matches to the correct label

In [None]:

sample = df_test.sample()
val = sample.values[0]
ind = sample.index[0]

print(f'{unique_values[Y[ind]]}')
print(val)
pred = predict_raw_lines([val])

print(unique_values[np.argmax(pred)])

### Required preparation for text that can be inputed by the user

In [None]:

def prep(input):
  new_line = input.replace('www.nead.unama.br', " ")
  
  clean_text = gensim.utils.simple_preprocess(new_line)
  
  clean_text = [i for i in clean_text if i not in stop_words]
  
  return clean_text

def predict_raw_lines(lines):
  output = []
  for l in lines:
    new_line = prep(l)
    s = " "
    new_line = s.join(new_line).strip()
    sequences = tokenizer.texts_to_sequences([new_line])
    
    padded = pad_sequences(sequences, maxlen=T, padding=padding_type, truncating=trunc_type)
    output.append(padded)
  
  return model.predict(output)[0]


print(unique_values)

input = '''
       Meu caro colega. ­ Acho-me seriamente embaraçado da maneira por que descreverei a
visita que,  a qual já os nossos
tiveram uma ligeira notícia neste mesmo jornal.'''

pred = predict_raw_lines([input])
print(np.asarray(pred))
print(unique_values[np.argmax(pred)])




In [None]:
input_text = 'Olhe, continuou, acariciando-o sempre; n\xE3o se meta com donzelas, entende?... S\xE3o o diabo! Por d\xE1 c\xE1 aquela palha fica um homem em apuros! agora quanto \xE0s outras, papo com elas! N\xE3o mande nenhuma ao vig\xE1rio, nem lhe doa a cabe\xE7a, porque, no fim de contas, nas circunst\xE2ncias de Dona Estela, \xE9 at\xE9 um grande servi\xE7o que voc\xEA lhe faz! Meu rico amiguinho, quando uma mulher j\xE1 passou dos trinta e pilha a jeito um rapazito da sua idade, \xE9 como se descobrisse ouro em p\xF3! sabe-lhe a gaitas! Fique ent\xE3o sabendo de que n\xE3o \xE9 s\xF3 a ela que voc\xEA faz o obs\xE9quio, mas tamb\xE9m ao marido: qua' #@param {type:"string"}
pred = predict_raw_lines([input_text])
print(np.asarray(pred))
print(unique_values[np.argmax(pred)])

In [None]:
model.save('literatura_brasileira.h5')