# Importações

In [172]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, LSTM, Bidirectional, GRU, Bidirectional, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
from tensorflow.compat.v1.keras.backend import set_session

# Let TensorFlow grow the memory it uses on the GPU dynamically.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
# Register the session created above with Keras. The previous code passed the
# undefined name `sess`, which raised NameError on a fresh kernel.
set_session(session)

# Seed the global NumPy and TensorFlow random generators. The previous
# `np.seed = 42` only created an attribute on the numpy module and seeded
# nothing. The compat.v1 seeding call is used because this cell already relies
# on the compat.v1 session API.
np.random.seed(42)
tf.compat.v1.set_random_seed(42)

In [2]:
#import tensorflow as tf
#from tensorflow.compat.v1.keras.backend import set_session
#config = tf.compat.v1.ConfigProto()
#config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
#config.log_device_placement = True  # to log device placement (on which device the operation ran)
#sess = tf.compat.v1.Session(config=config)
#set_session(sess)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device



In [3]:
#os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# Lendo a base de dados

In [55]:
# Load the dataset from 'CSTR.csv' (a relative path, resolved against the
# kernel's working directory).
df = pd.read_csv('CSTR.csv')

In [56]:
# Display the loaded DataFrame for a visual sanity check.
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


In [57]:
# (rows, columns) of the loaded DataFrame.
df.shape

(299, 3)

In [58]:
# Pull out the raw documents and their category labels, then collect the
# distinct label values in order of first appearance.
texts, classes = df['text'], df['class']
possible_labels = classes.unique()

In [59]:
# Distinct class labels found in the dataset.
possible_labels

array(['ArtificiallIntelligence', 'Robotics', 'Systems', 'Theory'],
      dtype=object)

In [60]:
# Fit a LabelEncoder on the class column so the string labels can later be
# converted to integer ids.
encoder = LabelEncoder()
encoder.fit(classes)

LabelEncoder()

In [61]:
# Integer-encoded target vector, one entry per document.
y = encoder.transform(classes)

# Definições

In [189]:
# Length every token sequence is padded to (passed as `maxlen` to
# pad_sequences and as the models' input length). The spelling "LENGHT" is
# kept because other cells reference the constant by this exact name.
MAX_SEQUENCE_LENGHT = 500
# Upper bound on the vocabulary size (passed as `num_words` to the Tokenizer).
MAX_VOCAB_SIZE = 20000
# Output dimension of the Embedding layers.
EMBEDDING_DIM = 100
# Mini-batch size passed to model.fit.
BATCH_SIZE = 10
# Number of training epochs passed to model.fit.
EPOCHS = 100

# Pré-processando os textos

## Tokenização

In [63]:
# Build the vocabulary from the corpus, then convert every document into a
# list of integer token ids. `num_words` caps the vocabulary at MAX_VOCAB_SIZE.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [64]:
# Number of tokenised documents.
len(sequences)

299

In [65]:
# Vocabulary size handed to the Embedding layers, capped at MAX_VOCAB_SIZE.
# The +1 presumably reserves index 0 (the padding value), which is not a key
# of `tokenizer.word_index` — TODO confirm against the Keras Tokenizer docs.
num_words = min(MAX_VOCAB_SIZE, len(tokenizer.word_index)+1)

## Paddando (completando com 0 as posições faltantes para que todas as sequências tenham o mesmo tamanho)

In [66]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGHT entries so they stack
# into a single 2-D array. pad_sequences runs with its default padding and
# truncation settings here.
X = pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGHT)

In [67]:
# (number of documents, padded sequence length).
X.shape

(299, 500)

# Separando o conjunto de treino e teste

In [125]:
# Hold out 10% of the samples as a test set. `stratify=y` keeps the class
# proportions the same in both partitions, and the fixed `random_state` makes
# the split repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [184]:
# 10-fold stratified cross-validation. Shuffling is kept, but a fixed
# random_state makes the fold assignment repeatable between runs.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Number of output units the classifiers need: one per distinct label.
num_classes = len(possible_labels)

In [193]:
# Cross-validated accuracy of the `lstm2` architecture.
# NOTE: `lstm2` is defined in a later cell of this notebook, so that cell must
# have been executed before this one.
accuracies = []
for train_index, test_index in kf.split(X, y):
    print('=================================================================')
    # Fold-local names keep this loop from overwriting the hold-out split
    # (X_train / X_test / y_train / y_test) that later cells still use.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    # Build a new model for every fold so each one starts from fresh weights.
    fold_model = lstm2(num_classes)
    fold_model.fit(X_fold_train, y_fold_train, epochs=EPOCHS, batch_size=BATCH_SIZE)
    # Each row of the prediction matrix holds one score per class; the
    # predicted label is the column index of the largest score.
    fold_confidences = fold_model.predict(X_fold_test)
    fold_predictions = np.argmax(fold_confidences, axis=1)
    accuracies.append(accuracy_score(y_fold_test, fold_predictions))
print('Done!')

 2s 6ms/sample - loss: 0.1494 - accuracy: 0.9554
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Ep

In [194]:
# Convert the per-fold accuracies to a NumPy array so array methods such as
# .mean() are available.
accuracies = np.array(accuracies)

In [195]:
# Accuracy obtained on each cross-validation fold.
accuracies

array([0.73333333, 0.66666667, 0.6       , 0.73333333, 0.53333333,
       0.7       , 0.63333333, 0.53333333, 0.33333333, 0.5862069 ])

In [196]:
# Mean accuracy over all folds.
accuracies.mean()

0.605287356321839

# Funções para definir os modelos

In [181]:
def cnn1(num_classes):
    """Build and compile a 1-D convolutional text classifier.

    Token ids pass through a trainable embedding, then three identical stages
    of Conv1D (128 filters, kernel size 3, ReLU) followed by MaxPooling1D(3),
    a global max pool, a 128-unit ReLU dense layer and a softmax output layer.

    Reads the module-level names `num_words`, `EMBEDDING_DIM` and
    `MAX_SEQUENCE_LENGHT`.

    Args:
        num_classes: number of units in the softmax output layer.

    Returns:
        A Keras `Model` compiled with the 'adam' optimizer, the
        'sparse_categorical_crossentropy' loss and the 'accuracy' metric.
    """
    embed = Embedding(
        num_words,
        EMBEDDING_DIM,
        input_length=MAX_SEQUENCE_LENGHT,
        trainable=True,
    )
    token_ids = Input(shape=(MAX_SEQUENCE_LENGHT,))
    features = embed(token_ids)

    # Three identical convolution + pooling stages.
    for _ in range(3):
        features = Conv1D(128, 3, activation='relu')(features)
        features = MaxPooling1D(3)(features)

    features = GlobalMaxPooling1D()(features)
    features = Dense(128, activation='relu')(features)
    class_probs = Dense(num_classes, activation='softmax')(features)

    classifier = Model(token_ids, class_probs)
    classifier.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [182]:
def lstm1(num_classes):
    """Build and compile a single-layer LSTM text classifier.

    Token ids pass through a trainable embedding and a 15-unit LSTM that
    returns its full output sequence; the sequence is reduced with a global
    max pool and fed to a softmax output layer.

    Reads the module-level names `num_words`, `EMBEDDING_DIM` and
    `MAX_SEQUENCE_LENGHT`.

    Args:
        num_classes: number of units in the softmax output layer.

    Returns:
        A Keras `Model` compiled with the 'adam' optimizer, the
        'sparse_categorical_crossentropy' loss and the 'accuracy' metric.
    """
    embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGHT, trainable=True)
    input_ = Input(shape=(MAX_SEQUENCE_LENGHT,))
    x = embedding_layer(input_)
    x = LSTM(15, return_sequences=True)(x)
    x = GlobalMaxPooling1D()(x)
    # softmax rather than sigmoid: each sample has exactly one label, and
    # sparse categorical cross-entropy expects the outputs to form a
    # probability distribution over the classes. This also matches the output
    # layers of cnn1 and lstm2.
    output = Dense(num_classes, activation='softmax')(x)
    model = Model(input_, output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [192]:
def lstm2(num_classes):
    """Build and compile a stacked two-layer LSTM text classifier.

    Layer order: trainable Embedding -> LSTM(128, returning the full
    sequence) -> LSTM(64) -> Dropout(0.5) -> Dense(32) -> softmax output.

    Reads the module-level names `num_words`, `EMBEDDING_DIM` and
    `MAX_SEQUENCE_LENGHT`.

    Args:
        num_classes: number of units in the softmax output layer.

    Returns:
        A `tf.keras.Sequential` model compiled with the 'adam' optimizer, the
        'sparse_categorical_crossentropy' loss and the 'accuracy' metric.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential([
        layers.Embedding(
            input_dim=num_words,
            output_dim=EMBEDDING_DIM,
            input_length=MAX_SEQUENCE_LENGHT,
            trainable=True,
        ),
        layers.LSTM(units=128, return_sequences=True),
        layers.LSTM(units=64),
        layers.Dropout(0.5),
        # NOTE(review): no activation is given, so this layer is purely
        # linear — confirm that is intended.
        layers.Dense(32),
        layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [139]:
# Instantiate the convolutional classifier with one output unit per label.
model = cnn1(len(possible_labels))

In [140]:
# Train on the training split; `r` keeps the object returned by fit().
# NOTE(review): the saved output under this cell reports 50 epochs while
# EPOCHS is set to 100 in the definitions cell, so that output looks stale.
r = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

Train on 269 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [141]:
# Per-class scores for every test sample (one row per sample).
predictions = model.predict(X_test)

In [142]:
# Inspect the raw per-class scores.
predictions

array([[7.52629220e-01, 2.44613588e-01, 2.48592021e-03, 2.71279365e-04],
       [4.57323942e-04, 9.99310017e-01, 2.32611230e-04, 4.81157914e-09],
       [5.94759174e-02, 9.37754750e-01, 2.76541081e-03, 3.97840267e-06],
       [2.22915322e-01, 7.76472032e-01, 6.09904935e-04, 2.70933401e-06],
       [1.33837992e-03, 2.76053138e-02, 9.68958259e-01, 2.09794356e-03],
       [9.70944017e-03, 9.90289211e-01, 1.43069917e-06, 2.60647320e-11],
       [7.03034997e-01, 2.96957016e-01, 7.94072639e-06, 3.62830086e-08],
       [3.36785585e-01, 6.63205981e-01, 8.46043167e-06, 1.14716983e-08],
       [4.22340751e-01, 5.77509701e-01, 1.48930820e-04, 6.01220393e-07],
       [5.50240744e-03, 3.73522758e-01, 6.20758057e-01, 2.16815897e-04],
       [9.93622005e-01, 6.37802947e-03, 5.85898707e-09, 2.28186831e-10],
       [6.84814714e-03, 9.93002594e-01, 1.49204716e-04, 1.80610371e-08],
       [2.44801015e-07, 9.86663599e-06, 9.97363985e-01, 2.62591406e-03],
       [9.11002755e-01, 8.89971927e-02, 1.09882265e

In [143]:
# Integer buffer with one slot per test sample, to be filled with the
# predicted class index.
predictions_final = np.zeros(len(predictions), dtype='int')

In [144]:
# Predicted class per sample = column index of the highest score in each row
# of `predictions`. A single vectorised argmax replaces the per-row Python
# loop and does not depend on a pre-allocated buffer.
predictions_final = np.argmax(predictions, axis=1)

In [145]:
# Predicted class index for every test sample.
predictions_final

array([0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 1, 2, 0, 1, 0, 3, 3, 3, 0, 1, 1,
       1, 0, 0, 1, 3, 1, 1, 1])

In [146]:
# True class index for every test sample, for comparison with the predictions.
y_test

array([0, 0, 0, 1, 2, 1, 1, 0, 0, 2, 0, 0, 3, 0, 1, 0, 3, 3, 3, 0, 1, 0,
       1, 0, 0, 1, 3, 1, 0, 0])

In [147]:
# Fraction of test samples whose predicted class matches the true class.
accuracy_score(y_test, predictions_final)

0.6666666666666666

In [148]:
# True label of the first test sample.
y_test[0]

0

In [88]:
# Predicted label of the first test sample.
# NOTE(review): the saved output under this cell (3) disagrees with the first
# element of the predictions array displayed earlier (0), and the cell's
# execution counter is out of sequence — the output appears to come from an
# earlier run.
predictions_final[0]

3