In [0]:
import numpy as np
import pandas as pd
import os
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from keras.layers import Input, Embedding, GRU, LSTM, MaxPooling1D, GlobalMaxPool1D
from keras.layers import Dropout, Dense, Activation, Flatten,Conv1D, SpatialDropout1D
from keras.models import Sequential
from keras.optimizers import RMSprop 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [0]:
# max_len:   number of tokens every abstract is padded/truncated to.
# max_words: vocabulary cap handed to the tokenizer; also the number of rows
#            in the embedding matrix.
max_len, max_words = 126, 10000

In [0]:
# Load the abstracts and their labels; malformed rows are dropped with a warning.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines='warn'` (same drop-and-warn behaviour). Try the
# current keyword first and fall back for older pandas that does not know it.
_csv_path = '/content/drive/My Drive/MyCovid/abstract_clusters.csv'
try:
    df = pd.read_csv(_csv_path, encoding='utf-8', on_bad_lines='warn', engine='python')
except TypeError:
    # pandas < 1.3: `on_bad_lines` is an unexpected keyword argument.
    df = pd.read_csv(_csv_path, encoding='utf-8', error_bad_lines=False, engine='python')

In [0]:
# Discard every row that has a missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [0]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so the row order is the same on every
# Restart & Run All (an unseeded shuffle gives a different order each run).
df = shuffle(df, random_state=0)

In [0]:
# -n: never overwrite files that already exist, so re-running this cell does
# not stop at unzip's interactive "replace?" prompt once the archive has been
# extracted.
!unzip -n "/content/drive/My Drive/Paragram/glove.840B.300d.zip"

In [0]:
# Raw abstract texts, one entry per row of df.
X = df.loc[:, 'Abstract'].values

In [0]:
# Target labels, one entry per row of df.
y = df.loc[:, 'Label'].values

In [0]:
# Tokenize the raw abstracts and turn them into fixed-length integer sequences.
# The texts are read from `df` instead of from `X`: `X` is rebound at the end
# of this cell to the padded matrix, so reading `X` here would feed integer
# arrays to the tokenizer if the cell were ever run a second time.
texts = df['Abstract'].values
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
# convert texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(texts)
# word -> index mapping built by the tokenizer
word_index = tokenizer.word_index
# report how many distinct tokens were seen
print('{} of unique tokens found'.format(len(word_index)))
# pad/truncate every sequence to max_len
X = pad_sequences(sequences, maxlen=max_len)

In [0]:
# Build a word -> coefficient-vector lookup from the pretrained GloVe text file.
embeddings_index = {}
try:
    # `with` closes the handle even if parsing fails part-way through; the
    # encoding is explicit so reading does not depend on the platform default.
    with open('glove.840B.300d.txt', encoding='utf-8') as f:
        for line in f:
            # Each line is "<token> <300 coefficients>". Split from the right
            # so a token that itself contains spaces stays in one piece
            # instead of leaking non-numeric fragments into the coefficients
            # (which would make the float conversion below raise ValueError).
            values = line.rstrip('\n').rsplit(' ', 300)
            word = values[0]
            coeff = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coeff
except IOError:
    # Best-effort: the notebook keeps running with an empty index, in which
    # case no pretrained vector will be found for any word downstream.
    print('cannot read file. check file paths')

In [0]:
       # prepare glove word-embedding matrix
        # create empty embedding tensor
# One 300-d row per vocabulary index; rows stay all-zero for words that have
# no pretrained vector.
embedding_matrix = np.zeros((max_words, 300))
for token, idx in word_index.items():
    # Only indices below the vocabulary cap have a row in the matrix.
    if idx < max_words:
        vector = embeddings_index.get(token)
        if vector is None:
            # Fall back to the capitalized spelling of the word.
            vector = embeddings_index.get(token.capitalize())
        if vector is not None:
            embedding_matrix[idx] = vector

In [0]:
# Keep 80% for training; the remaining 20% (stratified by label) is split
# into validation and test sets afterwards.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [0]:
# Halve the held-out portion (stratified by label) into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test,
    test_size=0.5,
    stratify=y_val_test,
    random_state=0,
)

In [0]:
# Embedding (initialised from embedding_matrix, trainable) -> Conv1D ->
# max-pooling -> LSTM -> 10-unit softmax output.
model = Sequential([
    Embedding(max_words, 300, weights=[embedding_matrix], trainable=True),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(4),
    LSTM(64, dropout=0.1),
    Dense(10, activation='softmax'),
])
model.summary()

In [0]:
# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
Adam = keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam,
    # This metric name is also the key under which Keras records the
    # per-epoch values in the training history.
    metrics=['sparse_categorical_accuracy'],
)

In [0]:
# Train for 5 epochs in batches of 32, evaluating on the validation split
# after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
)

In [0]:
# Model outputs for the held-out test set (one row of softmax scores per sample).
prediction = model.predict(x=X_test, verbose=1)

In [0]:
# Predicted class = column index of the highest score in each row.
preds = prediction.argmax(axis=1)

In [0]:
from sklearn.metrics import classification_report

# Text report comparing test labels with the predicted classes.
report = classification_report(y_test, preds)
print(report)

In [0]:
from sklearn.metrics import accuracy_score

# Overall test accuracy; left as the last expression so the notebook displays it.
accuracy_score(y_true=y_test, y_pred=preds)

In [0]:
def plot_training_and_validation(acc, val_acc, loss, val_loss):
    """Plot training vs. validation accuracy and loss, one panel each.

    Every argument is a sequence holding one value per epoch. The helper is
    defined here because no other cell of the notebook defines it.
    """
    epochs = range(1, len(acc) + 1)
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

    ax_acc.plot(epochs, acc, 'bo-', label='Training accuracy')
    ax_acc.plot(epochs, val_acc, 'r^-', label='Validation accuracy')
    ax_acc.set(title='Training and validation accuracy',
               xlabel='Epoch', ylabel='Accuracy')
    ax_acc.legend()

    ax_loss.plot(epochs, loss, 'bo-', label='Training loss')
    ax_loss.plot(epochs, val_loss, 'r^-', label='Validation loss')
    ax_loss.set(title='Training and validation loss',
                xlabel='Epoch', ylabel='Loss')
    ax_loss.legend()

    fig.tight_layout()
    plt.show()


# define plotting metrics
# Keras records each metric under the name it was compiled with, so look the
# accuracy key up instead of assuming 'acc' (which raises KeyError when the
# model was compiled with 'sparse_categorical_accuracy').
training_log = history.history
acc_key = next(
    (key for key in ('sparse_categorical_accuracy', 'acc', 'accuracy')
     if key in training_log),
    None,
)
if acc_key is None:
    raise KeyError('no accuracy metric found in history: {}'.format(sorted(training_log)))
acc = training_log[acc_key]
val_acc = training_log['val_' + acc_key]
loss = training_log['loss']
val_loss = training_log['val_loss']
# plot model training and validation accuracy and loss
plot_training_and_validation(acc, val_acc, loss, val_loss)