<a href="https://colab.research.google.com/github/shreyawalia/ai-powered-file-management/blob/main/word2vec_ml_sdg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade keras

Requirement already up-to-date: keras in /usr/local/lib/python3.6/dist-packages (2.4.3)


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from keras import optimizers
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Flatten
from keras.models import Model, Sequential
from keras.initializers import Constant
# Conv
from keras.layers import Conv1D, MaxPooling1D, Embedding
# LSTM
from keras.layers import Dense, Input, Embedding, Dropout, SpatialDropout1D, Bidirectional, GRU, LSTM
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

import gensim
from gensim.models import Word2Vec

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import Counter
from keras.models import load_model
from keras.optimizers import Adam, RMSprop


import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
base_dir = "/content/drive/My Drive/sdg/"


In [None]:
TEXT_DATA_DIR = f"{base_dir}dataset/sdg_tag.csv"
EMBEDDINGS_DIR = f"{base_dir}embeddings/word2vec/"
CROSS_FOLDS = f"{base_dir}dataset/cross_validation/"

MAX_SEQUENCE_LENGTH = 500
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
NUM_EPOCHS = 20
BATCH_SIZE = 128
labels_index = [str(i) for i in range(1,18)]

In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [None]:
labelled = pd.read_csv(TEXT_DATA_DIR)
labelled['description'] = labelled['description'].str.lower()
labelled['description'] = labelled['description'].apply(cleanHtml)
labelled['description'] = labelled['description'].apply(cleanPunc)
labelled['description'] = labelled['description'].apply(keepAlpha)

In [None]:
vocab = Counter()

# Masked for training and valid. This will be part of the vocab and index
texts = [word_tokenize(t.lower()) for t in labelled.description]

# Same masked vocab, embeddings and index
for text in texts:
    vocab.update(text)    
model = Word2Vec(texts, size=EMBEDDING_DIM, window=5, min_count=5, workers=16, sg=0, negative=5)
word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NUM_WORDS))}

In [None]:
word_vectors = model.wv

In [None]:
# Masked padded sequences for training
masked_sequences = np.array([[word_index.get(t, 0) for t in text]
             for text in texts])
masked_data = pad_sequences(masked_sequences, maxlen=MAX_SEQUENCE_LENGTH)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
masked_sequences

array([list([135, 2379, 5, 203, 4, 294, 1, 32, 19, 114, 39, 295, 307, 14, 3908, 5, 82, 275, 1, 118, 47, 80, 8, 26, 4388, 928, 7, 31, 2, 164, 240, 5843, 2, 263, 5, 695, 10, 1, 37, 961, 6, 747, 3, 1787, 2980, 2, 9481, 74, 2, 84, 11, 6, 691, 3, 47, 80, 25, 1345, 4, 18, 378, 28, 1, 31, 2, 164, 356, 9, 1906, 31, 526, 546, 27, 8, 26, 2123, 1526, 7, 5843, 4, 346, 95, 99, 6, 158, 3, 197, 2, 369, 5, 203, 4, 1346, 1, 121, 3, 183, 2, 33, 216, 1036, 4, 335, 263, 2, 37, 77, 5, 203, 4, 1108, 389, 307, 241, 1, 39, 5, 21, 1198, 201, 2, 565, 601, 16, 10326, 14784, 25, 662, 6, 17, 4, 208, 6, 197, 2, 369, 427, 2572, 5, 280, 9, 31, 240, 2, 161, 263, 12, 14, 449, 798, 4, 1, 39, 5, 687, 5, 0, 695, 366, 2, 73, 1, 17, 327, 28, 258, 1, 2572, 3, 197, 2, 369, 314, 12, 4511, 4, 1, 957, 901, 5, 102, 106, 2, 273, 5, 695, 324, 234, 93, 2326, 4286, 88, 13, 2981, 1, 451, 2, 630, 2054, 13, 17, 101, 13, 2981, 1, 74, 2, 1692, 84, 4660, 2054, 13, 5843, 243, 9, 1, 84, 1660, 30, 669, 84, 533, 451, 1, 427, 2572, 330, 1312, 1

In [None]:
masked_data

array([[    0,     0,     0, ..., 11377,  1376,  8191],
       [    0,     0,     0, ...,   671,  7296,  7297],
       [    2,  8755,  1833, ...,   109, 12805,   361],
       ...,
       [    0,     0,     0, ...,     4,  6091,  6612],
       [    0,     0,     0, ...,   170,   368,   331],
       [ 2304,     1,  7294, ...,   109,     0,   223]], dtype=int32)

In [None]:
mlb = MultiLabelBinarizer(classes = ("goal_1", "goal_2", "goal_3", "goal_4", "goal_5", "goal_6", "goal_7", "goal_8", "goal_9", "goal_10", "goal_11", "goal_12", "goal_13", "goal_14", "goal_15", "goal_16", "goal_17"))
#create boolean mask matched non NaNs values
mask = labelled['Tag'].notnull()

labels = np.array(mlb.fit_transform(labelled.loc[mask, 'Tag'].dropna().str.strip('[]').str.split(',')))

In [None]:
models = []
arch = 'Conv1D_glorot_uniform'
is_mask = "masked"

for fold in os.listdir(CROSS_FOLDS):
    train_index = np.load(f"{CROSS_FOLDS}{fold}/train.npy")
    val_index = np.load(f"{CROSS_FOLDS}{fold}/val.npy")
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    x_train, x_val, x_test = masked_data[train_index], masked_data[val_index], masked_data[test_index]
    y_train, y_val, y_test = labels[train_index], labels[val_index], labels[test_index]
    
        
    print(F"Training {fold}")

    print('Preparing embedding matrix.')
    # prepare embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    
    for word, i in word_index.items():
        if i > MAX_NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        except:
            pass   
    
    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    
    print('Training model.')
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # 0.22
    if arch == 'conv': 
        # 1D convnet with global maxpooling
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(len(labels_index), activation='sigmoid')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy', 
                    optimizer=Adam(lr=0.01), 
                    metrics=['accuracy'])
    
    
    # 0.16, 8 epochs without Bidirectional
    # 0.15, 8 epochs with Bidirectional
    # 0.13, 10 epochs with Bidirectional
    if arch == "bidirectionalGRU":
        x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1,recurrent_dropout=0.1))(embedded_sequences)
        x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([avg_pool, max_pool])
        preds = Dense(17, activation="sigmoid")(x)
        model = Model(sequence_input, preds)
        model.summary() 
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

     # around .21, 10 epochs with Bidirectional
    if arch == "Bidirectional_LSTM":
        x = Bidirectional(LSTM(25, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedded_sequences)
        x = GlobalMaxPooling1D()(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(17, activation="sigmoid")(x)
        model = Model(inputs=sequence_input, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        
        
    if arch == "Conv1D_glorot_uniform":
        x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(embedded_sequences)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([avg_pool, max_pool])
        preds = Dense(len(labels_index), activation='sigmoid')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy', 
                #optimizer=Adam(lr=0.001),
                optimizer='rmsprop',
                metrics=['accuracy'])
    
    model.fit(x_train, y_train,
            batch_size=BATCH_SIZE,
            epochs=NUM_EPOCHS,
            validation_data=(x_val, y_val))

    models.append([model, x_test, y_test])
    model.save(EMBEDDINGS_DIR + f"{is_mask}{arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5")

    

Training fold_1
Preparing embedding matrix.
Training model.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training fold_2
Preparing embedding matrix.
Training model.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training fold_3
Preparing embedding matrix.
Training model.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training fold_4
Preparing embedding matrix.
Training model.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
E

In [None]:

def metrics_avg(models_testx_testy, labels_, thres=0.3):
    def calc(model, test_x, test_y):
        predictions = model.predict(test_x)>thres
        metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
        metrics_df = pd.DataFrame.from_dict(metrics)
        h = hamming_loss(test_y, predictions)
        roc = roc_auc_score(test_y, predictions, average='micro')
        return metrics_df, h, roc

    model_1, test_x_first, test_y_first = models_testx_testy[0]
    metrics_agg, ham, roc = calc(model_1, test_x_first, test_y_first)
    n = len(models_testx_testy)

    for model, test_x, test_y in models_testx_testy[1:]:
        metrics, h, r = calc(model, test_x, test_y)
        metrics_agg += metrics
        ham += h
        roc += r

    return metrics_agg/n, ham/n, roc/n

In [None]:
loaded_arch = 'maskedConv1D_glorot_uniform'
loaded_models = []
final_models = []
for i, fold in enumerate(os.listdir(CROSS_FOLDS)):
    print(f"Loading {fold}...")
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    x_test = masked_data[test_index]
    y_test = labels[test_index]
    
    load_dir = EMBEDDINGS_DIR + f"{loaded_arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5"
    
    final_models.append((loaded_models[i], x_test, y_test))
print(f"Finished loading the {loaded_arch} models.")

Loading fold_1...


IndexError: ignored

In [None]:
avg_results = metrics_avg(models, labels_index, thres=0.3)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
avg_results[0]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,micro avg,macro avg,weighted avg,samples avg
precision,0.31388,0.230229,0.29146,0.410082,0.316481,0.243094,0.276805,0.334438,0.213759,0.226546,0.316958,0.29939,0.29754,0.17177,0.250106,0.282028,0.428877,0.319589,0.288438,0.304945,0.324689
recall,0.617798,0.275868,0.441237,0.761963,0.50991,0.321677,0.383333,0.766928,0.160911,0.336842,0.401052,0.388439,0.502349,0.070588,0.308292,0.393789,0.945092,0.502759,0.446239,0.502759,0.540859
f1-score,0.412096,0.23294,0.313526,0.530061,0.376215,0.249995,0.316556,0.46109,0.181724,0.236786,0.338333,0.322345,0.354064,0.075101,0.262982,0.325708,0.589433,0.387543,0.328174,0.359908,0.336721
support,109.8,88.6,97.0,139.6,111.0,79.6,84.0,120.2,69.6,76.0,96.2,89.6,114.4,50.8,77.2,91.4,163.8,1658.8,1658.8,1658.8,1658.8


In [None]:
hl = round(avg_results[1],4)
roc_auc = round(avg_results[2],4)
print(f"hl;{hl}")
print(f"roc-auc;{roc_auc}")

hl;0.3984
roc-auc;0.5692
