**Source Citation** https://keras.io/examples/nlp/pretrained_word_embeddings/<br>
**Author:** [fchollet](https://twitter.com/fchollet)<br>
**Date created:** 2020/05/05<br>
**Last modified:** 2020/05/05<br>
**Description:** Text classification on the Newsgroup20 dataset using pre-trained GloVe word embeddings.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
# Download the 20 Newsgroups archive and extract it (untar=True).
# `data_path` is the location that Keras reports for the fetched dataset.
data_path = keras.utils.get_file(
    fname="news20.tar.gz",
    origin="http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
    untar=True,
)

In [3]:
import os
import pathlib

# The dataset is expected in a "20_newsgroup" folder that sits beside
# the path returned by the download step.
data_dir = pathlib.Path(data_path).parent.joinpath("20_newsgroup")

# One sub-directory per newsgroup.
dirnames = os.listdir(data_dir)
### print("Number of directories:", len(dirnames))
### print("Directory names:", dirnames)

# Peek at a single newsgroup to see what its files look like.
fnames = os.listdir(data_dir.joinpath("comp.graphics"))
### print("Number of files in comp.graphics:", len(fnames))
### print("Some example filenames:", fnames[:5])

In [4]:
# Read every post under `data_dir` into `samples`, keeping a parallel list of
# integer class ids in `labels`. `class_names[i]` is the directory name that
# class id `i` was assigned to (directories are visited in sorted order).
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # sample order (and therefore the seeded shuffle applied later)
    # independent of the filesystem.
    fnames = sorted(os.listdir(dirpath))
    ### print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        # The context manager closes each file as soon as it has been read,
        # instead of leaving thousands of handles open until garbage collection.
        with open(fpath, encoding="latin-1") as f:
            content = f.read()
        # Discard the first 10 lines of every file
        # (presumably the message header -- confirm against the raw data).
        lines = content.split("\n")[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

### print("Classes:", class_names)
### print("Number of samples:", len(samples))

In [5]:
# Shuffle the data
# Both lists are shuffled in place. Re-creating the generator from the same
# seed before the second shuffle replays the same permutation, so
# samples[i] and labels[i] stay paired.
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
# Slice at an explicit index rather than with a negative offset:
# `samples[:-0]` is empty and `samples[-0:]` is the whole list, so a
# validation count of 0 would otherwise leave the training set empty.
split_at = len(samples) - num_validation_samples
train_samples = samples[:split_at]
val_samples = samples[split_at:]
train_labels = labels[:split_at]
val_labels = labels[split_at:]

In [6]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Text -> integer-sequence layer: vocabulary capped at 20000 tokens,
# every output sequence has length 200.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)

# Fit the vocabulary on the training texts only, streamed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

# Map each vocabulary token to its position in the vocabulary list.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [7]:
##voc = vectorizer.get_vocabulary()
##word_index = dict(zip(voc, range(len(voc))))

In [8]:
# Embedding width of each selectable GloVe 6B file.
_GLOVE_DIMS = {
    "glove.6B.50d": 50,
    "glove.6B.100d": 100,
    "glove.6B.200d": 200,
    "glove.6B.300d": 300,
}


def _load_glove_vectors(path):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Each line is expected to hold a token followed by space-separated
    coefficients. Lines that do not split into both parts (for example
    blank lines) are skipped instead of raising.
    """
    embeddings_index = {}
    # The GloVe files are UTF-8 text. Decoding them as cp437 never fails but
    # turns every non-ASCII token into mojibake that can never match a
    # vocabulary word. errors="ignore" keeps the original best-effort handling.
    with open(path, encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) < 2:
                continue
            word, coefs = parts
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    return embeddings_index


def _build_embedding_matrix(embeddings_index, token_to_row, num_tokens, embedding_dim):
    """Return ``(matrix, hits, misses)`` for the given vocabulary.

    Row ``i`` of the matrix holds the pre-trained vector of the token mapped
    to ``i``. Tokens missing from ``embeddings_index`` keep an all-zero row
    and are counted as misses.
    """
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    hits = 0
    misses = 0
    for word, i in token_to_row.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            misses += 1
            continue
        embedding_matrix[i] = embedding_vector
        hits += 1
    return embedding_matrix, hits, misses


def _build_classifier(embedding_layer, num_classes):
    """Assemble the 1D-convolutional classifier on top of ``embedding_layer``."""
    layers = keras.layers
    int_sequences_input = keras.Input(shape=(None,), dtype="int64")
    x = embedding_layer(int_sequences_input)
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    preds = layers.Dense(num_classes, activation="softmax")(x)
    return keras.Model(int_sequences_input, preds)


def train_model(button):
    """Train and save the newsgroup classifier with the selected GloVe file.

    Intended as the click handler of the "Train" button.

    Args:
        button: The object passed in by the click callback. It is not used.

    Reads these notebook-level names, which must exist before the handler
    runs: ``file`` (the dropdown widget), ``voc``, ``word_index``,
    ``vectorizer``, ``class_names``, ``train_samples``, ``val_samples``,
    ``train_labels`` and ``val_labels``.

    Side effects: prints progress, trains for 20 epochs, and saves the model
    under ``"glove-newsgroups"``.
    """
    print("file: ", file)
    glove_name = file.value
    # Join the path component by component so the platform's separator is
    # used; a hard-coded backslash string only resolves on Windows.
    path_to_glove_file = os.path.join(
        os.path.expanduser("~"),
        "NMHU",
        "BSSD5350",
        "Lessons",
        "L06",
        "glove.6B",
        glove_name + ".txt",
    )

    embeddings_index = _load_glove_vectors(path_to_glove_file)
    print("Found %s word vectors." % len(embeddings_index))

    # `word_index` only assigns rows 0 .. len(voc) - 1, so the "+ 2" leaves
    # two trailing rows that are never filled. Kept unchanged so the
    # embedding layer keeps the same shape as before.
    num_tokens = len(voc) + 2
    # Unknown selections fall back to 50 dimensions, as before.
    embedding_dim = _GLOVE_DIMS.get(glove_name, 50)

    # Prepare embedding matrix
    # Words not found in the embedding index stay all-zeros.
    embedding_matrix, hits, misses = _build_embedding_matrix(
        embeddings_index, word_index, num_tokens, embedding_dim
    )
    print("Converted %d words (%d misses)" % (hits, misses))

    # Frozen embedding layer initialised from the pre-trained vectors.
    embedding_layer = keras.layers.Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    model = _build_classifier(embedding_layer, len(class_names))
    model.summary()

    # Each sample is wrapped in its own one-element list before vectorizing.
    x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
    x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

    y_train = np.array(train_labels)
    y_val = np.array(val_labels)

    model.compile(
        loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
    )
    model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))

    # The save/load code can be copied from notebook 6.1, with a different
    # output filename.
    model.save("glove-newsgroups")
    # Author's note (truncated in the original): "this created a 126.4 MB fo..."

In [9]:
from IPython.display import display
from ipywidgets import Dropdown, Button

# IPython shell escape: runs the `jupyter` command line tool to enable the
# widgetsnbextension notebook extension before any widget is displayed.
!jupyter nbextension enable --py widgetsnbextension

def dropdown_eventhandler(change):
    """Print the value carried in ``change.new``.

    Args:
        change: Change notification object; only its ``new`` attribute is read.
    """
    selected = change.new
    print(selected)

# Dropdown for picking which GloVe file to train with; every change of its
# value is reported to dropdown_eventhandler.
option_list = (
    'glove.6B.50d',
    'glove.6B.100d',
    'glove.6B.200d',
    'glove.6B.300d',
)
file = Dropdown(options=option_list, description="Choose a file:")
file.observe(dropdown_eventhandler, names='value')

# Button that starts training when clicked.
btn = Button(description="Train")
btn.on_click(train_model)

# Render the dropdown first, then the button.
display(file)
display(btn)

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


Dropdown(description='Choose a file:', options=('glove.6B.50d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.30…

Button(description='Train', style=ButtonStyle())

glove.6B.200d
file:  Dropdown(description='Choose a file:', index=2, options=('glove.6B.50d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d'), value='glove.6B.200d')
Found 400000 word vectors.
Converted 18018 words (1982 misses)
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 200)         4000400   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         128128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         82048     
____________________