# Tutorial - Text Mining - Classification - Word embedding
We will predict the category of discussion posts in a newsgroup.

**The unit of analysis is a discussion post**

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so NumPy-based randomness repeats across runs.
np.random.seed(33)

In [None]:
# Load the posts from news.csv (relative path, so it is read from the working directory).
news = pd.read_csv('news.csv')

In [None]:
# Preview the first five rows of the loaded data.
news.head(5)

## Change the target variable to ordinal

This is a multi-class classification problem. There are three categories we will predict:<br>
Whether a post is "graphics," "hockey," or "medical" related

#### Keras cannot train on text-based target values, so we convert each category to an integer code. The encoder below assigns "ordinal" values, but the ordering carries no meaning here; it is only a convenient way to obtain one integer per category.

In [None]:
# Convert the text target ("newsgroup") to one integer class id per category
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

# fit_transform returns a float array of shape (n_rows, 1). Flatten it and cast
# to int64 so the new "target" column holds integer class ids (0, 1, 2) instead
# of floats (0.0, 1.0, 2.0).
# NOTE(review): assumes "newsgroup" has no missing values - confirm before relying on the cast.
news['target'] = enc.fit_transform(news[['newsgroup']]).ravel().astype('int64')



In [None]:
# Preview the data again, now including the new "target" column.
news.head()

In [None]:
# The encoded "target" column is what the model will predict.
target = news['target']

## Assign the "text" (input) variable

In [None]:
# Check for missing values: count the missing entries in the TEXT column

news[['TEXT']].isna().sum()

In [None]:
# The TEXT column is the only input (predictor) used in this tutorial.
input_data = news['TEXT']

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the posts for testing; the fixed random_state makes the
# partition the same on every run.
train_set, test_set, train_y, test_y = train_test_split(
    input_data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Show the shapes of the training inputs and the training targets.
train_set.shape, train_y.shape

In [None]:
# Show the shapes of the test inputs and the test targets.
test_set.shape, test_y.shape

In [None]:
# Display the training posts.
train_set

In [None]:
# Display the test posts.
test_set

## Keras: Tokenizer

In [None]:
# The tokenizer ships with TensorFlow's Keras API, so TensorFlow must be installed first
from tensorflow.keras.preprocessing.text import Tokenizer

# "num_words" keeps only the most frequently occurring terms (those whose word
# index is below N); "num_words=None" keeps every term.
# "filters" lists the characters removed from the text, and "lower=True"
# lower-cases it, before terms are extracted.
keras_tokenizer = Tokenizer(
    num_words=500,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
)

# Learn the vocabulary from the training posts only
keras_tokenizer.fit_on_texts(train_set)


In [None]:
# With the vocabulary fixed by the fitted tokenizer, rewrite each document as
# the sequence of its terms' integer indices (train first, then test).

train_sequence, test_sequence = (
    keras_tokenizer.texts_to_sequences(texts) for texts in (train_set, test_set)
)
                                                                                                              

In [None]:
# Inspect what the fitted tokenizer learned. Only word_index is printed; the
# commented-out lines are other tokenizer attributes that can be printed the
# same way by uncommenting them.
#print(keras_tokenizer.word_counts)
#print(keras_tokenizer.document_count)
print(keras_tokenizer.word_index)
#print(keras_tokenizer.word_docs)

In [None]:
# Print the training sequences (one list per document).
# Be careful: the sequences are of unequal length

print(train_sequence)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Documents differ in length, so the shorter sequences are filled with zeros at
# the end ("post" padding). maxlen sets a predefined length for every sequence,
# which also trims the long ones.
padded_train = pad_sequences(
    train_sequence,
    maxlen=200,
    padding='post',
)

In [None]:
# Print the padded training sequences.
print(padded_train)

In [None]:
# Show the dimensions of the padded training array.
padded_train.shape

In [None]:
# Pad/trim the test sequences with the same settings used for train; otherwise
# the test array would not be compatible with the train array.
padded_test = pad_sequences(
    test_sequence,
    maxlen=200,
    padding='post',
)

padded_test.shape

In [None]:
# Print the padded test sequences.
print(padded_test)


## Embedding

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding

# input_dim  = vocabulary size (the tokenizer was created with num_words=500)
# output_dim = dimension of embedding (vector for one term)
# mask_zero  = treat index 0, the value used for padding, as "no term"
#
# The sequence length (number of columns) is taken from the array the layer is
# called on. The former input_length argument is deprecated in Keras 3 and is
# not needed when the layer is called directly, so it is no longer passed.

embedding = Embedding(input_dim=500, output_dim=4, mask_zero=True)

In [None]:
# Pass the padded training sequences through the embedding layer.
train_embedding = embedding(padded_train)

# Show the shape of the result.
train_embedding.shape

In [None]:
# Convert the embedding output to a NumPy array for display.
train_embedding.numpy()

In [None]:
# Inspect the first document of train: select entry 0 of the embedding output

train_embedding.numpy()[0]

# Neural Network (to be discussed in a later module)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Seed TensorFlow's global random generator so the random weight initialisation
# starts from the same state on every run. Seeding NumPy alone does not cover
# TensorFlow. (Exact repeatability can still vary with hardware.)
tf.random.set_seed(33)

# Architecture:
#   Input     - 200 token indices per post (the padded sequence length)
#   Embedding - a 50-dimensional vector for each of the 500 vocabulary indices
#   Flatten   - joins the 200 x 50 vectors of a post into one long vector
#   Dense     - 100 ReLU units
#   Dense     - 3 softmax units, one per newsgroup category
#
# The sequence length is declared with an explicit Input layer instead of the
# Embedding layer's input_length argument, which is deprecated in Keras 3.
# Declaring the input shape lets the model build immediately, so
# model.summary() can report output shapes and parameter counts.
model = Sequential([
    Input(shape=(200,), dtype='int32'),
    Embedding(input_dim=500, output_dim=50),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(3, activation='softmax'),
])

# "sparse" categorical cross-entropy: each target is a single class id, not a one-hot vector
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 25 epochs; the held-out test data is passed as validation data so
# it is scored alongside the training data.
history = model.fit(
    padded_train,
    train_y,
    epochs=25,
    validation_data=(padded_test, test_y),
    verbose=1,
)

In [None]:
# Report the accuracy recorded for the final epoch on each data set.
final_train_accuracy = history.history['accuracy'][-1]
final_test_accuracy = history.history['val_accuracy'][-1]

print("train accuracy =", final_train_accuracy)
print("test accuracy =", final_test_accuracy)