# Tutorial - Text Mining - Classification - KERAS

We will predict the category of discussion posts in a newsgroup.

**The unit of analysis is a discussion post**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the discussion posts from news.csv (path is relative to the working directory)
news = pd.read_csv('news.csv')

In [None]:
# Preview the first five rows
news.head(5)

## Change the target variable to ordinal

This is a multi-class classification problem. There are three categories we will predict:<br>
Whether a post is "graphics," "hockey," or "medical" related

#### Keras doesn't like text-based target values, so we have to convert them to "ordinal" values. Here, this simply means mapping each category to an integer value.

In [None]:
# Encode the text categories in "newsgroup" as integer codes
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

# OrdinalEncoder expects a 2-D input (hence the double brackets) and returns a
# 2-D float array; flatten it to one value per row and cast to int so that
# "target" really holds integer class codes
news['target'] = enc.fit_transform(news[['newsgroup']]).ravel().astype(int)



In [None]:
# Preview the data again; it now includes the new "target" column
news.head()

In [None]:
# The encoded category column is the value the models will predict
target = news['target']

## Assign the "text" (input) variable

In [None]:
# Count the missing values in the TEXT column

news[['TEXT']].isna().sum()

In [None]:
# The raw post text is the only input variable
input_data = news['TEXT']

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the posts for testing; the fixed seed makes the split repeatable
train_set, test_set, train_y, test_y = train_test_split(
    input_data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the training inputs and the training targets
train_set.shape, train_y.shape

In [None]:
# Shapes of the test inputs and the test targets
test_set.shape, test_y.shape

## Keras: Tokenizer

In order to use Keras, you first need to install tensorflow. You can start the Anaconda Prompt and enter the following to do so: `pip install tensorflow`

Keras Tokenizer works a little differently from scikit-learn (but the idea is the same)

In [None]:
# Import the tokenizer (requires TensorFlow to be installed)
from tensorflow.keras.preprocessing.text import Tokenizer

# "num_words" caps the vocabulary at the most frequently occurring terms
# (Keras keeps the num_words - 1 most common ones); "num_words=None" keeps all terms.
# "filters" lists the characters removed from the text, and "lower=True"
# converts the text to lowercase.
keras_tokenizer = Tokenizer(
    num_words=500,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
)

# Learn the vocabulary from the training texts only
keras_tokenizer.fit_on_texts(train_set)


In [None]:
# With the vocabulary fitted, build the matrix for the training texts
# (one row per document, one column per term).
# Available modes: 'binary', 'count', 'freq' (ratio of terms in a document),
# and 'tfidf' (used here) -- change the "mode" argument to try another one.
# NOTE: the variable keeps the name "train_binary_matrix" whichever mode is chosen.
train_binary_matrix = keras_tokenizer.texts_to_matrix(
    train_set,
    mode='tfidf',
)

# (number of training documents, number of term columns)
train_binary_matrix.shape

In [None]:
# Now transform the test data set with the tokenizer fitted on the training data.
# Use the same mode as for the training matrix
# ('binary', 'count', 'freq' (ratio of terms in a document), or 'tfidf').
test_binary_matrix = keras_tokenizer.texts_to_matrix(
    test_set,
    mode='tfidf',
)

# (number of test documents, number of term columns)
test_binary_matrix.shape

In [None]:
# Display the training matrix
train_binary_matrix

In [None]:
# Inspect the fitted tokenizer's attributes (uncomment a line to print another one):

print(keras_tokenizer.word_counts)
#print(keras_tokenizer.document_count)
#print(keras_tokenizer.word_index)
#print(keras_tokenizer.word_docs)


# word_counts: A dictionary of words and their counts.
# word_docs: A dictionary of words and how many documents each appeared in.
# word_index: A dictionary of words and their uniquely assigned integers.
# document_count: An integer count of the total number of documents that were used to fit the Tokenizer.

### We are not applying SVD here, but you can if you want. It is a matter of preference. 
### Also, note that we limited the matrix to 500 columns (by selecting the most commonly occurring terms)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 

from sklearn.metrics import accuracy_score

In [None]:
# Random forest with 300 trees, using all available CPU cores (n_jobs=-1).
# A fixed random_state makes the fitted forest, and therefore the reported
# accuracies, repeatable from run to run (same seed as the train/test split).
rnd_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)

rnd_clf.fit(train_binary_matrix, train_y)



## Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the random forest on the data it was trained on
train_y_pred = rnd_clf.predict(train_binary_matrix)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Accuracy of the random forest on the held-out test data
test_y_pred = rnd_clf.predict(test_binary_matrix)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print(f'Test acc: {test_acc}')

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Usually created on the test set; rows are the actual classes,
# columns are the predicted classes
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

## Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent: at most 100
# passes over the training data, with a stopping tolerance of 1e-3.
# SGD shuffles the training data, so fix random_state to get repeatable
# results (same seed as the train/test split).
sgd_clf = SGDClassifier(max_iter=100, tol=1e-3, random_state=42)


In [None]:
# Train the SGD classifier on the training matrix and training targets
sgd_clf.fit(train_binary_matrix, train_y)

## Accuracy

In [None]:
# Accuracy of the SGD classifier on the data it was trained on
train_y_pred = sgd_clf.predict(train_binary_matrix)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Accuracy of the SGD classifier on the held-out test data
test_y_pred = sgd_clf.predict(test_binary_matrix)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print(f'Test acc: {test_acc}')

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Usually created on the test set; rows are the actual classes,
# columns are the predicted classes
confusion_matrix(y_true=test_y, y_pred=test_y_pred)