# Template Text Classifier
This is a text classifier I used for one of my previous projects. The dataset has been redacted due to data protection concerns. Feel free to use the code as a template for your own dataset.

## Load Required Packages

In [1]:
import os
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import random
import nltk
import math
from nltk.stem import PorterStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, cross_validate, GridSearchCV, cross_val_predict, KFold, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer, cohen_kappa_score, f1_score, confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import spacy

In [None]:
# fetch the NLTK resources used during preprocessing
for resource in ('stopwords', 'names', 'brown', 'wordnet',
                 'averaged_perceptron_tagger', 'universal_tagset', 'punkt'):
    nltk.download(resource)

## Preprocessing
The goal of preprocessing is to clean up noisy text that might affect the performance of the text classifier. I did the following steps in preprocessing: lemmatizing words, converting them to lower case, removing punctuation, and normalizing non-standard words.

In [6]:
# Helper functions to preprocess data
import string
from spacy.lang.en import English
from normalise import normalise

# Punctuation characters; spacy_tokenizer drops tokens found in this string
punctuations = string.punctuation

# Load the spaCy English model through the 'en' shortcut; spacy_tokenizer runs
# it on every sentence to obtain lemmas.
# NOTE(review): the 'en' shortcut is presumably only valid on spaCy v2 - newer
# releases expect a full model name such as 'en_core_web_sm'; confirm.
nlp = spacy.load('en')

# Plain English language object from spacy.lang.en (no model is loaded here)
parser = English()


# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Turn one piece of raw text into a list of cleaned tokens.

    Steps, in order: lemmatize and lowercase every token produced by the
    module-level ``nlp`` pipeline, drop punctuation tokens, then expand
    non-standard words with ``normalise``.

    Args:
        sentence: the raw text to tokenize.

    Returns:
        The preprocessed tokens (a list on the '-4th' path, otherwise
        whatever ``normalise`` returns).
    """
    # Lemmatize and lowercase each token. Tokens whose lemma is the
    # placeholder "-PRON-" (presumably spaCy's pronoun lemma) keep their
    # lowercased surface form instead.
    mytokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in nlp(sentence)
    ]

    # Remove punctuation. This is a substring test against the punctuation
    # string, so empty tokens are dropped as well.
    mytokens = [tok for tok in mytokens if (tok not in punctuations)]

    # Normalise. When '-4th' is present, its first occurrence is spelled out
    # by hand and normalise() is skipped for the whole sentence.
    # NOTE(review): presumably a workaround for normalise() failing on this
    # token - confirm before removing.
    if '-4th' in mytokens:
        mytokens[mytokens.index('-4th')] = 'to the power of negative four'
    else:
        mytokens = normalise(mytokens, verbose=False)

    # return preprocessed list of tokens
    return mytokens

# glue a list of tokens back together into a single string
def join_tokens(tokens):
    """Return the tokens concatenated into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

To load your own dataset, replace "MY_DATA" with your own data path. The code here loads data from csv to pandas dataframe, but you can load other types of data as long as you change the code.

In [None]:
# read the raw dataset from a csv in the working directory into a dataframe
data_path = os.path.join(os.getcwd(), "MY_DATA")
data = pd.read_csv(filepath_or_buffer=data_path)
# preview the first five rows
data.head(n=5)

In [None]:
# clean data

# Tokenize every row of the dataframe loaded above and join the tokens back
# into a single string per row.
# Replace 'text_without_emoji' with the name of the raw text column in your own data
tokenized = [
    join_tokens(spacy_tokenizer(row.text_without_emoji))
    for row in data.itertuples()
]

# Replace 'features' with the name of the feature column in your own data
data['features'] = tokenized
data.dropna(subset=['features'], inplace=True)

# save the cleaned dataframe so preprocessing does not have to be repeated
clean_path = os.path.join(os.getcwd(), "CLEAN_DATA")
data.to_csv(path_or_buf=clean_path)

Once you preprocess your text and save it in a csv, you can load the cleaned text directly next time you classify your text.

In [None]:
# read the previously cleaned dataset back from csv
data_path = os.path.join(os.getcwd(), "CLEAN_DATA")
data = pd.read_csv(filepath_or_buffer=data_path)
# preview the first five rows
data.head(n=5)

## Random Forest Classifier
For the baseline model, I build a pipeline that extracts word features as TF-IDF vectors and classifies the text with a random forest. I tune the model hyperparameters using grid-search cross-validation on stratified 10-fold splits.

In [8]:
# define X (cleaned text) and y (class labels) as arrays
# You may replace 'features' and 'labels' with your own features and labels column names
X, y = data['features'].values, data['labels'].values

In [9]:
# create text classification pipeline: TF-IDF word uni/bigrams -> random forest
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2), sublinear_tf=True)),
    ('clf', RandomForestClassifier(
        n_estimators=200,
        criterion='gini',
        max_features=None,
        bootstrap=True,
        max_samples=None,
        class_weight='balanced_subsample',
        random_state=0,
        n_jobs=-1,
    )),
])

In [None]:
# tune pipeline parameters on stratified 10-fold

# scorers reported for every candidate; the macro-F1 scorer also drives refit
f1 = make_scorer(f1_score, average='macro')
accuracy = make_scorer(accuracy_score)
cohen_kappa = make_scorer(cohen_kappa_score)

# candidate values for the forest's min_samples_leaf
parameters = {'clf__min_samples_leaf': [0.015, 0.02, 0.03, 0.04]}
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

search = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring={'f1': f1, 'accuracy': accuracy, 'cohen_kappa': cohen_kappa},
    cv=skf,
    refit='f1',
    n_jobs=-1,
    verbose=20,
)
search.fit(X, y)

results = search.cv_results_
print(results)
print(search.best_params_)

# out-of-fold predictions of the best pipeline, using the same folds
y_pred = cross_val_predict(estimator=search.best_estimator_, X=X, y=y,
                           cv=skf, n_jobs=-1, verbose=20)

### Visualize Classifier
I use the following code to visualize the classification process and performance. You may replace the class labels with the values in your own dataset.

In [None]:
# visualize one decision tree (fixed index 131) of the fitted random forest

from sklearn.tree import export_graphviz
import graphviz

# GridSearchCV fits clones of text_clf, so text_clf itself is never fitted.
# The pipeline refitted with the best parameters is search.best_estimator_.
fitted_clf = search.best_estimator_
forest = fitted_clf.named_steps.clf
vectorizer = fitted_clf.named_steps.tfidf

# Extract single tree
estimator = forest.estimators_[131]

# Newer scikit-learn exposes get_feature_names_out(); older releases only
# have get_feature_names(). Support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Class names come from the fitted forest, so they always match the labels
# that are actually present in the data.
dot_data = export_graphviz(estimator, out_file=None,
                           feature_names=feature_names,
                           class_names=[str(label) for label in forest.classes_],
                           filled=True)

graph = graphviz.Source(dot_data)
graph.render("tfidf")
graph


In [9]:
# helper function to plot confusion matrix
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a heatmap and show it.

    The colour scale is fixed to [0, 1]. When ``classes`` is given, it is used
    for the tick labels on both axes and the cell values are annotated.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        heatmap_options.update(xticklabels=classes, yticklabels=classes, annot=True)
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
# plot confusion matrix
cm = confusion_matrix(y, y_pred)
# divide every row by its total so each row sums to 1
cm_norm = cm / cm.sum(axis=1, keepdims=True)
plt.figure()
plot_confusion_matrix(cm_norm, classes=['1', '2', '3'])

## Neural Network Classifier
Now I use the Keras neural model in Tensorflow to classify my text. I compare the performance of this neural network with the random forest.

### Preparing Data
Since the Keras model in Tensorflow requires a different data input format, I need to convert our data to the format accepted by Keras.

In [None]:
# First, build a dataframe holding the features and labels only, then save it as csv.
text, labels = data['features'].values, data['labels'].values
xy_dict = dict(features=text, labels=labels)
xy_df = pd.DataFrame.from_dict(xy_dict)
out_path = os.path.join(os.getcwd(), "xy.csv")
xy_df.to_csv(path_or_buf=out_path)

You only need to do the conversion once. You can load the dataset directly in the future.


In [None]:
# helper to create train, validation, test dataframes from csv
def split_df_from_csv(data_path, feature='comments', target='CE_label', test_size=0.3, n_val_splits=10):
    """Load a csv and return (train_dfs, val_dfs, test_df).

    A test split of ``test_size`` is held out first; the remainder is split
    ``n_val_splits`` times into train/validation parts.
    """
    train_val_X, test_X, train_val_y, test_y = load_and_test_split(
        data_path, feature, target, test_size)
    folds = val_split(train_val_X, train_val_y, n_val_splits)
    train_X, val_X, train_y, val_y = folds
    return splits_to_df(train_X, train_y, val_X, val_y, test_X, test_y)

# helper to load data from csv and separate test split
def load_and_test_split(data_path, feature, target, test_size):
    """Read ``data_path`` and hold out a stratified test split.

    Returns (X_train_val, X_test, y_train_val, y_test) built from the
    ``feature`` and ``target`` columns of the csv.
    """
    frame = pd.read_csv(data_path)
    features, targets = frame[feature].values, frame[target].values

    # stratify on the targets; fixed seed keeps the split reproducible
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        features, targets, test_size=test_size, stratify=targets, random_state=0)
    return X_train_val, X_test, y_train_val, y_test

# helper to create validation split
def val_split(X, y, n_splits):
    """Create ``n_splits`` stratified shuffled train/validation splits.

    Returns four lists (X_train, X_val, y_train, y_val), each holding one
    array per split.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_splits, random_state=0)
    # materialise the index pairs once so all four lists use the same splits
    index_pairs = list(splitter.split(X, y))

    X_train = [X[train_idx] for train_idx, _ in index_pairs]
    X_val = [X[val_idx] for _, val_idx in index_pairs]
    y_train = [y[train_idx] for train_idx, _ in index_pairs]
    y_val = [y[val_idx] for _, val_idx in index_pairs]
    return X_train, X_val, y_train, y_val

# helper to create n dataframes from n splits
def n_splits_to_df(X_splits, y_splits, feature='comments', target='CE_label'):
    """Build one two-column dataframe per split.

    Args:
        X_splits: sequence with the examples of each split.
        y_splits: sequence with the labels of each split, aligned with
            ``X_splits`` by position.
        feature: name of the column that receives the examples.
        target: name of the column that receives the labels.

    Returns:
        A list with one dataframe per entry of ``X_splits``.

    Raises:
        IndexError: if ``y_splits`` has fewer entries than ``X_splits``.
    """
    # y_splits is indexed (rather than zipped) so a missing label split still
    # fails loudly instead of being silently dropped.
    return [
        pd.DataFrame.from_dict({feature: X_part, target: y_splits[i]})
        for i, X_part in enumerate(X_splits)
    ]
    
#   helper to put split data in pandas dataframe
def splits_to_df(X_train, y_train, X_val, y_val, X_test, y_test):
    """Wrap the split arrays in dataframes.

    Returns (train_dfs, val_dfs, test_df): one dataframe per train split, one
    per validation split, and a single test dataframe with the columns
    'comments' and 'CE_label'.
    """
    train_dfs = n_splits_to_df(X_train, y_train)
    val_dfs = n_splits_to_df(X_val, y_val)
    test_df = pd.DataFrame.from_dict({'comments': X_test, 'CE_label': y_test})
    return train_dfs, val_dfs, test_df

# load reformatted data from csv into pandas dataframe
# "xy.csv" and its 'features' / 'labels' columns are what the conversion cell
# above writes. The returned dataframes still name their columns
# 'comments' / 'CE_label', because the split helpers hard-code those names.
data_path = os.path.join(os.getcwd(), "xy.csv")
train_dfs, val_dfs, test_df = split_df_from_csv(data_path, feature='features', target='labels')

In [None]:
# helper to encode class labels as numeric values
def encode_labels(labels):
    """Map class labels '1'..'4' to the integer ids 0..3.

    Labels are converted with ``str()`` before the lookup, so integer labels
    (e.g. a label column that pandas parsed as int when reading the csv) are
    encoded the same way as their string form.

    Args:
        labels: iterable of class labels.

    Returns:
        A 1-D ``np.int64`` array with one class id per input label.

    Raises:
        ValueError: if a label is not one of the known classes.
    """
    labels_lookup = {'1': 0, '2': 1, '3': 2, '4': 3}
    try:
        encoded = [labels_lookup[str(label)] for label in labels]
    except KeyError as err:
        raise ValueError("unknown class label: %r" % (err.args[0],)) from err
    # explicit dtype keeps empty input well-defined
    return np.array(encoded, dtype=np.int64)
    
# helper to convert a list of pandas dataframes to a list of tensorflow datasets
def dfs_to_tf_data(dfs, example, target):
    """Return one tf.data.Dataset of (example, encoded label) pairs per dataframe.

    ``example`` and ``target`` are the column names to read from each frame;
    the target column is passed through ``encode_labels``.
    """
    return [
        tf.data.Dataset.from_tensor_slices(
            (frame[example].values, encode_labels(frame[target].values))
        )
        for frame in dfs
    ]

# helper to convert train, validation, test split dataframes to tensorflow dataset
def splits_to_tf_data(train_dfs, val_dfs, test_df, example='features', target='labels'):
    """Convert the split dataframes into tensorflow datasets.

    Args:
        train_dfs: list of training dataframes, one per split.
        val_dfs: list of validation dataframes, one per split.
        test_df: the single test dataframe.
        example: name of the column holding the text examples.
        target: name of the column holding the class labels.

    Returns:
        (train_sets, val_sets, test_set): two lists of tf.data.Dataset and one
        tf.data.Dataset, each yielding (example, encoded label) pairs.
    """
    train_sets = dfs_to_tf_data(train_dfs, example, target)
    val_sets = dfs_to_tf_data(val_dfs, example, target)
    test_set = tf.data.Dataset.from_tensor_slices(
        (test_df[example].values, encode_labels(test_df[target].values)))
    return train_sets, val_sets, test_set

# convert train, validation, test dataframes into tensorflow datasets
# The split dataframes name their columns 'comments' / 'CE_label' (set in
# n_splits_to_df and splits_to_df), so pass those names explicitly.
train_sets, val_sets, test_set = splits_to_tf_data(train_dfs, val_dfs, test_df,
                                                   example='comments', target='CE_label')

### Training Keras
Now that the data is formatted for Tensorflow, I can train my model on it.

In [None]:
# embed training examples using model trained on google news
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

In [None]:
# build the neural model
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
# One output unit per class. Softmax over a single unit always outputs 1.0,
# so the output layer needs as many units as there are classes: 4, matching
# the lookup table in encode_labels. Adjust this for your own label set.
model.add(tf.keras.layers.Dense(4, activation='softmax'))

model.summary()

In [None]:
# compile model
# encode_labels yields integer class ids, not one-hot vectors, so the sparse
# variant of categorical cross-entropy is the matching loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# train model on the first train/validation split
train_set = train_sets[0]
val_set = val_sets[0]
# NOTE(review): the shuffle buffer of 309 presumably equals the size of the
# original training split - adjust for your own data.
history = model.fit(
    x=train_set.shuffle(309).batch(32),
    validation_data=val_set.batch(32),
    epochs=20,
    verbose=2,
)

### Evaluation
I test my model on the testing dataset.

In [None]:
# getting model results on test data.
# test_set is the held-out dataset returned by splits_to_tf_data
results = model.evaluate(test_set.batch(256), verbose=2)

# print each metric next to its name
for name, value in zip(model.metrics_names, results):
    print("%s: %.3f" % (name, value))