# Text Classification Models - An Extensive List 

I won't go into the details & bore you with information about "what is text classification?". Instead, I shall go straight to implementing various models for text classification [assuming that's what you're here for :-)]. 

I will keep the notebook fairly organised & well commented for easy reading. Please do **UPVOTE** if you find it helpful.

# Setup

### Libraries

In [None]:
# Generic
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, warnings, gc, string
warnings.filterwarnings("ignore")

# SKLearn
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

# Tensorflow / Keras
import tensorflow as tf
from keras.preprocessing import text,sequence
from keras import layers,models,optimizers
import tensorflow_hub as hub

# XGBoost
import xgboost

#Gensim Library for Text Processing
import gensim.parsing.preprocessing as gsp
from gensim import utils

### Data Setup

In [None]:
'''Load'''

# Location of the labelled training tweets
url = '../input/analytics-vidhya-identify-the-sentiments/train.csv'

# Read the CSV and discard the 'id' column in one step
df = pd.read_csv(url, header='infer').drop(columns=['id'])

# Report how many rows were loaded
print("Total Records (training dataset): ", len(df))

In [None]:
'''Tweet Data Cleaning Utility Function'''

# gensim preprocessing filters; proc_txt applies them one after another
# in exactly this order.
processes = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.remove_stopwords,
    gsp.strip_short,
    gsp.stem_text,
]

def proc_txt(txt):
    """Return a cleaned version of the tweet string ``txt``.

    The input is lower-cased, converted with ``utils.to_unicode`` and then
    passed through every filter in the module-level ``processes`` list, in
    list order. The output of the last filter is returned.
    """
    cleaned = utils.to_unicode(txt.lower())
    for step in processes:
        cleaned = step(cleaned)
    return cleaned


# Training Dataset: store the cleaned text of every tweet in a new column
df['tweet_cln'] = df['tweet'].apply(proc_txt)

In [None]:
# Training Dataset: preview the first five rows, including the cleaned column
df.head(n=5)

In [None]:
'''Data Split (training dataset)'''
# Hold out part of the labelled data for validation. The seed is fixed so
# that the split (and therefore every accuracy reported below) is
# reproducible between runs; 42 matches the seed used for the random forest.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    df['tweet_cln'], df['label'], random_state=42
)



'''Feature Engineering of Training Dataset [TF-IDF Vectors] - Basic Classifiers'''
# Word-level TF-IDF limited to the 5000 most frequent terms.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

# Learn the vocabulary and IDF weights from the training split only, so no
# statistics from the validation rows leak into the features; the validation
# split is then projected with the already-fitted vectorizer.
xtrain_tfidf = tfidf_vect.fit_transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)



'''Feature Engineering of Training Dataset [Word Embedding] - Deep Neural'''
# Maps each word in the pretrained file to its float32 vector.
embeddings_index = {}

# Pretrained Word Embedding Vectors.
# The context manager guarantees the (large) file is closed after parsing,
# and the encoding is stated explicitly instead of relying on the platform
# default.
with open('../input/wikinews300d1mvec/wiki-news-300d-1M.vec', encoding='utf-8') as vec_file:
    for i, line in enumerate(vec_file):
        values = line.split()
        # NOTE(review): fastText .vec files normally begin with a
        # "<word count> <dimension>" header; skip it so it is not stored
        # as if it were a word vector.
        if i == 0 and len(values) == 2:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Tokenizer: build the word -> integer index from the cleaned tweets
token = text.Tokenizer()
token.fit_on_texts(df['tweet_cln'])
word_index = token.word_index

# Text to Sequence: encode each split as integer ids, padded/truncated to 70
train_seq_x, valid_seq_x = (
    sequence.pad_sequences(token.texts_to_sequences(split), maxlen=70)
    for split in (train_x, valid_x)
)

# Token-embedding Mapping: row i holds the pretrained vector of the word with
# index i; words missing from embeddings_index keep an all-zero row
embedding_matrix = np.zeros(shape=(len(word_index) + 1, 300))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Build Model

In [None]:
'''Utility Function'''

def model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``
    feature_vector_train : training features passed to ``fit``
    label : training labels passed to ``fit``
    feature_vector_valid : validation features passed to ``predict``
    is_neural_net : bool, default False
        Set when ``predict`` returns scores/probabilities instead of class
        labels; the scores are then converted to class labels before scoring.
    valid_label : optional
        True labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``valid_y`` is used, as before.

    Returns
    -------
    The value returned by ``metrics.accuracy_score`` for the predictions.
    """
    if valid_label is None:
        # Backward-compatible default: the validation labels created by the
        # train/validation split earlier in the notebook.
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = np.asarray(predictions)
        if predictions.ndim > 1 and predictions.shape[-1] > 1:
            # One score per class: pick the highest-scoring class.
            predictions = predictions.argmax(axis=-1)
        else:
            # A single sigmoid unit yields one probability per sample.
            # argmax over that lone column would always be 0, so threshold
            # at 0.5 instead (assumes labels are encoded as 0/1).
            predictions = (predictions.reshape(-1) > 0.5).astype(int)

    # free memory
    gc.collect()

    # accuracy_score expects (y_true, y_pred)
    return metrics.accuracy_score(valid_label, predictions)

### Naive Bayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF features
nb_acc = model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
)
print("Naive Bayes(multinomial) Accuracy Achieved: ", format(nb_acc, '.2%'))

### Logistic Reg Classifier

In [None]:
# Logistic regression (default settings) on the TF-IDF features
ln_acc = model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
)
print("Logistic Reg Accuracy Achieved: ", format(ln_acc, '.2%'))

### Random Forest

In [None]:
# Random forest with a fixed seed on the TF-IDF features
rf_acc = model(
    classifier=ensemble.RandomForestClassifier(random_state=42),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
)
print("Random Forest Accuracy Achieved: ", format(rf_acc, '.2%'))

### XGBoost

In [None]:
# XGBoost classifier (default settings) on the TF-IDF features
xgb_acc = model(
    classifier=xgboost.XGBClassifier(),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
)
print("XGBoost Accuracy Achieved: ", format(xgb_acc, '.2%'))

### CNN (Keras)

In [None]:
'''Create Model'''

# Input Layer: sequences of 70 token ids (the maxlen used when padding train_seq_x / valid_seq_x)
input_layer = layers.Input((70, ))

# Word Embedding Layer: (len(word_index) + 1) x 300 table initialised from embedding_matrix
# and frozen (trainable=False), followed by spatial dropout at rate 0.3
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# Convolutional Layer: 100 filters, kernel size 3, ReLU activation
conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

# Pooling Layer: global max pooling over the convolution output
pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

# Output Layers: Dense(50, ReLU) -> Dropout(0.25) -> a single sigmoid unit
output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

# Compile: Adam created with default arguments, binary cross-entropy loss; no metrics are passed
cnn_model = models.Model(inputs=input_layer, outputs = output_layer2)
cnn_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')


# Fit and score through the shared model() helper, then print the accuracy as a percentage
cnn_acc = model(cnn_model, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("CNN Model Accuracy Achieved: ", '{:.2%}'.format(cnn_acc))

### RNN - LSTM

In [None]:
'''Create Model'''

# Input Layer: sequences of 70 token ids (the maxlen used when padding train_seq_x / valid_seq_x)
input_layer = layers.Input((70, ))

# Word Embedding Layer: (len(word_index) + 1) x 300 table initialised from embedding_matrix
# and frozen (trainable=False), followed by spatial dropout at rate 0.3
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# LSTM Layer: 100 units over the embedded sequence
lstm_layer = layers.LSTM(100)(embedding_layer)

# Output Layers: Dense(50, ReLU) -> Dropout(0.25) -> a single sigmoid unit
output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

# Compile: Adam created with default arguments, binary cross-entropy loss; no metrics are passed
rnn_model = models.Model(inputs=input_layer, outputs = output_layer2)
rnn_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')


# Fit and score through the shared model() helper, then print the accuracy as a percentage
rnn_acc = model(rnn_model, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN(LSTM) Model Accuracy Achieved: ", '{:.2%}'.format(rnn_acc))

### RNN - GRU

In [None]:
'''Create Model'''

# Input Layer: sequences of 70 token ids (the maxlen used when padding train_seq_x / valid_seq_x)
input_layer = layers.Input((70, ))

# Word Embedding Layer: (len(word_index) + 1) x 300 table initialised from embedding_matrix
# and frozen (trainable=False), followed by spatial dropout at rate 0.3
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# GRU Layer: 100 units over the embedded sequence
gru_layer = layers.GRU(100)(embedding_layer)

# Output Layers: Dense(50, ReLU) -> Dropout(0.25) -> a single sigmoid unit
output_layer1 = layers.Dense(50, activation="relu")(gru_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

# Compile: Adam created with default arguments, binary cross-entropy loss; no metrics are passed
rnngru_model = models.Model(inputs=input_layer, outputs = output_layer2)
rnngru_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')


# Fit and score through the shared model() helper, then print the accuracy as a percentage
rnngru_acc = model(rnngru_model, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN(GRU) Model Accuracy Achieved: ", '{:.2%}'.format(rnngru_acc))

### RNN - BiDirectional(GRU)

In [None]:
'''Create Model'''

# Input Layer: sequences of 70 token ids (the maxlen used when padding train_seq_x / valid_seq_x)
input_layer = layers.Input((70, ))

# Word Embedding Layer: (len(word_index) + 1) x 300 table initialised from embedding_matrix
# and frozen (trainable=False), followed by spatial dropout at rate 0.3
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# BiDirectional Layer: a 100-unit GRU wrapped in Bidirectional
bi_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

# Output Layers: Dense(50, ReLU) -> Dropout(0.25) -> a single sigmoid unit
output_layer1 = layers.Dense(50, activation="relu")(bi_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

# Compile: Adam created with default arguments, binary cross-entropy loss; no metrics are passed
rnnbi_model = models.Model(inputs=input_layer, outputs = output_layer2)
rnnbi_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')


# Fit and score through the shared model() helper, then print the accuracy as a percentage
rnnbi_acc = model(rnnbi_model, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN(BiDirectional-GRU) Model Accuracy Achieved: ", '{:.2%}'.format(rnnbi_acc))

### RNN - BiDirectional(LSTM)

In [None]:
'''Create Model'''

# Input Layer: sequences of 70 token ids (the maxlen used when padding train_seq_x / valid_seq_x)
input_layer = layers.Input((70, ))

# Word Embedding Layer: (len(word_index) + 1) x 300 table initialised from embedding_matrix
# and frozen (trainable=False), followed by spatial dropout at rate 0.3
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# BiDirectional Layer: a 100-unit LSTM wrapped in Bidirectional
bi_layer = layers.Bidirectional(layers.LSTM(100))(embedding_layer)

# Output Layers: Dense(50, ReLU) -> Dropout(0.25) -> a single sigmoid unit
output_layer1 = layers.Dense(50, activation="relu")(bi_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

# Compile: Adam created with default arguments, binary cross-entropy loss; no metrics are passed
rnnbil_model = models.Model(inputs=input_layer, outputs = output_layer2)
rnnbil_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')


# Fit and score through the shared model() helper, then print the accuracy as a percentage
rnnbil_acc = model(rnnbil_model, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN(BiDirectional-LSTM) Model Accuracy Achieved: ", '{:.2%}'.format(rnnbil_acc))

### RCNN

In [None]:
'''Create Model'''

# Input Layer: sequences of 70 token ids (the maxlen used when padding train_seq_x / valid_seq_x)
input_layer = layers.Input((70, ))

# Word Embedding Layer: (len(word_index) + 1) x 300 table initialised from embedding_matrix
# and frozen (trainable=False), followed by spatial dropout at rate 0.3
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# Recurrent Layer: bidirectional 100-unit GRU; return_sequences=True so the
# convolution below receives the recurrent output for the whole sequence
rnn_layer = layers.Bidirectional(layers.GRU(100,return_sequences=True))(embedding_layer)
    
# Convolutional Layer: 100 filters, kernel size 3, ReLU activation, applied to the recurrent output
conv_layer = layers.Convolution1D(100, 3, activation="relu")(rnn_layer)

# Pooling Layer: global max pooling over the convolution output
pooling_layer = layers.GlobalMaxPool1D()(conv_layer)


# Output Layers: Dense(50, ReLU) -> Dropout(0.25) -> a single sigmoid unit
output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

# Compile: Adam created with default arguments, binary cross-entropy loss; no metrics are passed
rcnn_model = models.Model(inputs=input_layer, outputs = output_layer2)
rcnn_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')


# Fit and score through the shared model() helper, then print the accuracy as a percentage
rcnn_acc = model(rcnn_model, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RCNN Model Accuracy Achieved: ", '{:.2%}'.format(rcnn_acc))