Text classification is the task of assigning one of a set of predefined categories to open-ended text. Text classifiers can be used to organize, structure, and categorize pretty much any kind of text – documents, medical studies, files, and content from all over the web. We will classify the text into 9 categories. The 9 categories are:
- computer       
- science        
- politics       
- sport          
- automobile     
- religion        
- medicine       
- sales           
- alt.atheism

# Import Libraries

Let's first import all the required libraries

In [None]:
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from time import time
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_confusion_matrix, confusion_matrix, f1_score
from statistics import mean
import pickle
from tensorflow import keras
from keras import layers
from keras import losses
from keras import utils
from keras.layers.experimental.preprocessing import TextVectorization
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout
from tensorflow.keras.models import load_model
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup

# Load Dataset

We are going to use the 20 Newsgroups dataset. Let's load the dataset into a DataFrame.

In [None]:
# Fetch the training split with headers, footers and quoted replies removed
dataset = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# One row per post: raw text, numeric target id, and readable newsgroup name
df = pd.DataFrame()
df['text'] = dataset.data
df['source'] = dataset.target
label = [dataset.target_names[target_id] for target_id in df['source']]
df['label'] = label

In [None]:
# first few rows of the dataset
df.head()

We will later use a label encoder to convert the labels (categorical values) into numeric values, so the numeric `source` column is not needed. We will drop that column now.

In [None]:
# drop source column
df.drop(['source'],axis=1,inplace=True)

Let's see the count of each label

In [None]:
# value count
df['label'].value_counts()

Our dataset has relatively few examples for each label, and 20 categories is too many. We will combine the sub-categories:

- Politics has the sub-topics mideast, guns and misc; we will replace all of them with politics
- Sport has sub-categories as well; we will replace them with sport
- Religion has two sub-categories; we will merge them into one
- In total we will end up with 9 categories

In [None]:
# Collapse the original newsgroup names into coarse topics with one mapping.
# 'alt.atheism' has no entry here, so it keeps its original name.
label_mapping = {
    # politics
    'talk.politics.misc': 'politics',
    'talk.politics.guns': 'politics',
    'talk.politics.mideast': 'politics',
    # sport
    'rec.sport.hockey': 'sport',
    'rec.sport.baseball': 'sport',
    # religion
    'soc.religion.christian': 'religion',
    'talk.religion.misc': 'religion',
    # computer
    'comp.windows.x': 'computer',
    'comp.sys.ibm.pc.hardware': 'computer',
    'comp.os.ms-windows.misc': 'computer',
    'comp.graphics': 'computer',
    'comp.sys.mac.hardware': 'computer',
    # sales
    'misc.forsale': 'sales',
    # automobile
    'rec.autos': 'automobile',
    'rec.motorcycles': 'automobile',
    # science
    'sci.crypt': 'science',
    'sci.electronics': 'science',
    'sci.space': 'science',
    # medicine
    'sci.med': 'medicine',
}

# Assign the result back instead of calling replace(inplace=True) on
# df['label']: the in-place form mutates an intermediate object, which pandas
# warns about and which leaves df unchanged under copy-on-write.
df['label'] = df['label'].replace(label_mapping)

Let's see the number of unique targets 

In [None]:
# number of targets
df['label'].nunique()

In [None]:
# value count
df['label'].value_counts()

We are going to make a number of words column in which there is the number of words in a particular text

In [None]:
df['Number_of_words'] = df['text'].apply(lambda x:len(str(x).split()))
df.head()

Check the basic stats of number of words, like maximum, minimum, average number of words

In [None]:
# basic stats
df['Number_of_words'].describe()

So the maximum number of words in a single text in our dataset is 11,765. Let's have a look at it

In [None]:
# Select the longest text via the column maximum rather than a hard-coded
# count, so the cell keeps working if the data or preprocessing changes
df[df['Number_of_words'] == df['Number_of_words'].max()]

So the text with the maximum number of words belongs to the electronics category. Our dataset also has some rows with no text at all, i.e. the number of words is 0. We will drop those rows

In [None]:
no_text = df[df['Number_of_words']==0]
print(len(no_text))

# drop these rows
df.drop(no_text.index,inplace=True)

In [None]:
plt.style.use('ggplot')
plt.figure(figsize=(12,6))
# sns.distplot is deprecated and scheduled for removal; histplot draws the
# same count histogram (no KDE curve) with the same colour and bin count
sns.histplot(df['Number_of_words'], kde=False, color="red", bins=200)
plt.title("Frequency distribution of number of words for each text extracted", size=20)

# Data Pre-Processing

Now it's time to clean our dataset: we will lowercase the text, remove text in square brackets, remove links, HTML tags and punctuation, and remove words containing numbers

In [None]:
# cleaning the text

def clean_text(text):
    '''Normalize a raw document before tokenization.

    Steps, in order: lowercase; drop text in square brackets; drop URLs;
    drop HTML-style tags; drop punctuation; turn newlines into spaces;
    drop every word that contains a digit.

    Args:
        text: the raw document as a string.

    Returns:
        The cleaned string. Whitespace is not collapsed, so runs of spaces
        can remain where content was removed.
    '''
    text = text.lower()
    # Raw strings so the regex escapes are not parsed as (invalid) string escapes
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space, not '': otherwise the last word of one
    # line is glued to the first word of the next line
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Applying the cleaning function to  datasets
df['cleaned_text'] = df['text'].apply(lambda x: clean_text(x))

# updated text
df['cleaned_text'].head()

Let's convert our cleaned text into tokens

In [None]:
tokenizer=nltk.tokenize.RegexpTokenizer(r'\w+')
df['tokens'] = df['cleaned_text'].apply(lambda x:tokenizer.tokenize(x))
df.head()

Stopwords are English words that do not add much meaning to a sentence. They are very commonly used and we do not need them, so we can remove them

In [None]:
# stopwords
stopwords.words('english')[0:5]

Let's check number of stopwords in nltk library

In [None]:
len(stopwords.words('english'))

Now we are going to remove the stopwords from the sentences

In [None]:
# removing stopwords
# Build the stopword set once. stopwords.words() returns a new list on every
# call, so calling it per token is slow and every membership test is linear.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def remove_stopwords(text):
    '''Return the tokens of `text` that are not English stopwords.

    Args:
        text: an iterable of word tokens; tokens are compared as-is
            (no case folding is done here).

    Returns:
        A list of the remaining tokens in their original order.
    '''
    return [w for w in text if w not in ENGLISH_STOPWORDS]
df['stopwordremove_tokens'] = df['tokens'].apply(lambda x : remove_stopwords(x))
df.head()

It's time to do lemmatization

In [None]:
# lemmatization
lem = WordNetLemmatizer()
def lem_word(x):
    '''Lemmatize every token in `x` using the module-level `lem` lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    '''
    return list(map(lem.lemmatize, x))

df['lemmatized_text'] = df['stopwordremove_tokens'].apply(lem_word)
df.head()

Now we are going to combine our text, this is our final text

In [None]:
def combine_text(list_of_text):
    '''Join a list of tokens into one space-separated string.'''
    separator = ' '
    return separator.join(list_of_text)

df['final_text'] = df['lemmatized_text'].apply(lambda x : combine_text(x))
df.head()

Now that we have cleaned the dataset and removed stopwords, some rows may have no words left. We will find those rows and remove them

In [None]:
df['Final_no_of_words'] = df['final_text'].apply(lambda x:len(str(x).split()))
df.head()

In [None]:
# basic stats
df['Final_no_of_words'].describe()

In [None]:
# number of rows with text lenth = 0
print(len(df[df['Final_no_of_words']==0]))

# drop those rows
df.drop(df[df['Final_no_of_words']==0].index,inplace=True)

Now that our text has been cleaned, we will convert the labels into numeric values using LabelEncoder()

In [None]:
# LabelEncoder maps each distinct string label to an integer id
label_encoder = preprocessing.LabelEncoder()
  
# Encode the 'label' column into the numeric 'target' column
df['target']= label_encoder.fit_transform(df['label'])
  
df['target'].unique()

# Dependent and Independent Variable

In [None]:
# dependent and independent variable
X = df['final_text']
y = df['target']

In [None]:
X.shape,y.shape

# Bag-of-Words

CountVectorizer transforms a given text into a vector based on the frequency (count) of each word that occurs in the text. It counts the number of occurrences of each word in a document (text).

In [None]:
count_vectorizer = CountVectorizer()
count_vector = count_vectorizer.fit_transform(X)
print(count_vector[0].todense())

# Tf-Idf

Tf-Idf stands for Term Frequency-Inverse Document Frequency. It is a technique to quantify a word in documents: we compute a weight for each word which signifies the importance of the word in the document and in the corpus.

In [None]:
tfidf_vectorizer = TfidfVectorizer(min_df = 2,max_df = 0.5,ngram_range = (1,2))
tfidf = tfidf_vectorizer.fit_transform(X)
print(tfidf[0].todense())

# SMOTE technique to balance the dataset

From the label counts above we can see that our dataset is imbalanced. We will use the SMOTE technique to balance it. SMOTE is an oversampling technique in which synthetic samples are generated for the minority classes. It helps to reduce the overfitting that plain random oversampling (duplicating minority samples) can cause.

In [None]:
# count vector
# NOTE(review): SMOTE is applied to the whole dataset before the train/test
# split, so synthetic samples interpolated from rows that end up in the test
# set can land in the training set. Presumably this inflates test accuracy;
# consider splitting first and resampling only the training portion.
smote = SMOTE(random_state = 402)
X_smote, Y_smote = smote.fit_resample(count_vector,y)


# Pass the labels by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count
sns.countplot(x=Y_smote)

In [None]:
# tfidf
# NOTE(review): as with the count vectors, resampling happens before the
# train/test split, so synthetic neighbours of test rows can reach training.
smote = SMOTE(random_state = 402)
X_smote_tfidf, Y_smote_tfidf = smote.fit_resample(tfidf,y)

# Pass the labels by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count
sns.countplot(x=Y_smote_tfidf)

## Train-Test Split

In [None]:
# train-test split countvector
X_train, X_test, y_train, y_test = train_test_split(X_smote, Y_smote, test_size = 0.20, random_state = 0)
X_train.shape, X_test.shape,y_train.shape, y_test.shape

In [None]:
# train-test split tfidf
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_smote_tfidf, Y_smote_tfidf , test_size = 0.20, random_state = 0)

In [None]:
# One slot per (model, vectorizer) pair. All three containers start at 0 and
# share the same key order.
model_keys = [
    'linear_svm_tfidf', 'linear_svm',
    'mnb_naive_bayes_tfidf', 'mnb_naive_bayes',
    'random_forest_tfidf', 'random_forest',
    'logistic_reg', 'logistic_reg_tfidf',
]
training_time_container = dict.fromkeys(model_keys, 0)
prediction_time_container = dict.fromkeys(model_keys, 0)
accuracy_container = dict.fromkeys(model_keys, 0)

# Logistic Regression

In [None]:
# Logistic regression on the count-vector features
lg = LogisticRegression(C = 1.0)
# Fit the model and record the wall-clock training time
t0=time()
lg.fit(X_train,y_train)
training_time_container['logistic_reg']=time()-t0


# Predict the test set and record the wall-clock prediction time
t0 = time()
y_pred_lg = lg.predict(X_test)
prediction_time_container['logistic_reg']=time()-t0

# Store the test accuracy for the comparison plot
lg_test_accuracy =  accuracy_score(y_test,y_pred_lg)
accuracy_container['logistic_reg'] = lg_test_accuracy

print('Training Accuracy : ', accuracy_score(y_train,lg.predict(X_train)))
print('Testing Accuracy: ',lg_test_accuracy)
print("Training Time: ",training_time_container['logistic_reg'])
print("Prediction Time: ",prediction_time_container['logistic_reg'])
print(confusion_matrix(y_test,y_pred_lg))

In [None]:
# Logistic regression on the TF-IDF features
lg = LogisticRegression(C = 1.0)
# Fit the model and record the wall-clock training time
t0=time()
lg.fit(X_train_tfidf,y_train_tfidf)
training_time_container['logistic_reg_tfidf']=time()-t0

# Predict the test set and record the wall-clock prediction time
t0=time()
ypred_lg_tf = lg.predict(X_test_tfidf)
prediction_time_container['logistic_reg_tfidf']=time()-t0

lg_test_accuracy_tf  = accuracy_score(y_test_tfidf,ypred_lg_tf)
accuracy_container['logistic_reg_tfidf'] = lg_test_accuracy_tf

print('Training Accuracy: ', accuracy_score(y_train_tfidf,lg.predict(X_train_tfidf)))
print('Testing Accuracy: ', lg_test_accuracy_tf)
print("Training Time: ",training_time_container['logistic_reg_tfidf'])
print("Prediction Time: ",prediction_time_container['logistic_reg_tfidf'])
# Compare against the TF-IDF split's own labels, not the count-vector split's
print(confusion_matrix(y_test_tfidf,ypred_lg_tf))

## Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes on the count-vector features
nb = MultinomialNB()
# Fit the model and record the wall-clock training time
t0=time()
nb.fit(X_train,y_train)
training_time_container['mnb_naive_bayes']=time()-t0


# Predict the test set and record the wall-clock prediction time
t0 = time()
y_pred_nb = nb.predict(X_test)
prediction_time_container['mnb_naive_bayes']=time()-t0

# Store the test accuracy for the comparison plot
mnb_test_accuracy =  accuracy_score(y_test,y_pred_nb)
accuracy_container['mnb_naive_bayes'] = mnb_test_accuracy

print('Training Accuracy : ', accuracy_score(y_train,nb.predict(X_train)))
print('Testing Accuracy: ',mnb_test_accuracy)
print("Training Time: ",training_time_container['mnb_naive_bayes'])
print("Prediction Time: ",prediction_time_container['mnb_naive_bayes'])
print(confusion_matrix(y_test,y_pred_nb))

In [None]:
# Multinomial Naive Bayes on the TF-IDF features
nb = MultinomialNB()
# Fit the model and record the wall-clock training time
t0=time()
nb.fit(X_train_tfidf,y_train_tfidf)
training_time_container['mnb_naive_bayes_tfidf']=time()-t0

# Predict the test set and record the wall-clock prediction time
t0=time()
ypred_nb_tf = nb.predict(X_test_tfidf)
prediction_time_container['mnb_naive_bayes_tfidf']=time()-t0

mnb_tfidf_test_accuracy = accuracy_score(y_test_tfidf,ypred_nb_tf)
accuracy_container['mnb_naive_bayes_tfidf'] = mnb_tfidf_test_accuracy


print('Training Accuracy: ', accuracy_score(y_train_tfidf,nb.predict(X_train_tfidf)))
print('Testing Accuracy: ',mnb_tfidf_test_accuracy )
print("Training Time: ",training_time_container['mnb_naive_bayes_tfidf'])
print("Prediction Time: ",prediction_time_container['mnb_naive_bayes_tfidf'])
# Compare against the TF-IDF split's own labels, not the count-vector split's
print(confusion_matrix(y_test_tfidf,ypred_nb_tf))

## SVM using Stochastic Gradient Descent

In [None]:
# Hinge loss makes SGDClassifier a linear Support Vector Machine.
# alpha=0.0001 (also the default value) is the constant that multiplies the
# regularization term; it is not the learning rate. The penalty is left at
# its default, L2, which is the standard regularizer for linear SVMs.


# on countvector
svm_classifier = linear_model.SGDClassifier(loss='hinge',alpha=0.0001)
# Fit the model and record the wall-clock training time
t0=time()
svm_classifier.fit(X_train,y_train)
training_time_container['linear_svm']=time()-t0

# Predict the test set and record the wall-clock prediction time
t0=time()
y_pred_svm = svm_classifier.predict(X_test)
prediction_time_container['linear_svm']=time()-t0

# Store the test accuracy for the comparison plot
svm_test_accuracy  = accuracy_score(y_test,y_pred_svm)
accuracy_container['linear_svm'] = svm_test_accuracy 

print('Training Accuracy : ', accuracy_score(y_train,svm_classifier.predict(X_train)))
print('Testing Accuracy: ',svm_test_accuracy )
print("Training Time: ",training_time_container['linear_svm'])
print("Prediction Time: ",prediction_time_container['linear_svm'])
print(confusion_matrix(y_test,y_pred_svm))

In [None]:
# Linear SVM (SGD with hinge loss) on the TF-IDF features
svm_classifier = linear_model.SGDClassifier(loss='hinge',alpha=0.0001)
# Fit the model and record the wall-clock training time
t0=time()
svm_classifier.fit(X_train_tfidf,y_train_tfidf)
training_time_container['linear_svm_tfidf']=time()-t0

# Predict the test set and record the wall-clock prediction time
t0=time()
ypred_svm_tf = svm_classifier.predict(X_test_tfidf)
prediction_time_container['linear_svm_tfidf']=time()-t0

svm_test_accuracy_tf  = accuracy_score(y_test_tfidf,ypred_svm_tf)
# The key must be 'linear_svm_tfidf' (not 'linear_svm_tfdif'): a misspelled
# key leaves the real slot at 0 and adds a stray entry to the accuracy plot
accuracy_container['linear_svm_tfidf'] = svm_test_accuracy_tf

print('Training Accuracy: ', accuracy_score(y_train_tfidf,svm_classifier.predict(X_train_tfidf)))
print('Testing Accuracy: ', svm_test_accuracy_tf)
print("Training Time: ",training_time_container['linear_svm_tfidf'])
print("Prediction Time: ",prediction_time_container['linear_svm_tfidf'])
# Compare against the TF-IDF split's own labels, not the count-vector split's
print(confusion_matrix(y_test_tfidf,ypred_svm_tf))

## RandomForest 

In [None]:
# Random forest (50 trees) on the count-vector features
rf = RandomForestClassifier(n_estimators=50)
# Fit the model and record the wall-clock training time
t0=time()
rf.fit(X_train,y_train)
training_time_container['random_forest']=time()-t0

# Predict the test set and record the wall-clock prediction time
t0=time()
y_pred_rf = rf.predict(X_test)
prediction_time_container['random_forest']=time()-t0

# Store the test accuracy for the comparison plot
rf_test_accuracy  = accuracy_score(y_test,y_pred_rf)
accuracy_container['random_forest'] = rf_test_accuracy 


print('Training Accuracy : ', accuracy_score(y_train,rf.predict(X_train)))
print('Testing Accuracy: ',rf_test_accuracy )
print("Training Time: ",training_time_container['random_forest'])
print("Prediction Time: ",prediction_time_container['random_forest'])
print(confusion_matrix(y_test,y_pred_rf))

In [None]:
# Random forest (50 trees) on the TF-IDF features
rf = RandomForestClassifier(n_estimators=50)
# Fit the model and record the wall-clock training time
t0=time()
rf.fit(X_train_tfidf,y_train_tfidf)
training_time_container['random_forest_tfidf']=time()-t0

# Predict the test set and record the wall-clock prediction time
t0=time()
ypred_rf_tf = rf.predict(X_test_tfidf)
prediction_time_container['random_forest_tfidf']=time()-t0

rf_test_accuracy_tf  = accuracy_score(y_test_tfidf,ypred_rf_tf)
accuracy_container['random_forest_tfidf'] = rf_test_accuracy_tf

print('Training Accuracy: ', accuracy_score(y_train_tfidf,rf.predict(X_train_tfidf)))
print('Testing Accuracy: ',rf_test_accuracy_tf )
print("Training Time: ",training_time_container['random_forest_tfidf'])
print("Prediction Time: ",prediction_time_container['random_forest_tfidf'])
# Compare against the TF-IDF split's own labels, not the count-vector split's
print(confusion_matrix(y_test_tfidf,ypred_rf_tf))

In [None]:
# Bar chart of training time per model; bars follow the dict's key order and
# are coloured by their position
fig=go.Figure(data=[go.Bar(y=list(training_time_container.values()),x=list(training_time_container.keys()),
                           marker={'color':np.arange(len(list(training_time_container.values())))}
                          ,text=list(training_time_container.values()), textposition='auto' )])

# NOTE(review): rgb(275, 275, 275) is outside the 0-255 channel range;
# presumably a white background was intended — confirm
fig.update_layout(autosize=True ,plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Training Time of different classifiers",
                    xaxis_title="Machine Learning Models",
                    yaxis_title="Training time in seconds" )

# Black outline around each bar
fig.data[0].marker.line.width = 3
fig.data[0].marker.line.color = "black"  
fig

In [None]:
# Bar chart of prediction time per model; bars follow the dict's key order and
# are coloured by their position
fig=go.Figure(data=[go.Bar(y=list(prediction_time_container.values()),x=list(prediction_time_container.keys()),
                           marker={'color':np.arange(len(list(prediction_time_container.values())))}
                          ,text=list(prediction_time_container.values()), textposition='auto' )])

# NOTE(review): rgb(275, 275, 275) is outside the 0-255 channel range;
# presumably a white background was intended — confirm
fig.update_layout(autosize=True ,plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Prediction Time of different classifiers",
                    xaxis_title="Machine Learning Models",
                    yaxis_title="Prediction time in seconds" )

# Black outline around each bar
fig.data[0].marker.line.width = 3
fig.data[0].marker.line.color = "black"  
fig

In [None]:
# Bar chart of test accuracy per model; bars follow the dict's key order and
# are coloured by their position
fig=go.Figure(data=[go.Bar(y=list(accuracy_container.values()),x=list(accuracy_container.keys()),
                           marker={'color':np.arange(len(list(accuracy_container.values())))}
                          ,text=list(accuracy_container.values()), textposition='auto' )])

# NOTE(review): rgb(275, 275, 275) is outside the 0-255 channel range;
# presumably a white background was intended — confirm
fig.update_layout(autosize=True ,plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Accuracy Scores of different classifiers",
                    xaxis_title="Machine Learning Models",
                    yaxis_title="Accuracy Scores" )

# Black outline around each bar
fig.data[0].marker.line.width = 3
fig.data[0].marker.line.color = "black"  
fig

# Stratified K-fold CV

In machine learning, to train a model we usually split the dataset into a train set and a test set using train_test_split from sklearn. We then train the model on the train set and evaluate it on the test set. The problem is that whenever we change the random_state parameter of train_test_split(), we get a different accuracy, so we cannot point to a single accuracy for our model.<br>
One solution to this problem is K-Fold Cross-Validation. But plain K-Fold Cross-Validation suffers from a second problem: it samples randomly, so the folds can have class proportions that differ from those of the full dataset.<br>
The solution for both problems is Stratified K-Fold Cross-Validation. It works like k-fold cross-validation, but it uses stratified sampling instead of random sampling, so each fold preserves the class proportions.

## SVM

In [None]:
# Linear SVM (SGD with hinge loss) evaluated with 10-fold stratified CV on
# the SMOTE-resampled TF-IDF features
svm_skcv = linear_model.SGDClassifier(loss='hinge',alpha=0.0001)

# StratifiedKFold object: 10 shuffled folds with a fixed seed
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified_svm = []
   
# NOTE(review): the folds are drawn from data that was already resampled, so
# held-out folds contain synthetic samples — scores are presumably optimistic
for train_index, test_index in skf.split(X_smote_tfidf,Y_smote_tfidf):
    x_train_fold, x_test_fold = X_smote_tfidf[train_index], X_smote_tfidf[test_index]
    # NOTE(review): Y_smote_tfidf[...] with integer arrays assumes positional
    # indexing matches the labels' index — confirm (or use .iloc)
    y_train_fold, y_test_fold = Y_smote_tfidf[train_index], Y_smote_tfidf[test_index]
    svm_skcv.fit(x_train_fold, y_train_fold)
    lst_accu_stratified_svm.append(svm_skcv.score(x_test_fold, y_test_fold))
   
# Print per-fold accuracies plus their max, min and mean
print('List of possible accuracy:', lst_accu_stratified_svm)
print('\nMaximum Accuracy That can be obtained from this model is:',max(lst_accu_stratified_svm)*100, '%')
print('\nMinimum Accuracy:', min(lst_accu_stratified_svm)*100, '%')
print('\nOverall Accuracy:',mean(lst_accu_stratified_svm)*100, '%')

## RandomForest

In [None]:
# Random forest (50 trees) evaluated with 10-fold stratified CV on the
# SMOTE-resampled TF-IDF features
rf_skcv = RandomForestClassifier(n_estimators=50)

# StratifiedKFold object: 10 shuffled folds with a fixed seed
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified_rf = []
   
# NOTE(review): the folds are drawn from data that was already resampled, so
# held-out folds contain synthetic samples — scores are presumably optimistic
for train_index, test_index in skf.split(X_smote_tfidf,Y_smote_tfidf):
    x_train_fold, x_test_fold = X_smote_tfidf[train_index], X_smote_tfidf[test_index]
    # NOTE(review): Y_smote_tfidf[...] with integer arrays assumes positional
    # indexing matches the labels' index — confirm (or use .iloc)
    y_train_fold, y_test_fold = Y_smote_tfidf[train_index], Y_smote_tfidf[test_index]
    rf_skcv.fit(x_train_fold, y_train_fold)
    lst_accu_stratified_rf.append(rf_skcv.score(x_test_fold, y_test_fold))
   
# Print per-fold accuracies plus their max, min and mean
print('List of possible accuracy:', lst_accu_stratified_rf)
print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified_rf)*100, '%')
print('\nMinimum Accuracy:', min(lst_accu_stratified_rf)*100, '%')
print('\nOverall Accuracy:', mean(lst_accu_stratified_rf)*100, '%')

## Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes evaluated with 10-fold stratified CV on the
# SMOTE-resampled TF-IDF features
nb_skcv = MultinomialNB()

# StratifiedKFold object: 10 shuffled folds with a fixed seed
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified_nb = []
   
# NOTE(review): the folds are drawn from data that was already resampled, so
# held-out folds contain synthetic samples — scores are presumably optimistic
for train_index, test_index in skf.split(X_smote_tfidf,Y_smote_tfidf):
    x_train_fold, x_test_fold = X_smote_tfidf[train_index], X_smote_tfidf[test_index]
    # NOTE(review): Y_smote_tfidf[...] with integer arrays assumes positional
    # indexing matches the labels' index — confirm (or use .iloc)
    y_train_fold, y_test_fold = Y_smote_tfidf[train_index], Y_smote_tfidf[test_index]
    nb_skcv.fit(x_train_fold, y_train_fold)
    lst_accu_stratified_nb.append(nb_skcv.score(x_test_fold, y_test_fold))
   
# Print per-fold accuracies plus their max, min and mean
print('List of possible accuracy:', lst_accu_stratified_nb)
print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified_nb)*100, '%')
print('\nMinimum Accuracy:', min(lst_accu_stratified_nb)*100, '%')
print('\nOverall Accuracy:', mean(lst_accu_stratified_nb)*100, '%')

# Save the models

In [None]:
import joblib

In [None]:
# cv and tfidf
joblib.dump(count_vectorizer, open('cv.pkl', 'wb'),8)
joblib.dump(tfidf_vectorizer, open('tfidf.pkl', 'wb'),8)

In [None]:
# mnb 
joblib.dump(nb, open('mnb.pkl', 'wb'),8)

# svm
joblib.dump(svm_classifier, open('svm.pkl', 'wb'),8)

# randomforest
joblib.dump(rf , open('rf.pkl', 'wb'),8)

# LSTM

We will not create a plain RNN model because of its vanishing gradient problem; instead we will create an LSTM model. LSTMs have an additional state called the 'cell state' through which the network regulates the information flow. The advantage of this state is that the model can remember or forget what it has learned more selectively.
First we tokenize the text, then we convert each text into a sequence of integer word indices. After that we apply padding. Padding is required because the texts have different lengths and the model needs inputs of the same length. We do this by adding 0s at the end of each sequence with the pad_sequences function of Keras.

In [None]:
max_features = 6433     # the maximum number of words to keep, based on word frequency
tokenizer = Tokenizer(num_words=max_features )
tokenizer.fit_on_texts(df['cleaned_text'].values)

In [None]:
# Convert each cleaned text into a list of integer word indices
X = tokenizer.texts_to_sequences(df['cleaned_text'].values)
# Pad with zeros at the end ('post') so that every row has length 6433.
# NOTE(review): maxlen reuses the same value as max_features (the vocabulary
# cap). Sequence length and vocabulary size are independent quantities, so
# confirm 6433 is the intended length — it makes the input matrix very wide.
X = pad_sequences(X, padding = 'post', maxlen = 6433 )

In [None]:
X

In [None]:
X.shape[1]

In [None]:
Y = pd.get_dummies(df['label']).values

In [None]:
Y

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42,stratify = Y)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
embid_dim = 300  # size of each word-embedding vector
lstm_out = 32  # number of units in the LSTM layer (wrapped as bidirectional below)


# Embedding -> bidirectional LSTM -> dropout -> dense ReLU -> dropout -> softmax
model = keras.Sequential()
model.add(Embedding(max_features, embid_dim, input_length = X.shape[1] ))
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dropout(0.4))
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.4))
# 9 output units, one per merged category
model.add(Dense(9,activation = 'softmax'))

model.summary()

Our model is now created; it's time to train it. We will train for up to 10 epochs

In [None]:
batch_size = 128
# Stop early when the monitored loss has not improved for 3 epochs.
# NOTE(review): monitor='loss' watches the training loss, although validation
# data is supplied to fit(); confirm whether 'val_loss' was intended.
earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# NOTE(review): the held-out test split is also used as validation data here
history = model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1, validation_data= (X_test, Y_test),callbacks=[earlystop])

### Plot Accuracy and Loss

In [None]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

### Save LSTM model

In [None]:
model.save('lstm.h5')

# BERT

Now we will build the BERT model. Our kernel has limited memory, so we will use 50% of our dataset

In [None]:
# Take a random 50% of the rows. A fixed random_state keeps the subsample,
# and therefore the BERT results, reproducible between runs.
df_bert = df.sample(frac=0.5, random_state=42)

In [None]:
df_bert.reset_index(inplace=True)

In [None]:
df_bert['target'].value_counts()

So our dataset is imbalanced, we split the dataset in a stratified way

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df_bert.index.values, 
                                                  df_bert.target.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df_bert.target.values)

In [None]:
df_bert['data_type'] = ['not_set']*df_bert.shape[0]

df_bert.loc[X_train, 'data_type'] = 'train'
df_bert.loc[X_val, 'data_type'] = 'val'

Now we will construct the BERT tokenizer, which is based on WordPiece. We will instantiate a pre-trained tokenizer to encode our data

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

- To convert all the texts into encoded form, we use a function called *batch_encode_plus*, and we process the train and validation data separately. The first parameter of the function is the text.
- *add_special_tokens = True* means the sequences will be encoded with the special tokens relative to their model
- *return_attention_mask=True* returns the attention mask, which marks the real tokens as opposed to the padding added to reach *max_length*

In [None]:
# Tokenize the train and validation texts separately.
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True makes the cut at max_length explicit: every row must have
# exactly 256 tokens to be stacked into a single tensor.
encoded_data_train = tokenizer.batch_encode_plus(
    df_bert[df_bert.data_type=='train'].final_text.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df_bert[df_bert.data_type=='val'].final_text.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)


# Token ids, attention masks and integer class labels for each split
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df_bert[df_bert.data_type=='train'].target.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df_bert[df_bert.data_type=='val'].target.values)

Now we got encoded dataset, we can create training data and validation data

In [None]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# length of training and validation data 
len(dataset_train), len(dataset_val)

We are treating each text as its own sequence, so each sequence will be classified into one of the 9 categories

In [None]:
# Size the classification head from the fitted label encoder, so it matches
# the number of classes in the 'target' column instead of a hard-coded 12.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_encoder.classes_),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

DataLoader combines a dataset and a sampler and provides an iterable over the given dataset.

In [None]:
batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [None]:
# AdamW over all model parameters with a small fine-tuning learning rate.
# NOTE(review): this AdamW is the one imported from transformers at the top
# of the file, which is deprecated in favour of torch.optim.AdamW — confirm
# against the installed transformers version.
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)

In [None]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

We will use f1 score as a performance metrics

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    `preds` holds one row of class scores per sample; the predicted class is
    the index of the largest score in each row. `labels` holds the true
    class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [None]:
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices).
# NOTE(review): this cell runs after df_bert was sampled and after the model
# was instantiated in earlier cells, so these seeds only affect what runs
# afterwards (e.g. batch shuffling during training).
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

### Training loop

In [None]:
def evaluate(dataloader_val):
    """Run the global `model` over `dataloader_val` without gradient tracking.

    Each batch is expected to hold (input_ids, attention_mask, labels) in
    that order. The model is switched to eval mode and left in it.

    Returns:
        (average loss per batch, logits for all samples as one array,
        true labels for all samples as one array)
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        # Move every tensor of the batch to the target device
        on_device = tuple(tensor.to(device) for tensor in raw_batch)

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        # outputs[0] is the loss, outputs[1] the logits
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    return (
        running_loss / len(dataloader_val),
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )

In [None]:
# Fine-tune for `epochs` epochs, evaluating on the validation set after each
for epoch in tqdm(range(1, epochs+1)):

    # Training mode must be enabled explicitly at the start of every epoch:
    # from_pretrained() returns the model in eval mode and evaluate() puts it
    # back into eval mode, so without this call dropout is never active.
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # outputs[0] is the loss because labels were passed in
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. It is not divided by
        # len(batch): batch is the 3-tuple of tensors, so that length is
        # always 3 and is unrelated to the number of samples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})


    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')