# Multi-label Classification Modeling using LSTM

In [39]:
# import libraries.
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re, string, nltk

nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
from collections import Counter

from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import pad_sequences

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Load the training comments, the test comments and the test labels from CSV.
train, test, test_y = (
    pd.read_csv(path) for path in ('train.csv', 'test.csv', 'test_labels.csv')
)

# Report (rows, columns) of the train and test frames.
for frame in (train, test):
    print(frame.shape)


(159571, 8)
(153164, 2)


## 1. Data Preprocessing 
### Use the same text preprocessing from Step 1 to build a neural network classifier.

In [23]:
# Flag comments that carry none of the six labels in a new 'no_label' column.
label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train['no_label'] = train[label].max(axis=1).eq(0).astype(int)

# Count every distinct label combination and keep the five most frequent ones.
df_col = (
    train.groupby(label)
    .size()
    .sort_values(ascending=False)
    .to_frame('count')
    .reset_index()
    .head(5)
)
df_col

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,count
0,0,0,0,0,0,0,143346
1,1,0,0,0,0,0,5666
2,1,0,1,0,1,0,3800
3,1,0,1,0,0,0,1758
4,1,0,0,0,1,0,1215


In [24]:
# Examine comments with multiple labels.
# Sum the label columns by name (vectorized) instead of slicing by position
# and applying a Python-level sum to every row.
train["multi_label"] = train[label].sum(axis=1)

# Count rows per number-of-labels once, then look each value up; keys keep
# the order in which the values first appear in the column.
multi_label_counts = train["multi_label"].value_counts()
multi_label_total = {
    value: multi_label_counts[value]
    for value in train["multi_label"].unique()
}

key_list = list(multi_label_total.keys())
value_list = list(multi_label_total.values())



# Text Cleaning

In [25]:
# text cleaning 

# Patterns, stop words and the lemmatizer are built once here rather than
# on every call to clean_text.
_URL_RE = re.compile(r"https?://\S+")
# Matching is case-insensitive so the pattern still fires on text that a
# caller has already lower-cased.
_USER_LINK_RE = re.compile(r"\[\[user.*", flags=re.IGNORECASE)
_IP_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]+")
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
    """Clean one comment and return its lemmatized tokens.

    Steps, in order:
      1. Remove URLs, anything from ``[[User`` to the end of that line, and
         dotted IP-like number groups. These run first, while the brackets,
         dots and digits they rely on are still present.
      2. Replace every run of non-letter characters (punctuation, digits,
         whitespace, non-ASCII) with a single space and lower-case the text.
      3. Split on whitespace and drop NLTK English stop words.
      4. Lemmatize the remaining words with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : object
        The raw comment; it is converted with ``str()`` before processing.

    Returns
    -------
    list of str
        The lemmatized tokens (possibly empty).
    """
    text = str(text)

    # Structural noise must go before punctuation/digits are stripped,
    # otherwise these patterns can never match.
    for pattern in (_URL_RE, _USER_LINK_RE, _IP_RE):
        text = pattern.sub(" ", text)

    # Keep letters only; this also covers newlines, tabs and non-ASCII.
    text = _NON_LETTER_RE.sub(" ", text).lower()

    tokens = [word for word in text.split() if word not in _STOP_WORDS]

    # Return the lemmatized tokens (not the pre-lemmatization list).
    return [_LEMMATIZER.lemmatize(word) for word in tokens]


In [26]:
# Drop the exploratory helper columns added earlier.
# errors='ignore' keeps the cell safe to re-run and removes each column
# independently, so a missing 'multi_label' no longer leaves 'no_label' behind.
train = train.drop(columns=['multi_label', 'no_label'], errors='ignore')

In [27]:
# Vectorizer
# TF-IDF over unigrams, tokenized by clean_text; terms appearing in fewer
# than 10 documents are dropped. use_idf must be a real bool: recent
# scikit-learn versions validate parameter types and reject the integer 1.
# NOTE(review): X_train / X_test are not referenced by the LSTM cells visible
# in this notebook - confirm they are still needed before paying for this fit.
vector = TfidfVectorizer(
    ngram_range=(1, 1),
    analyzer='word',
    tokenizer=clean_text,
    stop_words='english',
    strip_accents='unicode',
    use_idf=True,
    min_df=10,
)
X_train = vector.fit_transform(train['comment_text'])
X_test = vector.transform(test['comment_text'])

## LSTM

In [28]:
# Target label columns, in the order used for the model outputs.
col = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [29]:
# Raw training comment text used as the LSTM input.
sentiment = train.loc[:, 'comment_text']

In [30]:
# Label matrix for training: one row per comment, one column per entry of `col`.
y_train = train[col].to_numpy()

In [31]:
# Build the word index from the training comments (num_words=20000 caps the
# vocabulary used when texts are later converted to sequences).
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(sentiment.tolist())

In [32]:
# Encode each training comment as word indices, then pad/truncate to 100 tokens.
seq = tokenizer.texts_to_sequences(sentiment)
pad = pad_sequences(sequences=seq, maxlen=100)

In [33]:
# Encode the test comments with the tokenizer fitted on the training text,
# then pad/truncate to the same length of 100 tokens.
test_sentiment = test.loc[:, 'comment_text']
test_seq = tokenizer.texts_to_sequences(test_sentiment)
test_pad = pad_sequences(sequences=test_seq, maxlen=100)

In [34]:
def model_(max_len=100, vocab_size=20000, embed_dim=128, lstm_units=50,
           dense_units=50, dropout_rate=0.1, n_labels=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    The defaults reproduce the original architecture: a 100-token input, a
    20000 x 128 embedding, a bidirectional LSTM with 50 units per direction,
    a 50-unit ReLU layer with dropout 0.1 before and after it, and 6 sigmoid
    outputs (one independent probability per label).

    Parameters
    ----------
    max_len : int
        Length of each padded input sequence.
    vocab_size : int
        Input dimension of the embedding layer.
    embed_dim : int
        Output dimension of the embedding layer.
    lstm_units : int
        Units in each direction of the bidirectional LSTM.
    dense_units : int
        Units in the hidden dense layer.
    dropout_rate : float
        Rate used by both dropout layers.
    n_labels : int
        Number of sigmoid output units.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        'accuracy' metric.
    """
    inputs = Input(shape=(max_len, ))
    x = Embedding(vocab_size, embed_dim)(inputs)
    x = Bidirectional(LSTM(lstm_units))(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(n_labels, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


# Call the function defined above (the previous `model_add()` is not defined
# anywhere in this notebook and fails on a fresh kernel).
model = model_()
# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 128)          2560000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 100)              71600     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 50)                5050      
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                           

In [35]:

# Stop when validation loss stops improving.
# NOTE(review): patience=20 can never trigger while epochs=2; lower the
# patience or raise the epoch count if early stopping is meant to be active.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

# Hold out the last 10% of the training data for validation. `callbacks` is
# passed as a list, which is the form the Keras API documents.
model_train = model.fit(
    pad,
    y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    callbacks=[early],
)


Epoch 1/2


2023-01-12 21:36:53.242709: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:36:53.474954: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:36:53.505623: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:36:53.830158: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:36:53.862004: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-01-12 21:41:22.753908: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:41:22.851664: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:41:22.859753: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


In [36]:
# Predicted per-label probabilities for the padded test comments.
y_test = model.predict(
    x=[test_pad],
    batch_size=1024,
    verbose=1,
)

2023-01-12 21:46:10.252979: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:46:10.330594: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-12 21:46:10.352097: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [37]:
# Assemble the predictions into a frame with the test ids as the first column.
lstm = (
    pd.DataFrame(y_test, columns=col)
    .set_axis(test.id, axis=0)
    .reset_index()
)

In [38]:
# Display the prediction frame (the rendered output is truncated by pandas).
lstm

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.936067,6.979026e-02,0.806317,0.020687,0.532997,0.075930
1,0000247867823ef7,0.000502,1.600872e-07,0.000253,0.000003,0.000031,0.000009
2,00013b17ad220c46,0.001934,1.097890e-06,0.001028,0.000016,0.000118,0.000047
3,00017563c3f7919a,0.010419,6.665001e-06,0.002707,0.000129,0.000883,0.000251
4,00017695ad8997eb,0.004132,3.799201e-06,0.001394,0.000060,0.000374,0.000145
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.744648,3.169456e-03,0.357610,0.006847,0.180631,0.016579
153160,fffd7a9a6eb32c16,0.025268,1.827607e-05,0.005273,0.000359,0.001990,0.000663
153161,fffda9e8d6fafa9e,0.005374,4.509513e-06,0.001780,0.000079,0.000488,0.000170
153162,fffe8f1340a79fc2,0.007772,2.483366e-06,0.001914,0.000054,0.000438,0.000130


# Conclusion: 

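### Our results suggest that the LSTM reaches about 99% accuracy in classifying labels, which appears to be higher than the traditional machine learning algorithms. Note that accuracy is an optimistic measure here: roughly 90% of the training comments (143,346 of 159,571) carry no label at all, so a per-label metric such as ROC-AUC or F1 on the held-out test labels is needed to confirm the comparison.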

### In the table above, each entry is the predicted probability that the comment carries the label named by its column (i.e. that the label equals 1).