In [None]:
import os
# Print the full path of every file under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    for path in paths:
        print(path)

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the zipped training CSV into a DataFrame, then show the frame and its shape.
df = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
    sep=',',
    low_memory=False,
)
print(df)
print(df.shape)

In [None]:
# List every column of the dataframe together with its dtype.
print(df.dtypes)

In [None]:
# Shuffle the rows once so that the positional train/test split further down is
# effectively random (this notebook slices by position instead of calling
# train_test_split).
# A fixed seed keeps the row order -- and therefore every score computed from the
# split -- reproducible across "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
df = df.reindex(np.random.RandomState(SHUFFLE_SEED).permutation(df.index))

In [None]:
# Show the frame again to confirm that the rows now appear in shuffled order.
print(df)

**Separate the comment text from the outcome labels**

In [None]:
# Pull out the comment text column, preview it, and keep it as a NumPy array.
comment_series = df['comment_text']
print(comment_series.head())
comment = comment_series.to_numpy()

In [None]:
# The six target columns, in the order used by every later cell.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_frame = df[label_columns]
print(label_frame.head())
# One row per comment, one column per label.
label = label_frame.to_numpy()

In [None]:
# Show the label matrix (rows = comments, columns = the six selected label columns).
print(label)

Find out how often multi-labelled samples occur

In [None]:
# Number of labels set on each comment (one count per row of the label matrix).
labels_per_row = np.count_nonzero(label, axis=1)
# ct1: comments carrying at least one label
ct1 = int(np.count_nonzero(labels_per_row))
# ct2: comments carrying two or more labels
ct2 = int(np.count_nonzero(labels_per_row > 1))
print(ct1)
print(ct2)

**Data Visualisations**

Analyse the number of comments in each length range from 0 to 1200 characters

In [None]:
# Character length of every comment.
x = [len(text) for text in comment]

mean_length = sum(x)/len(x)
print('average length of comment: {:.3f}'.format(mean_length) )

# Histogram bin edges (also reused by the per-label histogram below).
bins = [1,200,400,600,800,1000,1200]
plt.hist(x, bins=bins)
plt.xlabel('Length of comments')
plt.ylabel('Number of comments')
# Fix both axes so the chart covers lengths 0-1200 and counts 0-90000.
plt.axis([0, 1200, 0, 90000])
plt.grid(True)
plt.show()

Number of comments flagged as toxic, severe_toxic, etc., broken down by comment length

In [None]:
# y[i, j] holds the length of comment i when label j is set on it and 0 otherwise,
# so each column can be histogrammed as "lengths of the comments carrying label j".
comment_lengths = np.array([len(text) for text in comment])
y = ((label != 0) * comment_lengths[:, np.newaxis]).astype(float)

labelsplt = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
color = ['red','green','blue','yellow','orange','chartreuse']
# The first bin edge is 1, so the 0 placeholders written for unset labels are not counted.
plt.hist(y, bins=bins, label=labelsplt, color=color)
plt.axis([0, 1200, 0, 8000])
plt.xlabel('Length of comments')
plt.ylabel('Number of comments')
plt.legend()
plt.grid(True)
plt.show()

**Removing Stop Words**

In [None]:
import nltk
# Fetch the NLTK 'stopwords' resource that the next cell reads via nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list; it is extended in the next cell and later
# handed to CountVectorizer as the custom stop-word list.
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Also treat the empty string and every single letter from 'b' to 'z' as stop words.
single_letters = [chr(code) for code in range(ord('b'), ord('z') + 1)]
stop_words.extend([''] + single_letters)
print(stop_words)

**Stemming and Lemmatizing**

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Create the lemmatizer and stemmer that the preprocessing loop applies to every word.
lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()
# Download the WordNet corpus; WordNetLemmatizer needs it for its lookups.
nltk.download('wordnet')

**We can now loop once through all the comments, applying:**

* punctuation removal
* splitting the words by space
* applying stemmer and lemmatizer
* recombining the words again for further processing

In [None]:
import string

# Translation table for the punctuation-removal step. No cell in this notebook
# defines `trantab`, so on a fresh kernel the loop below raised NameError.
# Every ASCII punctuation character is mapped to a space (rather than deleted) so
# that words joined by punctuation, e.g. "hello,world", are separated by split().
# NOTE(review): the table used for the originally reported results is not visible
# here -- confirm this matches the intended cleaning.
trantab = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Normalise every comment in place: lower-case, strip punctuation, then lemmatize
# (as a verb) and stem each whitespace-separated word before re-joining with spaces.
for i in range(len(comment)):
    words = comment[i].lower().translate(trantab).split()
    comment[i] = " ".join(
        stemmer.stem(lemmatiser.lemmatize(word, pos="v")) for word in words
    )

In [None]:
# Sanity check: container type and number of preprocessed comments.
type(comment), len(comment)

**Applying Count Vectorizer**

Here we finally convert our comments into a matrix of token counts, where each entry is the number of times a token occurs in a comment.

In [None]:
#import required library
from sklearn.feature_extraction.text import CountVectorizer

# Create the vectorizer, supplying our custom (extended) stop-word list.
count_vector = CountVectorizer(stop_words=stop_words)

# Learn the vocabulary and build the document-term count matrix, then convert the
# result to a dense NumPy array.
# NOTE(review): densifying the whole matrix can need a very large amount of memory
# for a big corpus/vocabulary -- confirm it fits, or keep the matrix sparse for the
# estimators that accept sparse input.
tf = count_vector.fit_transform(comment).toarray()

In [None]:
# Shape of the document-term matrix produced by the vectorizer.
print(tf.shape)

**Splitting dataset into training and testing**

In [None]:
def shuffle(matrix, target, test_proportion):
    """Split features and targets into train/test parts by row position.

    Despite its name this function does not reorder anything: the first
    ``int(n_rows / test_proportion)`` rows become the test set and the
    remaining rows become the training set.

    Args:
        matrix: 2-D feature array, one row per sample.
        target: 2-D target array with the same number of rows as ``matrix``.
        test_proportion: divisor applied to the row count to size the test set
            (e.g. ``3`` holds out one third of the rows).

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    n_test = int(matrix.shape[0] / test_proportion)
    test_rows = slice(None, n_test)
    train_rows = slice(n_test, None)
    return (
        matrix[train_rows, :],
        matrix[test_rows, :],
        target[train_rows, :],
        target[test_rows, :],
    )

# Hold out the first third of the (already shuffled) rows for testing.
# The label matrix built earlier in this notebook is named `label`; the previous
# call passed the undefined name `labels`, which raises NameError on a fresh kernel.
X_train, X_test, Y_train, Y_test = shuffle(tf, label,3)

print(X_test.shape)
print(X_train.shape)

**Finalising Evaluation Metric - Example based metrics**

**1. Label based metrics**

These include one-error, average precision, etc. They are calculated separately for each label and then averaged over all labels, without taking any relation between the labels into account.

**2. Example based metrics**

These include accuracy, Hamming loss, etc. They are calculated for each example and then averaged across the test set.

**defining the evaluation metrics**

In [None]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

def evaluate_score(Y_test,predict): 
    """Print Hamming loss, subset accuracy and log loss for a set of predictions.

    Args:
        Y_test: true label indicator matrix, one row per sample.
        predict: predicted label matrix of the same shape; either a dense array
            or an object exposing ``toarray()`` (such as a sparse matrix).

    Returns:
        None. The three scores are printed; Hamming loss and accuracy are shown
        as percentages.
    """
    # Densify sparse predictions once, up front. The earlier version wrapped
    # log_loss in a bare `except:` and retried with predict.toarray(), which hid
    # genuine log_loss errors and turned them into a misleading AttributeError
    # whenever `predict` was already a dense array.
    if hasattr(predict, "toarray"):
        predict = predict.toarray()
    loss = hamming_loss(Y_test,predict)
    print("Hamming_loss : {}".format(loss*100))
    accuracy = accuracy_score(Y_test,predict)
    print("Accuracy : {}".format(accuracy*100))
    loss = log_loss(Y_test,predict)
    print("Log_loss : {}".format(loss))

**Applying algorithmic techniques to build a multi-label classifier**

**1. Problem transformation methods** like binary relevance method, label power set, classifier chain and random k-label sets (RAKEL) algorithm 

**2. Adaptation algorithms** like the AdaBoost MH, AdaBoost MR, k-nearest neighbours, decision trees and back propagation-multi label neural networks(BP-MLL).

**I. Problem Transformation Methods**

(The scikit-multilearn library is used to implement the various methods, with base classifiers such as Multinomial Naive Bayes, Gaussian Naive Bayes and SVC.)

**1. Binary Relevance (BR) Method with MultinomialNB classifiers**

In [None]:
pip install scikit-multilearn

In [None]:
from sklearn.naive_bayes import MultinomialNB
# clf holds one MultinomialNB per label (6 in total); classifier ix is trained on
# the full training features against label column ix only.
clf = [MultinomialNB().fit(X_train, Y_train[:, ix]) for ix in range(6)]

In [None]:
# Collect one prediction vector per label classifier, then transpose so that rows
# are test samples and columns are labels.
per_label_predictions = [clf[ix].predict(X_test) for ix in range(6)]

predict = np.asarray(per_label_predictions).T
print(predict.shape)

In [None]:
# Print Hamming loss, accuracy and log loss for the per-label MultinomialNB predictions.
evaluate_score(Y_test,predict)

**2. BR Method with SVM classifier (from scikit-multilearn)**

In [None]:
#create and fit classifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
# Wrap an SVC with default settings in scikit-multilearn's BinaryRelevance and fit it
# on the training split.
# NOTE(review): require_dense presumably means [features, labels] -- sparse features
# allowed, dense labels required; confirm against the scikit-multilearn docs.
classifier = BinaryRelevance(classifier = SVC(), require_dense = [False, True])
classifier.fit(X_train, Y_train)

In [None]:
# Predict the label matrix for the test split with the BinaryRelevance + SVC model.
predictions = classifier.predict(X_test)
# Print Hamming loss, accuracy and log loss for these predictions.
evaluate_score(Y_test,predictions)

**3. BR Method with Multinomial classifier (from scikit-multilearn)**

In [None]:
# Same BinaryRelevance wrapper as above, now with MultinomialNB as the base
# classifier; `classifier` is rebound, replacing the SVC-based model.
classifier = BinaryRelevance(classifier = MultinomialNB(), require_dense = [False, True])
classifier.fit(X_train, Y_train)

In [None]:
# Predict the label matrix for the test split with the BinaryRelevance + MultinomialNB model.
predictions = classifier.predict(X_test)
# Print Hamming loss, accuracy and log loss for these predictions.
evaluate_score(Y_test,predictions)

**4. BR Method with GaussianNB classifier (from scratch)**

In [None]:
from sklearn.naive_bayes import GaussianNB
# One GaussianNB per label (6 in total), each fitted on the full training features
# against its own label column. Rebinds `clf`, replacing the MultinomialNB list.
clf = [GaussianNB().fit(X_train, Y_train[:, ix]) for ix in range(6)]

In [None]:
# One prediction vector per label classifier; kept as a list here and transposed
# into a (samples, labels) array in the next cell.
predict = [clf[ix].predict(X_test) for ix in range(6)]

In [None]:
# Transpose the list of per-label prediction vectors so that rows are test samples
# and columns are labels, then print the scores.
# NOTE: this cell rebinds `predict`, so running it twice in a row transposes twice.
predict = np.asarray(np.transpose(predict))
evaluate_score(Y_test,predict)

**5. Classifier chain with MultinomialNB classifier (from scikit-multilearn)**

In [None]:
#create and fit classifier
from skmultilearn.problem_transform import ClassifierChain
# Wrap MultinomialNB in scikit-multilearn's ClassifierChain and fit it on the
# training split; `classifier` is rebound to this model.
classifier = ClassifierChain(MultinomialNB())
classifier.fit(X_train, Y_train)

In [None]:
# Predict the label matrix for the test split with the ClassifierChain model.
predictions = classifier.predict(X_test)
# Print Hamming loss, accuracy and log loss for these predictions.
evaluate_score(Y_test,predictions)

**6. Label Powerset with MultinomialNB classifier (from scikit-multilearn)**

In [None]:
#create and fit classifier
from skmultilearn.problem_transform import LabelPowerset
# Wrap MultinomialNB in scikit-multilearn's LabelPowerset and fit it on the
# training split; `classifier` is rebound to this model.
classifier = LabelPowerset(MultinomialNB())
classifier.fit(X_train, Y_train)

In [None]:
# Predict the label matrix for the test split with the LabelPowerset model,
# then print Hamming loss, accuracy and log loss.
predictions = classifier.predict(X_test)
evaluate_score(Y_test,predictions)

**II. Adaptation Algorithms**

**7. MLkNN with k=2 (from scikit-multilearn) (Multi label version of K-nearest neighbours)**

In [None]:
#create and fit classifier
from skmultilearn.adapt import MLkNN
# Multi-label k-nearest-neighbours classifier from scikit-multilearn with k=2,
# fitted on the training split; `classifier` is rebound to this model.
classifier = MLkNN(k=2)
classifier.fit(X_train, Y_train)

In [None]:
# Predict the label matrix for the test split with the MLkNN model.
predictions = classifier.predict(X_test)
# Print Hamming loss, accuracy and log loss for these predictions.
evaluate_score(Y_test,predictions)

**8. BP-MLL Neural Networks (from scratch) (Back propagation Multi-label Neural Networks)**

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [None]:
# Define the model architecture: one small hidden layer with dropout, followed by
# one output unit per label.
model = Sequential()
model.add(Dense(4, activation='relu', input_dim = X_train.shape[1]))
model.add(Dropout(0.3))
# Each of the 6 labels is an independent yes/no decision, so every output unit gets
# its own sigmoid. A softmax forces the six outputs to sum to 1, which means at most
# one of them can exceed 0.5 and the rounded prediction could never contain more
# than one label per comment.
model.add(Dense(6, activation='sigmoid'))
model.summary()

In [None]:
# Compile the model.
# The targets are multi-label indicator rows (zero, one or several 1s per comment).
# categorical_crossentropy only sums -y*log(p) over the positive entries, so rows with
# no label at all contribute zero loss and the absent labels are never penalised.
# binary_crossentropy scores each of the 6 labels as its own yes/no problem.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Fit the network while checkpointing the best weights seen so far.
import os
from keras.callbacks import ModelCheckpoint  

# Make sure the checkpoint directory exists before the first save is attempted.
os.makedirs('saved_models', exist_ok=True)

# fit() is called without validation data, so the callback's default monitor
# ('val_loss') would never be available; track the training loss instead.
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.myneural.h5py', 
                               monitor='loss', verbose=1, save_best_only=True)
# The callback only takes effect when it is handed to fit(); previously it was
# created but never passed, so nothing was ever saved.
model.fit(X_train, Y_train, epochs=10, batch_size=32, callbacks=[checkpointer])

In [None]:
# Per-label output scores of the network for the test split (rounded to 0/1 in the
# scoring cell that follows).
predict = model.predict(X_test)

In [None]:
# Log loss is computed on the raw network outputs, before any rounding.
print("Log_loss : {}".format(log_loss(Y_test, predict)))
# Round the outputs to hard 0/1 labels for the two label-based scores.
predict = np.round(predict)
print("Hamming_loss : {}".format(hamming_loss(Y_test, predict) * 100))
print("Accuracy : {}".format(accuracy_score(Y_test, predict) * 100))

**improving the BP-MLL model**

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers

# Candidate values for the hyper-parameter grid search below.
nodes = [16, 32, 64] # number of nodes in the hidden layer
lrs = [0.001, 0.002, 0.003] # learning rate, default = 0.001
epochs = [10,20,30] # number of training epochs to try
# NOTE(review): batch_size is not part of param_grid in the fitting cell below.
batch_size = 64

In [None]:
def create_model(nodes=10,lr=0.001):
    """Build and compile the one-hidden-layer multi-label network.

    Args:
        nodes: number of units in the hidden layer.
        lr: learning rate passed to the RMSprop optimizer.

    Returns:
        A compiled Keras ``Sequential`` model with 6 output units, one per label.
        The input width is read from the notebook-level ``X_train``.
    """
    model = Sequential()
    model.add(Dense(nodes, activation='relu', input_dim = X_train.shape[1]))
    model.add(Dropout(0.3))
    # Multi-label output: each label is an independent yes/no decision, so use one
    # sigmoid per label with binary cross-entropy. The earlier softmax +
    # categorical_crossentropy pairing forces the six outputs to sum to 1 (so at
    # most one label can round to 1) and gives zero loss on rows with no label.
    model.add(Dense(6, activation='sigmoid'))
    opt = optimizers.RMSprop(lr=lr)
    model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

# Wrap the model factory so scikit-learn's GridSearchCV can drive it.
# `batch_size` is defined alongside the other search settings but was never handed
# to anything, so training silently used the wrapper's default; pass it explicitly.
model = KerasClassifier(build_fn=create_model, batch_size=batch_size)

In [None]:
# Start the fitting process.
# The grid covers every combination of epochs, hidden-layer size and learning rate
# (3 x 3 x 3 = 27 candidates); `nodes` and `lr` are forwarded to create_model.
param_grid = dict(epochs=epochs,nodes=nodes, lr=lrs)
# refit=True retrains the best candidate on the whole training split afterwards,
# which is what makes grid.predict / grid.predict_proba usable below.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1,refit=True,verbose=2)
grid_result = grid.fit(X_train, Y_train)

In [None]:
# Show the value returned by grid.fit.
print(grid_result)

In [None]:
# Report the best estimator, its cross-validation score and its parameters.
print('Best estimator : {}'.format (grid.best_estimator_))
print('Best score : {}'.format(grid.best_score_))
print('Best params : {}'.format(grid.best_params_))

In [None]:
# Dump the full cross-validation results for every candidate in the grid.
print(grid.cv_results_)

In [None]:
# Predictions from the grid search's refit best estimator.
# NOTE(review): `predictions` is not read by any later cell visible in this notebook;
# the scoring cell below uses the predict_proba output instead.
predictions = grid.predict(X_test)

In [None]:
# Per-label output scores from the refit best estimator; rounded to 0/1 in the
# scoring cell that follows.
predict = grid.predict_proba(X_test)
print(predict.shape)

In [None]:
# Log loss is computed on the raw output scores, before any rounding.
print("Log_loss : {}".format(log_loss(Y_test, predict)))
# Round the scores to hard 0/1 labels for the two label-based scores.
predict = np.round(predict)
print("Hamming_loss : {}".format(hamming_loss(Y_test, predict) * 100))
print("Accuracy : {}".format(accuracy_score(Y_test, predict) * 100))

**Visualisation**

Let us plot the Hamming loss and log loss of the different models we selected.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
import itertools

**Hamming Loss**

In [None]:
# Hamming loss per model. The values are hard-coded here rather than read from the
# scoring cells above.
x = ['BR-MultNB','BR-GausNB','BR-SVC','CC-MultNB','LP-MultNB','BP-MLL-ini','BP-MLL-fin']
y = [3.27,20.74,4.26,3.56,3.17,13.96,15.158]
colors = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
plt.ylabel('Hamming-Loss')
plt.xlabel('Model-details')
plt.xticks(rotation=90)
# One bar per model, each drawn in the next colour of the cycle.
for model_name, score in zip(x, y):
    plt.bar(model_name, score, color=next(colors))
plt.show()

**Log Loss**

In [None]:
# Log loss per model. The values are hard-coded here rather than read from the
# scoring cells above.
x = ['BR-MultNB','BR-GausNB','BR-SVC','CC-MultNB','LP-MultNB','BP-MLL-ini','BP-MLL-fin']
y = [1.92,1.422,0.46,1.5,1.47,0.36,0.35]
colors = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
plt.ylabel('Log-Loss')
plt.xlabel('Model-details')
plt.xticks(rotation=90)
# One bar per model, each drawn in the next colour of the cycle.
for model_name, score in zip(x, y):
    plt.bar(model_name, score, color=next(colors))
plt.show()