# **Importing Libraries**

In [1]:
import numpy as np 
import pandas as pd
import spacy
from spacy.util import minibatch, compounding 
#'minibatch'divide the training data into batches
#compounding gradually increasing the batch size over time
from spacy.training import Example
import random



# **Reading Dataset**

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

#**Texts Processing**

In [3]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
#Removing Newline Characters
train_data['text'] = train_data.comment_text.apply(lambda x: x.replace('\n', ' '))
test_data['text'] = test_data.comment_text.apply(lambda x: x.replace('\n', ' '))

In [5]:
cats = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_prepared_data = []

def format_text_spacy(text):
    return (text.text, {'cats': {cat: text[cat] for cat in cats}})
    
for i in range(0,len(train_data)):
    text = train_data.iloc[i]
    train_prepared_data.append(format_text_spacy(text))

In [6]:
train_prepared_data[0:5]

[("Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
  {'cats': {'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0}}),
 ("D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
  {'cats': {'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0}}),
 ("Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
  {'cats': {'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0

In [7]:
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel")
textcat.add_label("toxic")
textcat.add_label("severe_toxic")
textcat.add_label("obscene")
textcat.add_label("threat")
textcat.add_label("insult")
textcat.add_label("identity_hate")

1

 * The pipeline is now fully set up to handle the text classification task.
 * The model can now be trained on the prepared data using the 'textcat' pipe, and then used to classify new text samples based on their toxicity levels.

# **Training and Testing Data**

In [9]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    for epoch in range(10):
        losses = {}
        batches = minibatch(train_prepared_data[0:10000], size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = []
            for text, annot in batch:
                examples.append(Example.from_dict(nlp.make_doc(text), annot))
            nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
        print("Epoch: {} Loss: {}".format(epoch+1, losses))

Training the model...
Epoch: 1 Loss: {'textcat_multilabel': 38.21415030779394}
Epoch: 2 Loss: {'textcat_multilabel': 23.74933233314141}
Epoch: 3 Loss: {'textcat_multilabel': 18.941587105632607}
Epoch: 4 Loss: {'textcat_multilabel': 14.51262758230223}
Epoch: 5 Loss: {'textcat_multilabel': 11.978524126807088}
Epoch: 6 Loss: {'textcat_multilabel': 10.099753917691714}
Epoch: 7 Loss: {'textcat_multilabel': 8.380332477777529}
Epoch: 8 Loss: {'textcat_multilabel': 7.092953461811078}
Epoch: 9 Loss: {'textcat_multilabel': 5.893946531348661}
Epoch: 10 Loss: {'textcat_multilabel': 5.364915788115653}


In [10]:
test = nlp("Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.")

In [11]:
test.cats

{'toxic': 0.011529107578098774,
 'severe_toxic': 0.0011514670914039016,
 'obscene': 0.0020827658008784056,
 'threat': 0.002530499827116728,
 'insult': 0.0007587347063235939,
 'identity_hate': 0.0008342122309841216}

In [22]:
test_data["text"][153163]

'"   :::Stop already. Your bullshit is not welcome here. I\'m no fool, and if you think that kind of explination is enough, well pity you.    "'

In [23]:
test_2=nlp("   :::Stop already. Your bullshit is not welcome here. I'm no fool, and if you think that kind of explination is enough, well pity you.    ")

In [24]:
test_2.cats

{'toxic': 0.7894149422645569,
 'severe_toxic': 0.0008440674864687026,
 'obscene': 0.31717610359191895,
 'threat': 0.004593106918036938,
 'insult': 0.02608342468738556,
 'identity_hate': 0.006667386740446091}