## Comment Toxicity Classification

+ This folder contains 3 files:
    - train.csv
    - test.csv
    - sample_submission.csv

In [None]:
%ls -l

## Import required packages

+ Basics
+ Visualization
+ Natural language processing tools
+ Feature engineering
+ Setting

In [None]:
import pandas as pd 
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image

### About NLP Libraries

+ Spacy
    - [spaCy tutorial by Analytics Vidhya](https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/)
+ NLTK 
    - [NLTK book](http://www.nltk.org/book/)
+ RE (Regular Expression libraries)
    - [RE tutorial](https://docs.python.org/3/howto/regex.html)

In [None]:
import string
import re    
import nltk
from nltk.corpus import stopwords

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD

stoplist = set(stopwords.words("english"))
%matplotlib inline

### Starting feature engineering

In [None]:
# Load the training and test sets from the ../input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
train.tail(10)

In [None]:
# Column-wise sum over every column from the third onward.
# presumably these are the numeric label columns — verify against train.csv
x = train.iloc[:,2:].sum()
print(x.values)

In [None]:
# Row-wise total over every column from the third onward.
# 'non-toxic' is dropped explicitly so that re-running this cell after the flag
# has been added does not count the flag itself (which would turn every
# previously clean row into a "tagged" one).
label_columns = train.iloc[:, 2:].drop(columns='non-toxic', errors='ignore')
rowsums = label_columns.sum(axis=1)

# A comment is flagged non-toxic when none of the summed columns is set.
train['non-toxic'] = (rowsums == 0)
train['non-toxic'].sum()

In [None]:
# Headline counts: number of rows, rows flagged non-toxic, and the sum of the
# per-column totals held in `x`.
total_comments = len(train)
print("Total comments = ", total_comments)

clean_comments = train['non-toxic'].sum()
print("Total clean comments = ", clean_comments)

total_tags = x.sum()
print("Total tags =", total_tags)

In [None]:
# Per-column totals for every column from the third onward. When the cell that
# adds 'non-toxic' has already run, that flag is part of the totals too.
x = train.iloc[:, 2:].sum()

plt.figure(figsize=(8, 4))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form works on older releases as well.
ax = sns.barplot(x=x.index, y=x.values)
plt.title("Class Distribution")
plt.ylabel('Class frequency', fontsize=15)
plt.xlabel('Class Types', fontsize=15)

# Write each bar's count just above the bar, centred horizontally.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label, ha='center', va='bottom')

plt.show()

In [None]:
# Value in the third row, second column (both by position). Indexing both axes
# through .iloc avoids the deprecated integer-key lookup on a label-indexed row.
train.iloc[2, 1]

## Replacement Algorithm or Modules

In [None]:
# (pattern, replacement) pairs applied in order by RegexpReplacer.
# The specific contractions come first so the generic "n't" rule further down
# never sees them (otherwise "won't" would become "wo not").
# Replacements are raw strings because "\g<1>" is not a valid Python escape.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]


class RegexpReplacer(object):
    """Expand contractions by applying a list of regex substitutions in order."""

    def __init__(self, patterns=replacement_patterns, flags=re.IGNORECASE):
        """Compile the substitution rules.

        Args:
            patterns: iterable of (regex, replacement) pairs. A regex may be a
                string or an already compiled pattern (used as-is).
            flags: `re` flags used when compiling string patterns. Matching is
                case-insensitive by default so that capitalised contractions
                ("Can't", "Won't") hit their specific rule instead of being
                mangled by the generic "n't" rule. Pass 0 for case-sensitive
                matching.
        """
        self.patterns = [
            (re.compile(regex, flags) if isinstance(regex, (str, bytes)) else regex, repl)
            for (regex, repl) in patterns
        ]

    def replace(self, text):
        """Return `text` with every rule applied, in the order they were given."""
        s = text
        for pattern, repl in self.patterns:
            s = pattern.sub(repl, s)
        return s


## Replacing negations with antonyms

In [None]:
from nltk.corpus import wordnet

class AntonymReplacer(object):
    """Swap "not <word>" pairs for a single antonym looked up in WordNet."""

    def replace(self, word, pos=None):
        """Return the antonym of `word` if WordNet offers exactly one, else None.

        Antonym names are gathered from every lemma of every synset of `word`
        (optionally restricted to part of speech `pos`).
        """
        candidates = {
            antonym.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for antonym in lemma.antonyms()
        }
        # Zero or several distinct candidates: no unambiguous replacement.
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, sent):
        """Return a new token list where 'not X' is replaced by X's antonym.

        `sent` is an indexable sequence of tokens. A 'not' that is the last
        token, or whose following word has no single antonym, is kept as-is.
        """
        result = []
        position = 0
        total = len(sent)
        while position < total:
            current = sent[position]
            opposite = None
            if current == 'not' and position + 1 < total:
                opposite = self.replace(sent[position + 1])
            if opposite:
                # The antonym stands in for both 'not' and the word after it.
                result.append(opposite)
                position += 2
            else:
                result.append(current)
                position += 1
        return result


# Noise Removal

In [None]:
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

stoplist = set(stopwords.words("english"))

class Remove_Noise(object):
    """Text clean-up helpers: character-level noise removal and lemmatisation."""

    # Punctuation blanked out by noise_rm. Brackets are escaped so the set does
    # not trigger the "possible nested set" warning. The characters
    # . ! ? " ; - are not in the set and are therefore left in the text.
    _PUNCT_RE = re.compile(r"[#$%^&',:()*+/<=>@\[\]_`{|}~]")
    _DIGITS_RE = re.compile(r'[0-9]+')
    _SPACE_RE = re.compile(r'\s+')

    def __init__(self, stop_word=None):
        """Store the stop-word collection used by `tokenize`.

        Args:
            stop_word: iterable of tokens to drop. When omitted, the
                module-level `stoplist` is used.
        """
        if stop_word is None:
            stop_word = stoplist
        # Kept as a set so the per-token membership test in tokenize is O(1).
        self.stop_word = set(stop_word)
        self._lemmatizer = None

    def noise_rm(self, doc):
        """Return `doc` lowercased, with listed punctuation and digits blanked.

        Punctuation in `_PUNCT_RE` and digit runs are replaced by a space, then
        every run of whitespace (including newlines and tabs) is collapsed to a
        single space so that words on adjacent lines stay separate.
        Leading/trailing spaces are not stripped.
        """
        doc = self._PUNCT_RE.sub(' ', doc)
        doc = self._DIGITS_RE.sub(' ', doc)
        doc = self._SPACE_RE.sub(' ', doc)
        return doc.lower()

    def lemmatize(self, token, tag):
        """Lemmatise `token` using the WordNet POS derived from `tag`.

        Only the first letter of `tag` is used: N/V/R/J map to noun, verb,
        adverb and adjective; anything else (including an empty tag) is
        treated as a noun.
        """
        wordnet_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[:1], wn.NOUN)
        # Build the lemmatizer once and reuse it instead of once per token.
        if getattr(self, '_lemmatizer', None) is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._lemmatizer.lemmatize(token, wordnet_pos)

    def tokenize(self, document):
        """Return the list of lemmas in `document`, skipping stop words.

        The text is split into sentences, each sentence is tokenised and
        POS-tagged, and every token not found in `self.stop_word` is
        lemmatised. The stop-word test is an exact (case-sensitive) match.
        """
        lemmy = []
        for sent in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                if token in self.stop_word:
                    continue
                lemmy.append(self.lemmatize(token, tag))
        return lemmy

In [None]:
# One instance of each text-processing helper, each built with its defaults.
replacer = RegexpReplacer()
remover = Remove_Noise()
AntoRep = AntonymReplacer()

In [None]:
# Replace missing comments with a single space so the regex-based steps below
# always receive a string. The result is assigned back to the column:
# calling fillna(inplace=True) on a selected column is chained assignment,
# which pandas warns about and which does not update the frame under
# Copy-on-Write.
train['comment_text'] = train['comment_text'].fillna(' ')
test['comment_text'] = test['comment_text'].fillna(' ')

In [None]:
# Expand contractions on the raw text. This has to run before noise_rm, which
# replaces apostrophes with spaces and would leave nothing for the
# contraction patterns to match.
train['comment_full'] = train['comment_text'].apply(replacer.replace)
test['comment_full'] = test['comment_text'].apply(replacer.replace)

In [None]:
# Blank out punctuation and digits and lowercase the expanded text
# (see Remove_Noise.noise_rm).
train['Remove_noise'] = train['comment_full'].apply(remover.noise_rm)
test['Remove_noise'] = test['comment_full'].apply(remover.noise_rm)

In [None]:
# Turn each cleaned comment into a list of lemmas with stop words removed.
# Each call runs sentence splitting, tokenisation, POS tagging and
# lemmatisation (see Remove_Noise.tokenize).
train['TokenandLemma'] = train['Remove_noise'].apply(remover.tokenize)
test['TokenandLemma'] = test['Remove_noise'].apply(remover.tokenize)

In [None]:
# Replace each "not <word>" token pair with the word's antonym where WordNet
# offers exactly one.
# NOTE(review): the token lists come from Remove_Noise.tokenize, which has
# already dropped every token in the NLTK English stop list. If that list
# contains 'not' (it appears to), no 'not' token reaches this step and it is
# a no-op — confirm, and consider running it before stop-word removal.
train["Processed"] = train['TokenandLemma'].apply(AntoRep.replace_negations)
test["Processed"] = test['TokenandLemma'].apply(AntoRep.replace_negations)

In [None]:
# Persist both frames, including the intermediate columns added above, as
# pickle files in the current working directory.
train.to_pickle('train_processed.pkl')
test.to_pickle('test_processed.pkl')