In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# References

There is a lot to learn from these notebooks. I took some time to study them while writing this kernel; they are really interesting and very useful on real-life datasets.
* [NLP with Disaster Tweets - EDA, Cleaning and BERT](https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert)
* [Improve your Score with Text Preprocessing -- V2](https://www.kaggle.com/theoviel/improve-your-score-with-text-preprocessing-v2/notebook)
* [How to: Preprocessing when using embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings)

In [None]:
import gc
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from tqdm import tqdm

tqdm.pandas()

from wordcloud import STOPWORDS
from plotly.subplots import make_subplots

random_seed = 73  # passed as random_state to train_test_split and LogisticRegression further down

In [None]:
# Load the training CSV of the Jigsaw toxic-comment challenge (read straight from the zip) and preview it.
train_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
train_df.head()

In [None]:
# Load the test CSV of the same challenge and preview it.
test_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_df.head()

# EDA

In [None]:
# Per-comment size features for the training set:
# raw character count and whitespace-delimited token count.
train_comments = train_df['comment_text']
train_df['characters length'] = train_comments.map(len)
train_df['words length'] = train_comments.map(lambda text: len(text.split()))

In [None]:
# Same size features for the test set:
# raw character count and whitespace-delimited token count.
test_comments = test_df['comment_text']
test_df['characters length'] = test_comments.map(len)
test_df['words length'] = test_comments.map(lambda text: len(text.split()))

## Characters Length Distribution Train

In [None]:
# Summary statistics, then a histogram with a marginal box plot, of training comment lengths in characters.
print(train_df['characters length'].describe())
fig = px.histogram(train_df, x='characters length', marginal='box')
fig.show()

## Characters Length Distribution Test

In [None]:
# Summary statistics, then a histogram with a marginal box plot, of test comment lengths in characters.
print(test_df['characters length'].describe())
fig = px.histogram(test_df, x='characters length', marginal='box')
fig.show()

## Word Length Distribution Train

In [None]:
# Summary statistics, then a histogram with a marginal box plot, of training comment lengths in words.
print(train_df['words length'].describe())
fig = px.histogram(train_df, x='words length', marginal='box')
fig.show()

## Word Length Distribution Test

In [None]:
# Summary statistics, then a histogram with a marginal box plot, of test comment lengths in words.
print(test_df['words length'].describe())
fig = px.histogram(test_df, x='words length', marginal='box')
fig.show()

## Target Distributions

In [None]:
# One histogram per target column on a 2x3 grid. The targets are read by
# position (columns 2..7 of train_df), exactly as the original counter did.
# NOTE(review): assumes those six positions are the label columns - confirm.
target_columns = list(train_df.columns[2:8])

# Title each panel with its column name; traces copied out of a px figure
# do not carry their axis titles into the subplot grid.
fig = make_subplots(rows=2, cols=3, subplot_titles=target_columns)
for position, column in enumerate(target_columns):
    grid_row, grid_col = divmod(position, 3)
    fig.add_trace(px.histogram(train_df, x=column, text_auto=True)['data'][0],
                  row=grid_row + 1, col=grid_col + 1)

# bargap is a figure-level setting, so it only needs to be applied once.
fig.update_layout(bargap=0.2)
fig.show()

* Each class is also highly unbalanced

## Number of Target Labels Assigned to Each Comment

In [None]:
# Row-wise sum over every column except the first two and the last two,
# shown as a histogram of how many labels each comment carries.
# NOTE(review): assumes that slice holds only the 0/1 label columns - confirm.
fig = px.histogram(train_df.iloc[:, 2:-2].sum(axis=1), barmode='group', text_auto=True)
fig.update_layout(bargap=0.3)
fig.show()

* We can see that most of the comments are not toxic
* The data is multi-label; the second most common case is a comment with exactly one label

## Build Vocab

In [None]:
def build_vocab(sentences):
    """Count how often each token occurs across tokenised sentences.

    Args:
        sentences: iterable of token sequences (one sequence per sentence).
            It is wrapped in ``tqdm`` so a progress bar is shown.

    Returns:
        dict mapping each token to its total number of occurrences, in
        first-seen order.
    """
    counts = {}
    for tokens in tqdm(sentences):
        for token in tokens:
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
    return counts

In [None]:
# Whitespace-tokenise the raw (case-preserved) comments and count every token.
sentences = train_df['comment_text'].apply(str.split).to_numpy()
vocab = build_vocab(sentences)
# Ten most frequent tokens with their counts.
sorted(vocab.items(), key=lambda item: item[1], reverse=True)[:10]

# N-grams

In [None]:
def generate_ngram_vocab(sentences, n_gram=1):
    """Count n-grams over tokenised sentences, skipping empty tokens and stop words.

    Args:
        sentences: iterable of token sequences.
        n_gram: number of consecutive kept tokens joined into one n-gram.

    Returns:
        dict mapping each space-joined n-gram to its occurrence count.
    """
    counts = {}
    for sentence in sentences:
        # Filter first, so n-grams are formed over the surviving tokens only.
        kept = [tok for tok in sentence if tok != '' and tok not in STOPWORDS]
        # zip over progressively shifted copies yields each window of n_gram tokens.
        shifted = [kept[offset:] for offset in range(n_gram)]
        for parts in zip(*shifted):
            gram = " ".join(parts)
            counts[gram] = counts.get(gram, 0) + 1
    return counts

## Unigrams, Bigrams and Trigrams

### Unigram Analysis

In [None]:
def _label_tokens(label):
    """Lower-cased, whitespace-split comments of the rows where `label` equals 1."""
    flagged = train_df.loc[train_df[label] == 1, 'comment_text']
    return flagged.apply(lambda text: text.lower().split()).to_numpy()


toxic_sent = _label_tokens('toxic')
severe_toxic_sent = _label_tokens('severe_toxic')
obscene_sent = _label_tokens('obscene')
threat_sent = _label_tokens('threat')
insult_sent = _label_tokens('insult')
identity_hate_sent = _label_tokens('identity_hate')

In [None]:
# Unigram counts for each label's comments, unpacked in label order.
(
    toxic_unigram_dict,
    severe_toxic_unigram_dict,
    obscene_unigram_dict,
    threat_unigram_dict,
    insult_unigram_dict,
    identity_hate_unigram_dict,
) = (
    generate_ngram_vocab(label_sentences, n_gram=1)
    for label_sentences in (
        toxic_sent,
        severe_toxic_sent,
        obscene_sent,
        threat_sent,
        insult_sent,
        identity_hate_sent,
    )
)

In [None]:
# Two-column frames (0 = unigram, 1 = count) ordered from most to least frequent.
(
    toxic_unigrams_df,
    severe_toxic_unigrams_df,
    obscene_unigrams_df,
    threat_unigrams_df,
    insult_unigrams_df,
    identity_hate_unigrams_df,
) = (
    pd.DataFrame(sorted(counts.items(), key=lambda pair: pair[1])[::-1])
    for counts in (
        toxic_unigram_dict,
        severe_toxic_unigram_dict,
        obscene_unigram_dict,
        threat_unigram_dict,
        insult_unigram_dict,
        identity_hate_unigram_dict,
    )
)

In [None]:
# Horizontal bar charts of the N most frequent unigrams, one panel per label.
N = 100
unigram_panels = [
    ('Toxic', toxic_unigrams_df),
    ('Severe Toxic', severe_toxic_unigrams_df),
    ('Obscene', obscene_unigrams_df),
    ('Threat', threat_unigrams_df),
    ('Insult', insult_unigrams_df),
    ('Identity Hate', identity_hate_unigrams_df),
]

fig, axes = plt.subplots(ncols=6, figsize=(18, 50), dpi=100)
for ax, (label, frame) in zip(axes, unigram_panels):
    # Column 0 holds the n-gram text, column 1 its count.
    sns.barplot(y=frame[0].values[:N], x=frame[1].values[:N], ax=ax)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=13)
    # Name the n-gram order in the title so this figure is not confused
    # with the bigram and trigram figures.
    ax.set_title(f'Top {N} {label} Unigrams', fontsize=15)

# Lay out after drawing so the tick labels are taken into account.
plt.tight_layout()
plt.show()

### Bigram Analysis

In [None]:
# Bigram counts for each label's comments, unpacked in label order.
(
    toxic_bigram_dict,
    severe_toxic_bigram_dict,
    obscene_bigram_dict,
    threat_bigram_dict,
    insult_bigram_dict,
    identity_hate_bigram_dict,
) = (
    generate_ngram_vocab(label_sentences, n_gram=2)
    for label_sentences in (
        toxic_sent,
        severe_toxic_sent,
        obscene_sent,
        threat_sent,
        insult_sent,
        identity_hate_sent,
    )
)

In [None]:
# Two-column frames (0 = bigram, 1 = count) ordered from most to least frequent.
# Every frame is built from its own *_bigram_dict; the severe-toxic frame must
# not read the unigram counts.
toxic_bigrams_df = pd.DataFrame(sorted(toxic_bigram_dict.items(), key=lambda x: x[1])[::-1])
severe_toxic_bigrams_df = pd.DataFrame(sorted(severe_toxic_bigram_dict.items(), key=lambda x: x[1])[::-1])
obscene_bigrams_df = pd.DataFrame(sorted(obscene_bigram_dict.items(), key=lambda x: x[1])[::-1])
threat_bigrams_df = pd.DataFrame(sorted(threat_bigram_dict.items(), key=lambda x: x[1])[::-1])
insult_bigrams_df = pd.DataFrame(sorted(insult_bigram_dict.items(), key=lambda x: x[1])[::-1])
identity_hate_bigrams_df = pd.DataFrame(sorted(identity_hate_bigram_dict.items(), key=lambda x: x[1])[::-1])

In [None]:
# Horizontal bar charts of the N most frequent bigrams, one panel per label.
N = 100
bigram_panels = [
    ('Toxic', toxic_bigrams_df),
    ('Severe Toxic', severe_toxic_bigrams_df),
    ('Obscene', obscene_bigrams_df),
    ('Threat', threat_bigrams_df),
    ('Insult', insult_bigrams_df),
    ('Identity Hate', identity_hate_bigrams_df),
]

fig, axes = plt.subplots(ncols=6, figsize=(18, 50), dpi=100)
for ax, (label, frame) in zip(axes, bigram_panels):
    # Column 0 holds the n-gram text, column 1 its count.
    sns.barplot(y=frame[0].values[:N], x=frame[1].values[:N], ax=ax)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=13)
    # Name the n-gram order in the title so this figure is not confused
    # with the unigram and trigram figures.
    ax.set_title(f'Top {N} {label} Bigrams', fontsize=15)

# Lay out after drawing so the tick labels are taken into account.
plt.tight_layout()
plt.show()

### Trigram Analysis

In [None]:
# Trigram counts for each label's comments, unpacked in label order.
(
    toxic_trigram_dict,
    severe_toxic_trigram_dict,
    obscene_trigram_dict,
    threat_trigram_dict,
    insult_trigram_dict,
    identity_hate_trigram_dict,
) = (
    generate_ngram_vocab(label_sentences, n_gram=3)
    for label_sentences in (
        toxic_sent,
        severe_toxic_sent,
        obscene_sent,
        threat_sent,
        insult_sent,
        identity_hate_sent,
    )
)

In [None]:
# Two-column frames (0 = trigram, 1 = count) ordered from most to least frequent.
(
    toxic_trigrams_df,
    severe_toxic_trigrams_df,
    obscene_trigrams_df,
    threat_trigrams_df,
    insult_trigrams_df,
    identity_hate_trigrams_df,
) = (
    pd.DataFrame(sorted(counts.items(), key=lambda pair: pair[1])[::-1])
    for counts in (
        toxic_trigram_dict,
        severe_toxic_trigram_dict,
        obscene_trigram_dict,
        threat_trigram_dict,
        insult_trigram_dict,
        identity_hate_trigram_dict,
    )
)

In [None]:
# Horizontal bar charts of the N most frequent trigrams, one panel per label.
N = 100
trigram_panels = [
    ('Toxic', toxic_trigrams_df),
    ('Severe Toxic', severe_toxic_trigrams_df),
    # The third panel shows the obscene trigrams, not a second copy of the threat ones.
    ('Obscene', obscene_trigrams_df),
    ('Threat', threat_trigrams_df),
    ('Insult', insult_trigrams_df),
    ('Identity Hate', identity_hate_trigrams_df),
]

fig, axes = plt.subplots(ncols=6, figsize=(18, 50), dpi=100)
for ax, (label, frame) in zip(axes, trigram_panels):
    # Column 0 holds the n-gram text, column 1 its count.
    sns.barplot(y=frame[0].values[:N], x=frame[1].values[:N], ax=ax)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=13)
    # Name the n-gram order in the title so this figure is not confused
    # with the unigram and bigram figures.
    ax.set_title(f'Top {N} {label} Trigrams', fontsize=15)

# Lay out after drawing so the tick labels are taken into account.
plt.tight_layout()
plt.show()

* We can see that threat sentences contain real threat words
* Hate sentences also differ clearly from the other types of sentences
* Particular words are repeated more than once, and there are exclamation marks as well. We can try to remove those with tokenizers.

# Embeddings and Cleaning Text

In [None]:
def load_embed(file):
    """Read a space-separated text embedding file into a dict.

    Each line is split on single spaces; the first field is the word and the
    remaining fields are parsed as one float32 vector.

    Args:
        file: path of the embedding text file (decoded as latin-1).

    Returns:
        dict mapping word -> numpy float32 array. If a word appears on
        several lines, the last one wins.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The context manager closes the handle as soon as the dict is built,
    # including when a malformed line raises part-way through.
    with open(file, encoding='latin') as handle:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in handle)

    return embeddings_index

In [None]:
import operator

def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by an embedding table.

    Prints the share of distinct words that have an embedding and the share
    of all word occurrences they account for.

    Args:
        vocab: dict mapping word -> occurrence count.
        embeddings_index: container supporting ``word in embeddings_index``.

    Returns:
        list of ``(word, count)`` tuples for the words without an embedding,
        most frequent first.
    """
    unknown_words = {}
    nb_known_types = 0      # distinct words found in the embedding table
    nb_known_words = 0      # occurrences of words found
    nb_unknown_words = 0    # occurrences of words missing
    for word, count in vocab.items():
        # Explicit membership test instead of a bare except around the lookup,
        # so unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        if word in embeddings_index:
            nb_known_types += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    # Guard both ratios so an empty vocabulary reports 0% instead of
    # raising ZeroDivisionError.
    total_words = nb_known_words + nb_unknown_words
    vocab_share = nb_known_types / len(vocab) if vocab else 0.0
    text_share = nb_known_words / total_words if total_words else 0.0

    print('Found embeddings for {:.3%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.3%} of all text'.format(text_share))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [None]:
glove_path = "../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl"
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code on load; only point this at a file from a trusted source.
glove_emb = np.load(glove_path, allow_pickle=True)

In [None]:
# Coverage of the case-preserved vocabulary by the loaded GloVe table.
print("Glove 840 : ")
oov_glove = check_coverage(vocab, glove_emb)

In [None]:
# Ten most frequent words that have no GloVe vector.
print(oov_glove[:10])

In [None]:
# Same vocabulary count as before, but on lower-cased comments.
sentences_low = train_df['comment_text'].apply(lambda text: text.lower().split()).to_numpy()
vocab_low = build_vocab(sentences_low)
# Ten most frequent lower-cased tokens with their counts.
sorted(vocab_low.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
# Coverage of the lower-cased vocabulary by the same GloVe table.
print("Glove : ")
oov_glove = check_coverage(vocab_low, glove_emb)

* After lowercasing the text, we lose a significant amount of embedding coverage

In [None]:
# Ten most frequent lower-cased words that have no GloVe vector.
print(oov_glove[:10])

* Contractions and special characters are causing the problem

In [None]:
def add_lower(embedding, vocab):
    """Give lower-case spellings the vector of their cased form, in place.

    For every word of ``vocab`` that has an entry in ``embedding`` while its
    lower-case form does not, the lower-case form is added with the same
    value. Prints how many entries were added.

    Args:
        embedding: mutable mapping word -> vector; modified in place.
        vocab: iterable of words to consider.
    """
    added = 0
    for word in vocab:
        lowered = word.lower()
        if word not in embedding or lowered in embedding:
            continue
        embedding[lowered] = embedding[word]
        added += 1
    print(f"Added {added} words to embedding")

In [None]:
# Add lower-case aliases to the GloVe table for cased vocabulary words whose lower-case form is missing.
print("Glove : ")
add_lower(glove_emb, vocab)

In [None]:
# English contractions and their expanded forms. Lookups are exact and
# case-sensitive, so only the capitalised variants spelled out here
# (the "I..." forms) are matched in addition to the lower-case ones.
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
def known_contractions(embed):
    """List the keys of ``contraction_mapping`` that are present in ``embed``.

    Args:
        embed: container supporting the ``in`` operator (e.g. an embedding dict).

    Returns:
        list of contraction strings, in ``contraction_mapping`` order.
    """
    return [contract for contract in contraction_mapping if contract in embed]

In [None]:
# Show which contractions already have their own GloVe entry.
print("- Known Contractions -")
print("   Glove :")
print(known_contractions(glove_emb))

In [None]:
def clean_contractions(text, mapping):
    """Normalise apostrophe look-alikes and expand contractions.

    Curly quotes, the acute accent and the backtick are first replaced by a
    plain ``'``. The text is then split on single spaces and every piece that
    is an exact key of ``mapping`` is replaced by its value.

    Args:
        text: input string.
        mapping: dict contraction -> expansion (exact, case-sensitive match).

    Returns:
        The rewritten string, re-joined with single spaces.
    """
    for quote_like in ("’", "‘", "´", "`"):
        text = text.replace(quote_like, "'")
    expanded = (mapping.get(piece, piece) for piece in text.split(" "))
    return ' '.join(expanded)

In [None]:
# Expand contractions in the comment text of both splits (overwrites the column in place).
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(
        lambda text: clean_contractions(text, contraction_mapping)
    )

In [None]:
# Rebuild the vocabulary from the contraction-expanded text and re-check GloVe coverage.
vocab = build_vocab(train_df['comment_text'].apply(lambda x: x.split()))
print("Glove : ")
oov_glove = check_coverage(vocab, glove_emb)

In [None]:
# Characters treated as punctuation/symbols by the cleaning helpers.
# The backslash before the multiplication sign is written as an escaped
# backslash: "\×" is not a valid escape sequence and triggers an
# invalid-escape warning, while "\\×" yields the same two characters.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

In [None]:
def unknown_punct(embed, punct):
    """Return the characters of ``punct`` that are missing from ``embed``.

    Args:
        embed: container supporting the ``in`` operator.
        punct: iterable of single characters to look up.

    Returns:
        str with each missing character followed by one space, in the order
        they appear in ``punct`` (empty string when nothing is missing).
    """
    return ''.join(f'{char} ' for char in punct if char not in embed)

In [None]:
# Punctuation/symbol characters that have no GloVe entry.
print("Glove :")
print(unknown_punct(glove_emb, punct))

In [None]:
# Replacement for each special character before tokenisation. An empty
# string deletes the character. The left curly double quote is listed once;
# a repeated key in a dict literal silently overwrites the earlier entry.
punct_mapping = {
    "‘": "'",
    "₹": "e",
    "´": "'",
    "°": "",
    "€": "e",
    "™": "tm",
    "√": " sqrt ",
    "×": "x",
    "²": "2",
    "—": "-",
    "–": "-",
    "’": "'",
    "_": "-",
    "`": "'",
    '“': '"',
    '”': '"',
    "£": "e",
    '∞': 'infinity',
    'θ': 'theta',
    '÷': '/',
    'α': 'alpha',
    '•': '.',
    'à': 'a',
    '−': '-',
    'β': 'beta',
    '∅': '',
    '³': '3',
    'π': 'pi',
}

In [None]:
def clean_special_chars(text, punct, mapping):
    """Normalise special characters and strip symbols and digit-bearing tokens.

    Steps, in order:
      1. replace every key of ``mapping`` by its value;
      2. pad every character of ``punct`` with a space on both sides;
      3. replace every non-word character with a space;
      4. delete every whitespace-delimited token that contains a digit;
      5. collapse runs of spaces and trim.

    Args:
        text: input string.
        punct: iterable of characters to pad with spaces.
        mapping: dict character -> replacement string.

    Returns:
        The cleaned string.
    """
    for p in mapping:
        text = text.replace(p, mapping[p])

    for p in punct:
        text = text.replace(p, f' {p} ')

    # \W matches every non-word character (punctuation and symbols included,
    # not only non-ASCII ones), so all of them become spaces here.
    text = re.sub(r"(\W)", " ", text).strip()
    # Raw string: "\S", "\d" and "\s" are regex classes, not string escapes.
    text = re.sub(r'\S*\d\S*\s*', '', text).strip()  # drop tokens containing a digit
    text = re.sub(' +', ' ', text)

    return text.strip()

In [None]:
# Apply the special-character cleaning to both splits (overwrites the column in place).
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(
        lambda text: clean_special_chars(text, punct, punct_mapping)
    )

In [None]:
# Inspect the training comments after the cleaning steps above.
train_df['comment_text']

In [None]:
# Rebuild the vocabulary from the cleaned text and re-check GloVe coverage.
vocab = build_vocab(train_df['comment_text'].apply(lambda x: x.split()))
print("Glove : ")
oov_glove = check_coverage(vocab, glove_emb)

In [None]:
# Number of distinct out-of-vocabulary words, then the 150 most frequent of them.
print(len(oov_glove))
oov_glove[:150]

## Replace Misspelled Words and Combined Words

In [None]:
# Hand-picked misspelled or run-together strings and their replacements.
# Keys are matched as case-sensitive substrings by clean_wrong_spell_words.
map_wrong_words = {
    'fucksex': 'fuck sex',
    # Run-together "yourself" + "go": split at the word boundary.
    'yourselfgo': 'yourself go',
    'BeCauSe': 'because',
    'DENEID': 'DENIED',
    '\u200e': '',
    'CriminalWar': 'criminal war',
    'PaTHeTiC': 'pathetic',
    'POLITCAL': 'political',
    'talk2me': 'talk to me',
    # Split the compound without changing its first word.
    'shitFuck': 'shit fuck',
    'BabyWhat': 'baby what',
    'Sockpuppetry': 'sock puppetry',
    'Bastered': 'bastard',
    'PHILIPPINESLONG': 'philippines long',
    'SuPeRTR0LL': 'supertroll',
    'FUCKBAGS': 'fuck bags',
    'peNis': 'penis',
    'pensnsnnienSNsn': 'penis',
    'pneis': 'penis',
    'FooL': 'fool',
    'pennnis': 'penis',
    'PenIS': 'penis',
    'itsuck': 'it suck',
    'deletionist': 'delete',
    'ReSPeCT': 'respect'
}

def clean_wrong_spell_words(text, mapping):
    """Replace every occurrence of each key of ``mapping`` by its value.

    Replacements are plain substring replacements, applied one key after
    another in the mapping's iteration order.

    Args:
        text: input string.
        mapping: dict wrong spelling -> replacement.

    Returns:
        The rewritten string.
    """
    for wrong, right in mapping.items():
        text = text.replace(wrong, right)
    return text

# Apply the spelling replacements to both splits (overwrites the column in place).
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(
        lambda text: clean_wrong_spell_words(text, map_wrong_words)
    )

In [None]:
# Rebuild the vocabulary after the spelling replacements and re-check GloVe coverage.
vocab = build_vocab(train_df['comment_text'].apply(lambda x: x.split()))
print("Glove : ")
oov_glove = check_coverage(vocab, glove_emb)

In [None]:
# Drop the embedding table and the coverage by-products, then run a garbage-collection pass.
del glove_emb, oov_glove, vocab_low
gc.collect()

# Base Model Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counts over unigrams, bigrams and trigrams of lower-cased text.
# token_pattern r'\w{1,}' accepts tokens made of one or more word characters.
count_vectorizer = CountVectorizer(ngram_range=(1, 3),
                                   stop_words='english',
                                   strip_accents='unicode',
                                   token_pattern=r'\w{1,}',
                                   lowercase=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [None]:
# Snowball stemmer
import nltk
from nltk.stem.snowball import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')

# Built once: the original rebuilt set(STOPWORDS) for every single word.
_STOPWORD_SET = frozenset(STOPWORDS)


def apply_stemmer(text):
    """Drop stop words and Snowball-stem the remaining words of ``text``.

    Args:
        text: input string; it is split on whitespace.

    Returns:
        str of the stemmed, non-stop-word tokens joined by single spaces.
        The stop-word test is an exact match against ``STOPWORDS`` with no
        case folding.
    """
    stems = [snow_stemmer.stem(word) for word in text.split() if word not in _STOPWORD_SET]
    return ' '.join(stems)

In [None]:
X, y = train_df['comment_text'].apply(apply_stemmer).values, train_df[train_df.columns[2:-2]].values

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['comment_text'].values, train_df[train_df.columns[2:-2]].values, test_size=0.3, random_state=random_seed)

In [None]:
%%time 
X_train = count_vectorizer.fit_transform(X_train)
X_valid = count_vectorizer.transform(X_valid)
X_test = count_vectorizer.transform(test_df['comment_text'])


In [None]:
# logistic regression
log_reg = LogisticRegression(C = 10, penalty='l2', solver = 'liblinear', random_state=random_seed)

one_vs_rest = OneVsRestClassifier(log_reg)
one_vs_rest.fit(X_train, y_train)

y_train_pred_proba = one_vs_rest.predict_proba(X_train)
y_valid_pred_proba = one_vs_rest.predict_proba(X_valid)


roc_auc_score_train = roc_auc_score(y_train, y_train_pred_proba,average='weighted')
roc_auc_score_test = roc_auc_score(y_valid, y_valid_pred_proba,average='weighted')


In [None]:
print("Train ROC AUC Score:", roc_auc_score_train)
# roc_auc_score_test is computed from y_valid (the held-out validation split),
# not from test_df, so it is reported as a validation score.
print("Validation ROC AUC Score:", roc_auc_score_test)


In [None]:
# Predicted per-label probabilities for the test features.
# Despite its name, y_test holds model predictions, not ground-truth labels.
y_test = one_vs_rest.predict_proba(X_test)
y_test

In [None]:
# Submission frame: the test ids followed by one probability column per label.
# Column i of y_test is paired with the i-th name below.
# NOTE(review): assumes this order matches the label column order used for training - confirm.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission_columns = {'id': test_df['id']}
for column_index, label in enumerate(label_columns):
    submission_columns[label] = y_test[:, column_index]

submission_df = pd.DataFrame(submission_columns)
submission_df.head()

In [None]:
# Write the submission to submission.csv in the current directory, without the index column.
submission_df.to_csv('submission.csv', index=False)