## Import packages

In [6]:
import os
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import numpy as np
import pandas as pd
import sklearn
import zipfile

# Libraries and packages for text (pre-)processing 
import string
import re
import nltk

## Load dataset

In [7]:
with zipfile.ZipFile("../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip","r") as zip_ref:
    zip_ref.extractall("./")

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
df.shape

(159571, 8)

In [9]:
df['comment_text'].fillna("unknown", inplace=True)

## Text cleaning

### Lower case

In [10]:
df["text_clean"] = df["comment_text"].apply(lambda x: x.lower())
display(df.head())

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text_clean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"""\nmore\ni can't make any real suggestions on ..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


### Contractions

In [11]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.0.25-py2.py3-none-any.whl (3.2 kB)
Collecting textsearch
  Downloading textsearch-0.0.17-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: textsearch, contractions
Successfully installed contractions-0.0.25 textsearch-0.0.17
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [12]:
import contractions

df["text_clean"] = df["text_clean"].apply(lambda x: contractions.fix(x))

In [13]:
print(df['comment_text'][2])
print(df['text_clean'][2])

Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
hey man, I am really not trying to edit war. it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.


### Remove URL and HTTP tags

In [14]:
def remove_URL(text):
    """
        Remove URLs from a sample string
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)


def remove_html(text):
    """
        Remove the html in sample text
    """
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)

In [15]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_URL(x))
df["text_clean"] = df["text_clean"].apply(lambda x: remove_html(x))

### Remove Non-ASCI

In [16]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', text)

In [17]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_non_ascii(x))

### Remove special characters

In [18]:
def remove_special_characters(text):
    """
        Remove special special characters, including symbols, emojis, and other graphic characters
    """
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [19]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_special_characters(x))

### Remove punctuations

In [20]:
def remove_punct(text):
    """
        Remove the punctuation
    """
#     return re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', "", text)
    return text.translate(str.maketrans('', '', string.punctuation))

In [21]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_punct(x))

### Clean numbers

In [22]:
def clean_numbers(text):
    if bool(re.search(r'\d', text)):
        text = re.sub('[0-9]{5,}', '#####', text)
        text = re.sub('[0-9]{4}', '####', text)
        text = re.sub('[0-9]{3}', '###', text)
        text = re.sub('[0-9]{2}', '##', text)
    return text

In [23]:
df["text_clean"] = df["text_clean"].apply(lambda x: clean_numbers(x))

### Correct misspelling

In [24]:
#from textblob import TextBlob
#df["text_clean"] = df["text_clean"].apply(lambda x: TextBlob(x).correct())

## Preprocessing

### Tokenization

In [25]:
from nltk.tokenize import word_tokenize

df['tokenized'] = df['text_clean'].apply(word_tokenize)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text_clean,tokenized
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,"[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour I am se...,"[daww, he, matches, this, background, colour, ..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man I am really not trying to edit war it ...,"[hey, man, I, am, really, not, trying, to, edi..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,\nmore\ni can not make any real suggestions on...,"[more, i, can, not, make, any, real, suggestio..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"[you, sir, are, my, hero, any, chance, you, re..."


### Remove stopwords

In [26]:
nltk.download("stopwords")
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
df['stopwords_removed'] = df['tokenized'].apply(lambda x: [word for word in x if word not in stop])

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lemmatization
with or without POS tags

In [27]:
from nltk.stem import WordNetLemmatizer

def lemmatize_word(text):
    """
        Lemmatize the tokenized words
    """

    lemmatizer = WordNetLemmatizer()
    lemma = [lemmatizer.lemmatize(word, tag) for word, tag in text]
    return lemma

In [28]:
lemmatizer = WordNetLemmatizer()

df['lemmatize_word'] = df['stopwords_removed'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['lemmatize_word'] = df['lemmatize_word'].apply(lambda x: [word for word in x if word not in stop])
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text_clean,tokenized,stopwords_removed,lemmatize_word
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,"[explanation, why, the, edits, made, under, my...","[explanation, edits, made, username, hardcore,...","[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour I am se...,"[daww, he, matches, this, background, colour, ...","[daww, matches, background, colour, I, seeming...","[daww, match, background, colour, I, seemingly..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man I am really not trying to edit war it ...,"[hey, man, I, am, really, not, trying, to, edi...","[hey, man, I, really, trying, edit, war, guy, ...","[hey, man, I, really, trying, edit, war, guy, ..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,\nmore\ni can not make any real suggestions on...,"[more, i, can, not, make, any, real, suggestio...","[make, real, suggestions, improvement, wondere...","[make, real, suggestion, improvement, wondered..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"[you, sir, are, my, hero, any, chance, you, re...","[sir, hero, chance, remember, page]","[sir, hero, chance, remember, page]"


## Checkpoint
save cleaned dataset

In [93]:
df.to_csv('cleaned.csv')

## Split the dataset

In [29]:
from sklearn.model_selection import train_test_split
train_blob, test_blob = train_test_split(df,test_size=0.2,random_state=42)

In [30]:
print(train_blob.shape)
print(test_blob.shape)

(127656, 12)
(31915, 12)


## Feature extraction
use TF-IDF

In [31]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer

# Since the input already been tokenized, we define an identity tokenizer to be sent to TfidfVectorizer
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer,ngram_range=(1,2),min_df=3, max_df=0.9,
                        stop_words='english', lowercase=False,use_idf=1, smooth_idf=1, sublinear_tf=1)    
train_features = tfidf.fit_transform(train_blob['lemmatize_word'])
test_features = tfidf.transform(test_blob['lemmatize_word'])

In [32]:
label_col = list(df.columns[2:8])
train_labels = train_blob[label_col].values
test_labels = test_blob[label_col].values

# NB-logistic

In [137]:
from sklearn.linear_model import LogisticRegression

def pr(y_i, y):
    p = train_features[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4)
    x_nb = train_features.multiply(r)
    return m.fit(x_nb, y), r

In [142]:
preds = np.zeros((len(test_labels), len(label_col)))

for i, j in enumerate(label_col):
    print('fit', j)
    m,r = get_mdl(train_blob[j])
    preds[:,i] = m.predict(test_features.multiply(r))

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [144]:
from sklearn.metrics import f1_score
f1_score(test_labels, preds, average='micro')

0.7080166707526355

# RandomForest

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

rnd_clf = RandomForestClassifier(n_estimators=5000, max_depth=2, n_jobs =-1,max_samples=0.8,max_features=0.5)

rnd_clf.fit(train_features,train_labels)


RandomForestClassifier(max_depth=2, max_features=0.5, max_samples=0.8,
                       n_estimators=5000, n_jobs=-1)

In [55]:
y_pred = rnd_clf.predict(test_features)
y_pred.sum(0)

array([730,   0, 728,   0, 537,   0])

In [57]:
f1_score(test_labels, y_pred, average='micro')

0.3958540081596648

**Obviously, it's still overfitting and needs hyperparameter tuning**

# NBSVM

In [82]:
from sklearn.svm import SVC, LinearSVC

def pr(y_i, y):
    p = train_features[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl2(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LinearSVC(C=0.15)
    x_nb = train_features.multiply(r)
    return m.fit(x_nb, y), r

In [83]:
preds2 = np.zeros((len(test_labels), len(label_col)))

for i, j in enumerate(label_col):
    print('fit', j)
    m,r = get_mdl2(train_blob[j])
    preds2[:,i] = m.predict(test_features.multiply(r))

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [84]:
f1_score(test_labels, preds2, average='micro')

0.7124519663151011