## Naive Bayes - Logistic Regression (NB-LR)

In [4]:
import pandas as pd
import h5py
import numpy as np
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem.wordnet import WordNetLemmatizer
import data_loader
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Make 'RdYlBu' the default colormap for subsequent plots.
plt.set_cmap('RdYlBu')

<Figure size 432x288 with 0 Axes>

In [5]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.sparse import hstack
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
# Load the training and test frames, then replace every missing value (in any
# column) with the literal string 'unknown'.
train = pd.read_csv("dataset/train.csv")
train = train.fillna('unknown')

test = data_loader.load_test_data('dataset/test.csv', 'dataset/test_labels.csv')
test = test.fillna('unknown')

In [7]:
import re, string
# One capture group matching a single punctuation mark: the ASCII set from
# ``string.punctuation`` plus a handful of typographic symbols.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Return the tokens of *s*, with each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with a space on both
    sides, then the result is split on whitespace.
    """
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

In [8]:
# Token-level replacement table: a lower-cased, whitespace-delimited token
# that equals a key is replaced by the corresponding value.
#
# The original literal listed several keys twice (":/", ":&gt;", ":')", ":-(",
# ":(", ":s", ":-s" and "m"). In a dict literal the last occurrence wins, so
# the earlier values (" bad ", " frown ", ...) were dead. Only the effective
# entries are kept here; the resulting mapping is unchanged.
repl = {
    # Cheering.
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # Negative emoticons ("&gt;" / "&lt;" are presumably HTML-escaped
    # '>' / '<' as they appear in the raw data - TODO confirm).
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # Positive emoticons.
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    # NOTE(review): these keys are written as regex patterns, but the table is
    # consulted by exact token equality, so they only match a token that is
    # literally e.g. ``\bu\b``. Kept so the mapping stays identical.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # Chat abbreviations and contractions.
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

In [9]:
def _clean_comment(text):
    """Normalise one raw comment into a space-separated token string.

    The comment is split on whitespace and each token is lower-cased.
    Tokens starting with 'http' or 'www' are dropped; tokens that are keys
    of ``repl`` are replaced by the mapped value. Every kept token is
    followed by a single space (including the last one), so the result of
    an empty comment is the empty string.
    """
    kept = []
    for token in str(text).split():
        token = token.lower()
        # Links carry no useful signal for the classifier; skip them.
        if token.startswith(('http', 'www')):
            continue
        kept.append(repl.get(token, token))
    # Build the string in one pass instead of repeated ``+=`` concatenation,
    # while preserving the trailing space after each token.
    return "".join(token + " " for token in kept)


list_train = train['comment_text'].tolist()
list_test = test['comment_text'].tolist()

# The same normalisation is applied to both splits.
new_train_data = [_clean_comment(comment) for comment in list_train]
new_test_data = [_clean_comment(comment) for comment in list_test]

train["clean_comment_text"] = new_train_data
test["clean_comment_text"] = new_test_data

In [10]:
# Delete every run of characters other than ASCII letters, spaces, '?' and
# '!' from the lower-cased, cleaned comments.
pattern = re.compile(r'[^a-zA-Z ?!]+')
train_text = [
    pattern.sub('', comment.lower())
    for comment in train["clean_comment_text"].tolist()
]
test_text = [
    pattern.sub('', comment.lower())
    for comment in test["clean_comment_text"].tolist()
]

In [11]:
# Overwrite the raw comments with their cleaned form, then release the
# temporary lists and the intermediate column on both frames.
train['comment_text'] = train_text
test["comment_text"] = test_text
del train_text, test_text
for frame in (train, test):
    frame.drop(columns=['clean_comment_text'], inplace=True)

In [12]:
# NOTE(review): the vectorizers are fitted on train AND test text together, so
# the vocabulary and IDF weights are influenced by the test set.
all_text = pd.concat([train['comment_text'], test['comment_text']])

# Word-level TF-IDF over 1- to 3-grams, using the custom punctuation-aware
# tokenizer. The boolean options are passed as real booleans (the original
# passed the int 1, which recent scikit-learn releases no longer accept for
# boolean parameters).
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    stop_words='english',
    analyzer='word',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Character-level TF-IDF over 1- to 4-grams, capped at 50000 features.
# ``stop_words`` is only used when analyzer == 'word', so the ignored
# argument (which just triggered a warning) is no longer passed.
char_vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    analyzer='char',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=50000,
)

# Concatenate word and char features side by side.
vectorizer = make_union(word_vectorizer, char_vectorizer)

vectorizer.fit(all_text)

train_matrix = vectorizer.transform(train['comment_text'])
test_matrix = vectorizer.transform(test['comment_text'])

In [13]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
# Label columns of the training frame; these names are also used as the
# prediction columns of the submission file.
class_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [14]:
# Mean cross-validation accuracies, one entry per ``cross_validation`` call.
val_score = []


def cross_validation(model, y_train):
    """Record the mean 5-fold accuracy of *model* on ``train_matrix``.

    The score is appended to the module-level ``val_score`` list; nothing
    is returned.
    """
    fold_accuracies = cross_val_score(
        model, train_matrix, y_train, scoring='accuracy', cv=5
    )
    val_score.append(fold_accuracies.mean())

In [15]:
def pr(y_i, y):
    """Return the add-one smoothed per-feature sum for rows labelled *y_i*.

    Rows of the module-level ``train_matrix`` whose entry in *y* equals
    *y_i* are summed column-wise; one is added to every column sum and the
    result is divided by (number of selected rows + 1).
    """
    selected = y == y_i
    feature_totals = train_matrix[selected].sum(0)
    return (feature_totals + 1) / (selected.sum() + 1)

In [16]:
def get_mdl(y):
    """Fit a Naive-Bayes-weighted logistic regression for one label column.

    Parameters
    ----------
    y : pandas Series
        Binary labels aligned with the rows of ``train_matrix``.

    Returns
    -------
    tuple
        ``(model, r)`` where ``r`` is the log ratio of ``pr(1, y)`` to
        ``pr(0, y)`` and ``model`` is the ``LogisticRegression`` fitted on
        ``train_matrix`` scaled feature-wise by ``r``. The same ``r`` must
        be applied to any matrix passed to the model at prediction time.
    """
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver. Name
    # it explicitly: since scikit-learn 0.22 the default solver is lbfgs,
    # which raises on dual=True. On older releases liblinear was already the
    # default, so results there are unchanged.
    m = LogisticRegression(C=3, dual=True, solver='liblinear')
    x_nb = train_matrix.multiply(r)
    return m.fit(x_nb, y), r

In [17]:
# One column of positive-class probabilities per label, in ``class_names``
# order.
preds = np.zeros((len(test), len(class_names)))

for col, label in enumerate(class_names):
    model, log_ratio = get_mdl(train[label])
    # Scale the test features with the same ratio the model was trained on.
    scaled_test = test_matrix.multiply(log_ratio)
    preds[:, col] = model.predict_proba(scaled_test)[:, 1]

np.save("nblr-svm_test_predict.npy", preds)

In [20]:
# Submission: pair the ids from the sample submission file with the saved
# per-label probabilities and write the result as CSV.
# NOTE(review): the two frames are joined on row position - confirm that the
# sample submission and the saved predictions have the same number of rows.
subm = pd.read_csv('dataset/sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})

predictions = np.load('nblr-svm_test_predict.npy')
probabilities = pd.DataFrame(predictions, columns=class_names)

submission = pd.concat([submid, probabilities], axis=1)
submission.to_csv('nblr-svm_submission.csv', index=False)