# Setup

In [0]:
# !unzip data.zip


In [0]:
import pandas as pd

In [0]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/impermium_verification_labels.csv")

## Cleanup

In [0]:

def cleanup(series):
    series = series.apply(lambda x: bytes(x, 'ascii').decode('unicode-escape'))
    # remove utf-8 control charecters
    series = series.str.replace(r"\\x\w\w", " ")
    # remove urls
    series = series.str.replace(r"https?://.*([\s]|$)", " ")
    # convert didn't to did not
    series = series.str.replace(r"(\w+)n't", r"\1 not")
    # remove weird charecters
    series = series.str.replace(r"(\\n|\\r)", " ")
    series = series.str.replace(r"[\./@\"'\\!@#$%^&*()\-_=+{\[\]}?/>,<;:|`~]", " ")
    # remove digits
    series = series.str.replace(r"\d", "")
    # remove single letters
    # series = series.str.replace(r"(\s|^)\d(\s|$)", " ")
    # trim spaces
    series = series.str.replace("\s+", " ")
    # convert to lowercase
    series = series.str.lower()

    return series

## Lemmatization and Stop Word removal

In [0]:
import nltk
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')


w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords_set = {x for x in stopwords.words()}

def lemmatize_text(text):
    return " ".join([
        lemmatizer.lemmatize(w)
        for w in w_tokenizer.tokenize(text) 
        if w not in stopwords_set
    ])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Parts of Speech


In [0]:
nltk.download('averaged_perceptron_tagger')

tag_list = []

def parts_of_speech(series, fit=True):
    global tag_list
    def count_pos(text):
        tokens = w_tokenizer.tokenize(text)
        tags = nltk.pos_tag(tokens)
        result = {}
        for _, tag in tags:
            if tag not in result:
                result[tag]  = 0
            result[tag] += 1
        for tag in result:
            result[tag] /= len(tokens)
        return result
    items = []
    for _, value in series.iteritems():
        items.append(count_pos(value))
    
    if fit:
        df = pd.DataFrame(items).fillna(0)
        tag_list = df.columns
        return df
    else:
        return pd.DataFrame(items, columns=tag_list).fillna(0)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Classifier Evaluation

User accuracy, recall and f1 score to measure accuracy of the model. 

In [0]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

def evaluate(classifier, y_true, y_pred):
    y_acc_score = accuracy_score(df_test["Insult"], y_pred)
    y_recall_score = recall_score(df_test["Insult"], y_pred)
    y_f1_score = f1_score(df_test["Insult"], y_pred)

    print(f"Accuracy : {y_acc_score:.4}")
    print(f"Recall : {y_recall_score:.4}")
    print(f"F1 Score : {y_f1_score:.4}")

# Naive Bayes

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB


def bayes_simple():
    train_comments = cleanup(df_train["Comment"])
    test_comments= cleanup(df_test["Comment"])

    vectorizer = CountVectorizer(
        lowercase=True,
        strip_accents='unicode',
        analyzer="word",
        min_df=0.02, max_df=0.7)
    train_bow = vectorizer.fit_transform(train_comments).todense()
    test_bow = vectorizer.transform(test_comments).todense()

    classifier = GaussianNB(var_smoothing=0.01, priors=[0.8, 0.2])
    classifier.fit(train_bow, df_train["Insult"])
    
    y_pred = classifier.predict(test_bow)
    evaluate(classifier, df_test["Insult"], y_pred)

bayes_simple()

Accuracy : 0.4779
Recall : 0.9424
F1 Score : 0.635


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def bayes_lemmatize():
    train_comments = cleanup(df_train["Comment"]).apply(lemmatize_text)
    test_comments= cleanup(df_test["Comment"]).apply(lemmatize_text)

    vectorizer = CountVectorizer(
        analyzer="word",
        ngram_range=(1, 1)) # replace with (2,2) for bigrams, however the score is worse

    train_bow = vectorizer.fit_transform(train_comments).todense()
    test_bow = vectorizer.transform(test_comments).todense()

    classifier = MultinomialNB(alpha=1)
    classifier.fit(train_bow, df_train["Insult"])
    
    y_pred = classifier.predict(test_bow)
    evaluate(classifier, df_test["Insult"], y_pred)
bayes_lemmatize()

Accuracy : 0.6814
Recall : 0.4782
F1 Score : 0.5913


# Feature Extraction

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

def svm_data():
    train_comments = cleanup(df_train["Comment"]).apply(lemmatize_text)
    test_comments= cleanup(df_test["Comment"]).apply(lemmatize_text)

    insult_vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1,2),
        max_df=0.7,
        min_df=0.002)
    
    df_train_insult = df_train[df_train["Insult"] == 1]
    train_insult_comments = cleanup(df_train_insult["Comment"]).apply(lemmatize_text)
    insult_vectorizer.fit(train_insult_comments)
    

    neutral_vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1,2),
        max_df=0.7,
        min_df=0.002)
    
    df_train_neutral = df_train[df_train["Insult"] == 0]
    train_neutral_comments = cleanup(df_train_neutral["Comment"]).apply(lemmatize_text)

    neutral_vectorizer.fit(train_neutral_comments)
    
    vectorizer = TfidfVectorizer(
        analyzer="word",
        max_df=0.2,
        min_df=0.0008)
    
    train_bow = vectorizer.fit_transform(train_comments)
    train_neubow = neutral_vectorizer.transform(train_comments)
    train_inbow = insult_vectorizer.transform(train_comments)
    train_pos = parts_of_speech(train_comments).values

    test_bow = vectorizer.transform(test_comments)
    test_neubow = neutral_vectorizer.transform(test_comments)
    test_inbow = insult_vectorizer.transform(test_comments)
    test_pos = parts_of_speech(test_comments, fit=False)

    train_data = sparse.hstack([train_neubow, train_inbow, train_pos]).todense()
    test_data = sparse.hstack([test_neubow, test_inbow, test_pos]).todense()
    
    print(train_bow.shape[-1], train_pos.shape[-1], train_neubow.shape[-1], train_inbow.shape[-1], train_data.shape[-1])

    return train_data, test_data
train_data, test_data = svm_data()

2601 32 1522 1058 2612


# SVM

In [0]:
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem

def run_svm():
    classifier = SVC()
    classifier.fit(train_data, df_train["Insult"])

    y_pred = classifier.predict(test_data)
    evaluate(classifier, df_test["Insult"], y_pred)

run_svm()    

Accuracy : 0.6595
Recall : 0.3445
F1 Score : 0.4937


# Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

def run_random_forest():
    classifier = RandomForestClassifier(max_depth=150)
    classifier.fit(train_data, df_train["Insult"])

    y_pred = classifier.predict(test_data)
    evaluate(classifier, df_test["Insult"], y_pred)

run_random_forest()    

Accuracy : 0.6837
Recall : 0.4178
F1 Score : 0.56


# Optimization

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.svm import OneClassSVM, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier


def run_opt():
    # a_classifier = BernoulliNB()
    # a_classifier.fit(train_data, df_train["Insult"])

    # b_classifier = RandomForestClassifier()
    # b_classifier.fit(train_data, df_train["Insult"])

    # classifier = VotingClassifier(estimators=[('BernouliNB', a_classifier), ('RF', b_classifier)],
    #                         voting='soft',
    #                         weights=[4, 1])

    classifier = BernoulliNB(alpha=0.8)
    classifier.fit(train_data, df_train["Insult"])

    y_pred = classifier.predict(test_data)
    evaluate(classifier, df_test["Insult"], y_pred)



run_opt()

Accuracy : 0.7123
Recall : 0.6555
F1 Score : 0.6871
