# LinearSVC and NaiveBayes for Offensive Text Identification

## Content

- 1 Read in and Preprocess Data
    - 1.1 Read in Data
    - 1.2 Tokenize and Normalize Data
    - 1.3 Split Training and Tests Sets
- 2 LinearSVC
    - 2.1 With CountVectorizer only
    - 2.2 With CountVectorizer and TfidfTransformer
- 3 Complement Naive Bayes
    - 3.1 With CountVectorizer only
    

In [168]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
import nltk
import emoji
import re
import random
import spacy

## 1 Read in and Preprocess Data

### 1.1 Read in Data

In [169]:
# read in corpus

import csv
filename = "../../Danish/Danish/offenseval-da-training-v1.tsv"

corpus_raw, labels = [], []

with open(filename, encoding="utf-8") as tsv_file:
    rd = csv.reader(tsv_file, delimiter="\t", quotechar='"')
    for row in rd:
        if len(row) != 3:
            continue
        if row[2] == "subtask_a":
            continue
        corpus_raw.append(row[1])
        labels.append(row[2])
        #print(row)
        
print(len(corpus_raw)==len(labels))
print(len(corpus_raw))
print(type(corpus_raw[1]))

True
2960
<class 'str'>


### 1.2 Tokenize and Normalize Data

In [170]:
import spacy
import emoji
import re

In [171]:
# preprocess
nlp = spacy.load("da_core_news_sm")

corpus = []
for sample in corpus_raw:
    doc = nlp(sample)
    #tokens = [token.text for token in doc]
    tokens = []
    for token in doc:
        tok = token.text
        tok = tok.lower()
        tok = emoji.demojize(tok)
        tok = re.sub(r"(.)\1+", r"\1\1", tok)
        tokens.append(tok)
    tokens = " ".join(tokens)
    corpus.append(tokens)

print(corpus)



In [172]:
le = preprocessing.LabelEncoder()
le.fit(["NOT", "OFF"])
labels = le.transform(labels)
labels

array([0, 0, 1, ..., 0, 0, 1])

### 1.3 Split into Training and Test Sets

In [173]:
# split into test and training sets
train_texts, test_texts, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.1, random_state=7)

## 2 LinearSVC

### 2.1 With CountVectorizer only

In [174]:
# fit and transform Texts with CountVectorizer
# customize token_pattern parameter to preserve punctuation when tokenizing:
pattern = r'(?u)\s?(\S+)\s?'
vectorizer = CountVectorizer(token_pattern=pattern)

vectorizer.fit(corpus) #

# transform training and test set separately
trainX = vectorizer.transform(train_texts)
testX = vectorizer.transform(test_texts)

train_and_test = vectorizer.transform(corpus) #+test_texts
print("shape trainX: ", trainX.shape, "\n")
print("shape testX: ", testX.shape)

shape trainX:  (2664, 10158) 

shape testX:  (296, 10158)


#### Training

In [175]:
# train with sklearn's linearSVC

linear_svc = LinearSVC()

linear_svc.fit(trainX, train_labels)

score = linear_svc.score(trainX, train_labels)
print("Score: ", score)

Score:  0.9996246246246246


#### Evaluation

In [176]:
# predict the test data

predictions = linear_svc.predict(testX)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0])

In [177]:
report = classification_report(test_labels, predictions)
accuracy = accuracy_score(test_labels, predictions)
print(report)
print(f"More accurate accuracy: {accuracy}")

              precision    recall  f1-score   support

           0       0.93      0.97      0.94       258
           1       0.67      0.47      0.55        38

    accuracy                           0.90       296
   macro avg       0.80      0.72      0.75       296
weighted avg       0.89      0.90      0.89       296

More accurate accuracy: 0.902027027027027


### 2.2 With CountVectorizer and TfidfTransformer

In [178]:
# fit and transform with CountVectorizer
# customize token_pattern parameter to preserve punctuation when tokenizing:
pattern = r'(?u)\s?(\S+)\s?'
vectorizer = CountVectorizer(token_pattern=pattern)

vectorizer.fit(corpus) #+test_texts

# transform training and test set separately
trainX = vectorizer.transform(train_texts)
testX = vectorizer.transform(test_texts)

train_and_test = vectorizer.transform(corpus) #+test_texts
print("shape trainX: ", trainX.shape, "\n")
print("shape testX: ", testX.shape)

shape trainX:  (2664, 10158) 

shape testX:  (296, 10158)


In [179]:
# same for tfidf fit and transformation
transformer = TfidfTransformer()

transformer.fit(train_and_test)

trainX = transformer.transform(trainX)
testX = transformer.transform(testX)

#### Training

In [180]:
# train with sklearn's linearSVC

linear_svc = LinearSVC()

linear_svc.fit(trainX, train_labels)

score = linear_svc.score(trainX, train_labels)
print("Score: ", score)

Score:  0.9973723723723724


#### Evaluation

In [181]:
# predict the test data

predictions = linear_svc.predict(testX)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0])

In [182]:
report = classification_report(test_labels, predictions)
accuracy = accuracy_score(test_labels, predictions)
print(report)
print(f"More accurate accuracy: {accuracy}")

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       258
           1       0.81      0.34      0.48        38

    accuracy                           0.91       296
   macro avg       0.86      0.67      0.71       296
weighted avg       0.90      0.91      0.89       296

More accurate accuracy: 0.9054054054054054


## 3 Complement Naive Bayes

### 3.1 With CountVectorizer only

In [183]:
# fit and transform Texts with CountVectorizer
pattern = r'(?u)\s?(\S+)\s?'
vectorizer = CountVectorizer(token_pattern=pattern)

vectorizer.fit(corpus)

# transform training and test set separately
trainX = vectorizer.transform(train_texts)
testX = vectorizer.transform(test_texts)

train_and_test = vectorizer.transform(corpus)
print("shape trainX: ", trainX.shape, "\n")
print("shape testX: ", testX.shape)

shape trainX:  (2664, 10158) 

shape testX:  (296, 10158)


#### Training

In [184]:
from sklearn.naive_bayes import ComplementNB

In [185]:
cnb = ComplementNB()

dense_trainX = trainX.toarray()

cnb.fit(dense_trainX, train_labels)

score = cnb.score(dense_trainX, train_labels)
print("Score: ", score)

Score:  0.9215465465465466


#### Evaluation

In [186]:
cnb_preds = cnb.predict(testX.toarray())
cnb_preds

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0])

In [187]:
cnb_report = classification_report(test_labels, cnb_preds)
print(cnb_report)

              precision    recall  f1-score   support

           0       0.90      0.82      0.86       258
           1       0.23      0.37      0.28        38

    accuracy                           0.76       296
   macro avg       0.56      0.59      0.57       296
weighted avg       0.81      0.76      0.78       296

