In [37]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC

In [39]:
# read in corpus

import csv
filename = "../../Danish/Danish/offenseval-tr-training-v1.tsv"

corpus_raw, labels = [], []

with open(filename, encoding="utf-8") as tsv_file:
    rd = csv.reader(tsv_file, delimiter="\t", quotechar='"')
    for row in rd:
        if len(row) != 3:
            continue
        if row[2] == "subtask_a":
            continue
        corpus_raw.append(row[1])
        labels.append(row[2])
        #print(row)
        
print(len(corpus_raw)==len(labels))
print(type(corpus_raw[1]))

True
<class 'str'>


In [3]:
import spacy
import emoji
import re

In [40]:
# preprocess
nlp = spacy.load("da_core_news_sm")

corpus = []
for sample in corpus_raw:
    doc = nlp(sample)
    #tokens = [token.text for token in doc]
    tokens = []
    for token in doc:
        tok = token.text
        tok = tok.lower()
        tok = emoji.demojize(tok)
        tok = re.sub(r"(.)\1+", r"\1\1", tok)
        tokens.append(tok)
    tokens = " ".join(tokens)
    corpus.append(tokens)

print(corpus)

KeyboardInterrupt: 

In [19]:
le = preprocessing.LabelEncoder()
le.fit(["NOT", "OFF"])
labels = le.transform(labels)
labels

array([0, 0, 1, ..., 0, 0, 1])

In [20]:
# split into test and training sets
train_texts, test_texts, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.1, random_state=7)

In [15]:
# to test the regex pattern that should be passed to CountVectorizer so that it doesn't change the tokenization done before
test_lst = ['jeg', 'har', 'gået', 'et', 'stykke', 'af', 'kungsleden', 'i', 'sverige', '.', 'det', 'kan', 'varmt', 'anbefales', '.', 'det', 'er', 'billigt', 'hvis', 'du', 'selv', 'slæber', 'mad', 'med', '.', ':smiling_face_with_smiling_eyes:']
test_str = " ".join(test_lst)
pattern = r'(?u)\s?(\S+)\s?'
tokens = re.findall(pattern, test_str)
#for tok1, tok2 in zip(test_lst, tokens):
#    print(f"{tok1}\t\t{tok2}")
test_lst == tokens


True

In [21]:
# fit CountVectorizer on train_corpus + test_corpus to have complete vocabulary and avoid out of vocabulary items (???)
# --> not sure this makes sense, this idea is from Manfred

# TODO: customize token_pattern parameter to preserve punctuation when tokenizing
vectorizer = CountVectorizer(token_pattern=pattern)

vectorizer.fit(train_texts+test_texts)

# transform training and test set separately
trainX = vectorizer.transform(train_texts)
testX = vectorizer.transform(test_texts)

train_and_test = vectorizer.transform(train_texts+test_texts)
print("shape trainX: ", trainX.shape, "\n")
print("shape testX: ", testX.shape)
#added = trainX + testX
#print("shape added: ", added)

shape trainX:  (2664, 10158) 

shape testX:  (296, 10158)


In [22]:
# same for tfidf fit and transformation
transformer = TfidfTransformer()

transformer.fit(train_and_test)

trainX = transformer.transform(trainX)
testX = transformer.transform(testX)

In [23]:
# train with sklearn's linearSVC

linear_svc = LinearSVC()

linear_svc.fit(trainX, train_labels)

score = linear_svc.score(trainX, train_labels)
print("Score: ", score)

Score:  0.9973723723723724


In [24]:
# predict the test data

predictions = linear_svc.predict(testX)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0])

In [25]:
report = classification_report(test_labels, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       258
           1       0.81      0.34      0.48        38

    accuracy                           0.91       296
   macro avg       0.86      0.67      0.71       296
weighted avg       0.90      0.91      0.89       296



## Naive Bayes

In [26]:
# train with sklearn's Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB


In [27]:
gnb = GaussianNB()

dense_trainX = trainX.toarray()
#dense_labels = train_labels.toarray()

gnb.fit(dense_trainX, train_labels)

score = gnb.score(dense_trainX, train_labels)
print("Score: ", score)

Score:  0.9159159159159159


In [28]:
gnb_preds = gnb.predict(testX.toarray())
gnb_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0])

In [29]:
gnb_report = classification_report(test_labels, gnb_preds)
print(gnb_report)

              precision    recall  f1-score   support

           0       0.88      0.67      0.76       258
           1       0.14      0.37      0.20        38

    accuracy                           0.63       296
   macro avg       0.51      0.52      0.48       296
weighted avg       0.78      0.63      0.69       296



## KNeighbors

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
knn = KNeighborsClassifier(50, weights="distance")

knn.fit(trainX, train_labels)

score = knn.score(trainX, train_labels)
print("Score: ", score)

Score:  0.9996246246246246


In [32]:
knn_preds = knn.predict(testX)
knn_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [33]:
knn_report = classification_report(test_labels, knn_preds)
print(knn_report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       258
           1       0.00      0.00      0.00        38

    accuracy                           0.87       296
   macro avg       0.44      0.50      0.47       296
weighted avg       0.76      0.87      0.81       296



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVC

In [34]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [35]:
svc = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc.fit(trainX.toarray(), train_labels)

score = svc.score(trainX.toarray(), train_labels)
print("Score: ", score)

Score:  0.9204204204204204


In [36]:
svc_preds = svc.predict(testX.toarray())
svc_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [63]:
svc_report = classification_report(test_labels, svc_preds)
print(svc_report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       258
           1       0.00      0.00      0.00        38

    accuracy                           0.87       296
   macro avg       0.44      0.50      0.47       296
weighted avg       0.76      0.87      0.81       296



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
vectorizer = CountVectorizer()


X = vectorizer.fit_transform(corpus)
print(X)

# alphabetically ordered
#vectorizer.get_feature_names() == (
#    ['darauf','dass','die','gefällt','jeder','kiefer','mein','mir','nicht','pocht','wird'])

print(vectorizer.get_feature_names())
print(X.shape)

# the vectors

X.toarray()

  (0, 8708)	1
  (1, 3949)	2
  (1, 8629)	1
  (1, 1498)	2
  (1, 9318)	2
  (1, 9472)	1
  (1, 1435)	1
  (1, 4607)	1
  (1, 5223)	1
  (1, 3196)	1
  (1, 5197)	2
  (1, 8174)	1
  (1, 8225)	1
  (1, 6266)	1
  (1, 824)	1
  (1, 7470)	1
  (1, 4975)	1
  (1, 7667)	1
  (1, 1465)	1
  (2, 8219)	2
  (2, 4319)	1
  (2, 1421)	1
  (2, 5707)	1
  (2, 8430)	1
  (2, 508)	1
  :	:
  (2957, 5169)	1
  (2957, 1456)	1
  (2957, 3722)	1
  (2957, 3255)	1
  (2957, 8187)	1
  (2957, 4567)	1
  (2957, 8596)	1
  (2957, 6935)	1
  (2958, 2095)	1
  (2959, 1498)	1
  (2959, 3182)	1
  (2959, 3639)	1
  (2960, 1498)	3
  (2960, 4319)	1
  (2960, 6579)	1
  (2960, 1912)	2
  (2960, 3959)	2
  (2960, 5169)	1
  (2960, 5701)	1
  (2960, 2644)	1
  (2960, 545)	1
  (2960, 9171)	1
  (2960, 6992)	1
  (2960, 5570)	1
  (2960, 9638)	1
(2961, 9735)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])