In [91]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
import emoji
import re
import random

# Reading and Preprocessing the Data

In [116]:
data = pd.read_csv("../../Danish/Danish/offenseval-da-training-v1.tsv", sep='\t')[:-1]

texts = data["tweet"].apply(str)
labels = data["subtask_a"].apply(str)
#texts

In [117]:
#tokenization and replacing emojis
texts_tokenized = []
for el in texts:
    texts_tokenized.append([emoji.demojize(el).lower() for el in el.split()]) #el.split()nltk.word_tokenize(el)
    
#texts_tokenized[2883]

In [102]:
#normalization
texts_normalized = []
for sent in texts_tokenized:
    # remove emojis
    #sent = [el for el in sent if not bool(re.search(r":.*:", el))] #or replace with just name? 
    
    # remove repeating characters
    sent = [re.sub(r"(.)\1{2,}", r"\1", el) for el in sent]
    
    # remove punctuation
    #sent = [re.sub(r"[!?\(\)\\&.,:><_#\[\]/]+", "", el)for el in sent]
    
    texts_normalized.append([el for el in sent if el != ""])
        
print(texts_normalized[2883])

['@user', 'næste', 'gang', 'pastaen', 'er', 'brændt', 'på', '!', ':face_with_tears_of_joy:', 'det', 'jo', 'sygt.']


In [103]:
# add the preprocessed texts back into the dataframe
texts_normalized2 = [" ".join(el) for el in texts_normalized]
data["tweet"] = texts_normalized2
data

Unnamed: 0,id,tweet,subtask_a
0,3131,"jeg tror det vil være dejlig køligt, men jeg v...",NOT
1,711,så kommer de nok til at investere i en ny cyke...,NOT
2,2500,nu er det jo også de ikea-aber der har lavet s...,OFF
3,2678,"128 varme emails, er vi enige om at det er sex...",NOT
4,784,"desværre tyder det på, at amerikanerne er helt...",NOT
...,...,...,...
2955,170,har sgu lidt en anelse om. det her kunne måske...,NOT
2956,1226,"ind og ruske tremmer med hende,den syge kælling!!",OFF
2957,2596,fedtmule,NOT
2958,1802,##har i hørt det?,NOT


In [104]:
# extract the tweets and labels
texts = data["tweet"].apply(str)
texts, labels

(0       jeg tror det vil være dejlig køligt, men jeg v...
 1       så kommer de nok til at investere i en ny cyke...
 2       nu er det jo også de ikea-aber der har lavet s...
 3       128 varme emails, er vi enige om at det er sex...
 4       desværre tyder det på, at amerikanerne er helt...
                               ...                        
 2955    har sgu lidt en anelse om. det her kunne måske...
 2956    ind og ruske tremmer med hende,den syge kælling!!
 2957                                             fedtmule
 2958                                    ##har i hørt det?
 2959    kommer det bag på nogen? det er jo fucking var...
 Name: tweet, Length: 2960, dtype: object,
 0       NOT
 1       NOT
 2       OFF
 3       NOT
 4       NOT
        ... 
 2955    NOT
 2956    OFF
 2957    NOT
 2958    NOT
 2959    OFF
 Name: subtask_a, Length: 2960, dtype: object)

## Split data in test and training sets

In [105]:
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.1, random_state=123)
texts_train, texts_test, labels_train, labels_test

(711                                              not bad!
 1412    ragnerok nærmer sig? jeg har svært ved at se f...
 1779                  *långyxa* på riktigt välfärdsspråk.
 773     hrmph. lånte fjer! er intet for helligt for ka...
 1286    det er sikkert svensken der er skyld i der ald...
                               ...                        
 1147    det er jo ikke sjovt! nogen ting spøger man ba...
 2154                                  det har du os rat i
 1766                          grønt er altså for bornholm
 1122    de skal bare kvæles langsomt så de ligger stil...
 1346    syntes man begynder at høre mere om partering ...
 Name: tweet, Length: 2664, dtype: object,
 418     ikke noget andre kvinder ikke har men ser bedr...
 1224    foretrækker dog isterningebakken, den kan brug...
 1617    troede at vores flag altid brandte?.. det gør ...
 1745                                                mere?
 2139                      også kendt, som opium med brus.
             

In [106]:
data['subtask_a'].value_counts()

NOT    2576
OFF     384
Name: subtask_a, dtype: int64

In [107]:
le =LabelEncoder()
le.fit(labels_train)
print(le.classes_)
train_labels=le.transform(labels_train)
#print(train_labels)
test_labels=le.transform(labels_test)
#print(test_labels)

['NOT' 'OFF']


In [108]:
vectorizer = TfidfVectorizer() #CountVectorizer()
vectorizer.fit_transform(texts)
#print(texts_train)
train_texts = vectorizer.transform(texts_train)
test_texts = vectorizer.transform(texts_test)
#print(train_texts.toarray())
#print(test_texts.toarray())

In [109]:
input_dim = len(train_texts.toarray()[0])
input_dim

9738

# Simple FFN with a linear layer and the sigmoid activation function

In [110]:
class ClassificationNet(nn.Module):

    def __init__(self):
        super(ClassificationNet, self).__init__()
        '''
        Defining layers of neural network
        '''
        self.fc1 = nn.Linear(input_dim, 100)
        self.fc2 = nn.Linear(100, 2)

    def forward(self, x):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (data_points, num_features)
        Returns:
            the resulting tensor.
        """
        x = self.fc1(x)
        x = torch.sigmoid(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        
        return x

In [111]:
net = ClassificationNet()

# define learning rate

n = 0.05

# define an optimizer

optimizer = optim.SGD(net.parameters(), lr=n)

# define loss function
#loss_func = nn.BCELoss() #nn.CrossEntropyLoss() 
loss_func = nn.CrossEntropyLoss()

# define number of epochs
epochs= 500 

#converting train and test set arrays to tensor
train_texts_tensor=torch.tensor(train_texts.toarray()).float()
print(len(train_texts_tensor))
train_labels_tensor=torch.tensor(train_labels)
test_texts_tensor=torch.tensor(test_texts.toarray()).float()
test_labels_tensor=torch.tensor(test_labels)
#print(train_texts_tensor, test_labels_tensor)

2664


In [112]:
def evaluation_metrics(pred_label,true_label):
    '''
    Returns accuracy and f1 score metrics for evaluation
    '''
    accuracy=accuracy_score(true_label,pred_label)
    f1score=f1_score(true_label,pred_label,average='macro')
    precision= precision_score(true_label, pred_label,average='macro')
    recall=recall_score(true_label, pred_label,average='macro')
    
    return f"Acc: {accuracy}, Prec: {precision}, Rec: {recall}, F1: {f1score}"

In [113]:
def step(x):
    if x < 0.5:
        return 0
    else:
        return 1

## Training Loop

In [114]:
for i in range(epochs):
    
    optimizer.zero_grad()
    output = net(train_texts_tensor)
    #print(output, train_labels_tensor.unsqueeze(1))
    #print(output.shape, train_labels_tensor.shape)
    loss = loss_func(output, train_labels_tensor.long()) #float().unsqueeze(1)
    #loss = loss_func(output, train_labels_tensor.float().unsqueeze(1))
    #print(loss)
    loss.backward()
    
    optimizer.step()
    
    
    with torch.no_grad():
            
        output = net(test_texts_tensor)
            
        loss_val = loss_func(output, test_labels_tensor.long()) #)float()).unsqueeze(1)
        #loss_val = loss_func(output,test_labels_tensor.float().unsqueeze(1))
        
        predict_label= output.data.max(1, keepdim=True)[1]
        #predict_label=[step(el) for el in output]
        #print(predict_label)
        
        #accuracy,f1score=evaluation_metrics(predict_label,test_labels_tensor.long())
        eval_metrics=evaluation_metrics(predict_label,test_labels_tensor.long())
        
    if i%10 == 0:
        print(i, eval_metrics)
print(i, eval_metrics)

  _warn_prf(average, modifier, msg_start, len(result))


0 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
10 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
20 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
30 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
40 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
50 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
60 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
70 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
80 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
90 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
100 Acc: 0.9054054054054054, Prec: 0.4527027027027027, Rec: 0.5, F1: 0.475177304964539
110 Acc: 0.9054054054054054, Prec: 0.4527027027027027,

KeyboardInterrupt: 

In [115]:
# count how many tweets get labeled correctly, how many are labeled wrongly, the amount of off tweets in the test set
# and the amount of tweets in total in the test set
output = net(test_texts_tensor)
predict_label= output.data.max(1, keepdim=True)[1]
#predict_label=[step(el) for el in output]
#print(output)
tp = 0
all_off = 0
all = 0
f = 0
for el, e in zip(predict_label, test_labels):
    #print(el.item(), e)
    if e == 1:
        all_off += 1
    if el.item() == e:
        tp += 1
    else:
        f += 1
    all += 1
print(tp, f, all_off, all)

268 28 28 296
