In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
import emoji
import re

In [3]:
data = pd.read_csv("Danish/Danish/offenseval-da-training-v1.tsv", sep='\t')[:-1]

texts = data["tweet"].apply(str)
labels = data["subtask_a"].apply(str)
#print(data.loc[2883])
texts

0       Jeg tror det vil være dejlig køligt, men jeg v...
1       Så kommer de nok til at investere i en ny cyke...
2       Nu er det jo også de Ikea-aber der har lavet s...
3       128 Varme emails, er vi enige om at det er sex...
4       Desværre tyder det på, at amerikanerne er helt...
                              ...                        
2955    Har sgu lidt en anelse om... det her kunne mås...
2956    Ind og ruske tremmer med hende,Den syge kælling!!
2957                                             fedtmule
2958                                    ##HAR I HØRT DET?
2959    Kommer det bag på nogen? Det er jo fucking var...
Name: tweet, Length: 2960, dtype: object

In [4]:
#tokenization
texts_tokenized = []
for el in texts:
    texts_tokenized.append([emoji.demojize(el) for el in el.split()]) #nltk.word_tokenize(el)

    
texts_tokenized[2883]

['@USER',
 'næste',
 'gang',
 'pastaen',
 'er',
 'brændt',
 'på',
 '!',
 ':face_with_tears_of_joy:',
 'det',
 'jo',
 'sygt...']

In [5]:
#normalization
texts_normalized = []
for sent in texts_tokenized:
    # remove emojis
    sent = [el.lower() for el in sent if not bool(re.search(r":.*:", el))] #or replace with just name?
    # remove repeating characters
    sent = [re.sub(r"(.)\1{2,}", r"\1", el) for el in sent]
    sent = [re.sub(r"[!?\(\)\\&.,:><_#\[\]/]+", "", el)for el in sent]
    texts_normalized.append([el for el in sent if el != ""])
        
print(texts_normalized[2883])

['@user', 'næste', 'gang', 'pastaen', 'er', 'brændt', 'på', 'det', 'jo', 'sygt']


In [6]:
texts_normalized2 = [" ".join(el) for el in texts_normalized]
texts_normalized3 = pd.DataFrame(texts_normalized2, columns=["tweet"])
texts_normalized3
data["tweet"] = texts_normalized2
data

Unnamed: 0,id,tweet,subtask_a
0,3131,jeg tror det vil være dejlig køligt men jeg vi...,NOT
1,711,så kommer de nok til at investere i en ny cyke...,NOT
2,2500,nu er det jo også de ikea-aber der har lavet s...,OFF
3,2678,128 varme emails er vi enige om at det er sext...,NOT
4,784,desværre tyder det på at amerikanerne er helt ...,NOT
...,...,...,...
2955,170,har sgu lidt en anelse om det her kunne måske ...,NOT
2956,1226,ind og ruske tremmer med hendeden syge kælling,OFF
2957,2596,fedtmule,NOT
2958,1802,har i hørt det,NOT


In [7]:
texts = data["tweet"].apply(str)
texts, labels

(0       jeg tror det vil være dejlig køligt men jeg vi...
 1       så kommer de nok til at investere i en ny cyke...
 2       nu er det jo også de ikea-aber der har lavet s...
 3       128 varme emails er vi enige om at det er sext...
 4       desværre tyder det på at amerikanerne er helt ...
                               ...                        
 2955    har sgu lidt en anelse om det her kunne måske ...
 2956       ind og ruske tremmer med hendeden syge kælling
 2957                                             fedtmule
 2958                                       har i hørt det
 2959    kommer det bag på nogen det er jo fucking varm...
 Name: tweet, Length: 2960, dtype: object,
 0       NOT
 1       NOT
 2       OFF
 3       NOT
 4       NOT
        ... 
 2955    NOT
 2956    OFF
 2957    NOT
 2958    NOT
 2959    OFF
 Name: subtask_a, Length: 2960, dtype: object)

In [8]:
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.25, random_state=123)
texts_train, texts_test, labels_train, labels_test

(2734    jeg vil foreslå to andre billeder af københavn...
 2886    og der bliver stadigt flereurl danmark længe leve
 2738                                                  lol
 2937    ah og vi har allerede fået bygget en mur mod i...
 2671    vi må da sige han går da all in når man skal t...
                               ...                        
 1147    det er jo ikke sjovt nogen ting spøger man bar...
 2154                                  det har du os rat i
 1766                          grønt er altså for bornholm
 1122    de skal bare kvæles langsomt så de ligger stil...
 1346    syntes man begynder at høre mere om partering ...
 Name: tweet, Length: 2220, dtype: object,
 418     ikke noget andre kvinder ikke har men ser bedr...
 1224    foretrækker dog isterningebakken den kan bruge...
 1617    troede at vores flag altid brandte det gør det...
 1745                                                 mere
 2139                        også kendt som opium med brus
             

In [9]:
data['subtask_a'].value_counts()

NOT    2576
OFF     384
Name: subtask_a, dtype: int64

In [18]:
le =LabelEncoder()
le.fit(labels_train)
print(le.classes_)
train_labels=le.transform(labels_train)
print(train_labels)
test_labels=le.transform(labels_test)
print(test_labels)

['NOT' 'OFF']
[0 0 0 ... 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0

In [24]:
vectorizer = CountVectorizer()
vectorizer.fit_transform(texts_train)
#print(texts_train)
train_texts = vectorizer.transform(texts_train)
test_texts = vectorizer.transform(texts_test)
print(train_texts.toarray())
print(test_texts.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [49]:
input_dim = len(train_texts.toarray()[0])
input_dim


8127

In [177]:
class ClassificationNet(nn.Module):

    def __init__(self):
        super(ClassificationNet, self).__init__()
        '''
        Defining layers of neural network
        '''
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 2)


    def forward(self, x):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (data_points, num_features)
        Returns:
            the resulting tensor.
        """
        x = self.fc1(x)
        #print(x)
        x = torch.sigmoid(x)
        #print(x)
        x = self.fc2(x)
        #print(x)
        #x = torch.sigmoid(x)
        #x = torch.sigmoid(self.fc2(torch.sigmoid(self.fc1(x))))
        
        
        return x

In [178]:
net = ClassificationNet()

# learning rate

n = 0.05

# optimizer

optimizer = optim.SGD(net.parameters(), lr=n)

# loss function
loss_func = nn.CrossEntropyLoss() #nn.BCELoss()


epochs= 500 

#converting train and test set arrays to tensor
train_texts_tensor=torch.tensor(train_texts.toarray()).float()
print(len(train_texts_tensor))
train_labels_tensor=torch.tensor(train_labels)
test_texts_tensor=torch.tensor(test_texts.toarray()).float()
test_labels_tensor=torch.tensor(test_labels)
print(test_labels_tensor)

2220
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 

In [179]:
def evaluation_metrics(pred_label,true_label):
    '''
    Returns accuracy and f1 score metrics for evaluation
    '''
    accuracy=accuracy_score(true_label,pred_label)
    f1score=f1_score(true_label,pred_label,average='macro')
    
    return (accuracy,f1score)

In [180]:
# training
for i in range(epochs):
    
    #optimizer.zero_grad()
    output = net(train_texts_tensor)
    #print(output)
    #print(output.shape, train_labels_tensor.shape)
    loss = loss_func(output, train_labels_tensor.long())
    
    loss.backward()
    
    optimizer.step()
    
    with torch.no_grad():
            
        output = net(test_texts_tensor)
            
        loss_val = loss_func(output, test_labels_tensor.long())
        
        predict_label= output.data.max(1, keepdim=True)[1]
        
        accuracy,f1score=evaluation_metrics(predict_label,test_labels_tensor.long())
        
    if i%10 == 0:
        print(i, accuracy, f1score)
#for el in zip(predict_label, test_labels):
    #print(el)
        #pred = torch.stack(predict_y, 1).squeeze()
        #print(pred)
        #ev = evaluation_metrics(pred, test_labels_tensor.float())
        #print(ev)

0 0.904054054054054 0.47480482611781405
10 0.904054054054054 0.47480482611781405
20 0.904054054054054 0.47480482611781405
30 0.904054054054054 0.5363531270131223
40 0.9 0.4736842105263158
50 0.9 0.48680461838356576
60 0.904054054054054 0.546877560348768
70 0.904054054054054 0.47480482611781405
80 0.904054054054054 0.47480482611781405
90 0.904054054054054 0.47480482611781405
100 0.904054054054054 0.47480482611781405
110 0.904054054054054 0.47480482611781405
120 0.904054054054054 0.47480482611781405
130 0.904054054054054 0.47480482611781405
140 0.904054054054054 0.47480482611781405
150 0.904054054054054 0.47480482611781405
160 0.9027027027027027 0.4744318181818182
170 0.904054054054054 0.47480482611781405
180 0.9013513513513514 0.5542573293396373
190 0.8878378378378379 0.5819237497532519
200 0.9067567567567567 0.5154448398576513
210 0.9067567567567567 0.5494136023085272
220 0.8783783783783784 0.615153476331361
230 0.9 0.6304394773782529
240 0.9148648648648648 0.6311679681010134
250 0.916

In [187]:
output = net(test_texts_tensor)
print(output)
predict_label= output.data.max(1, keepdim=True)[1]
tp = 0
all = 0
f = 0
for el, e in zip(predict_label, test_labels):
    if el.item() == e:
        tp += 1
    else:
        f += 1
    all += 1
print(tp, f, all)

tensor([[  7.2928,  -7.3610],
        [ 12.8236, -12.8625],
        [  2.7378,  -2.8031],
        ...,
        [ -1.9575,   1.9652],
        [  6.8403,  -6.8913],
        [  4.2639,  -4.2754]], grad_fn=<AddmmBackward>)
678 62 740
