In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC

In [2]:
# read in corpus

import csv
filename = "../Danish/Danish/offenseval-da-training-v1.tsv"

# Parallel lists: corpus[i] is the text whose label string is labels[i].
corpus, labels = [], []

# newline="" hands line-ending handling to the csv module, which the csv
# documentation requires so that line breaks embedded in quoted fields are
# parsed correctly.
with open(filename, encoding="utf-8", newline="") as tsv_file:
    rd = csv.reader(tsv_file, delimiter="\t", quotechar='"')
    for row in rd:
        # Keep only rows with exactly three columns, and skip the row whose
        # label column is the literal "subtask_a" (presumably the header).
        if len(row) != 3 or row[2] == "subtask_a":
            continue
        corpus.append(row[1])  # second column: the text
        labels.append(row[2])  # third column: the label string

# Sanity checks: both lists have the same length and the texts are strings.
print(len(corpus)==len(labels))
print(type(corpus[1]))

True
<class 'str'>


In [3]:
# Encode the string labels as integers. The encoder is fitted on the two known
# class names rather than on the data, so the mapping does not depend on which
# labels happen to occur in the corpus.
# NOTE: `labels` is rebound to the encoded array, so this cell cannot be re-run
# on its own; re-run the corpus-loading cell first.
le = preprocessing.LabelEncoder().fit(["NOT", "OFF"])
labels = le.transform(labels)
labels

array([0, 0, 1, ..., 0, 0, 1])

In [4]:
# Hold out 10% of the examples as the test set; the fixed random_state keeps
# the split identical from run to run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    corpus,
    labels,
    test_size=0.1,
    random_state=7,
)

In [5]:
# Learn the vocabulary from the training texts only. Fitting on train + test
# leaks test-set information into the features, and it also creates columns
# for words that occur only in the test set: those columns are all-zero during
# training, so a model never learns weights for them.
vectorizer = CountVectorizer()

vectorizer.fit(train_texts)

# transform training and test set separately; test words that are not part of
# the training vocabulary are ignored by transform()
trainX = vectorizer.transform(train_texts)
testX = vectorizer.transform(test_texts)

# Count matrix of all texts (training rows first, then test rows), kept under
# its original name for any code that still refers to it.
train_and_test = vectorizer.transform(train_texts+test_texts)

In [6]:
# Fit the IDF weights on the training counts only, so that no document
# statistics from the test set leak into the features.
transformer = TfidfTransformer()

# The counts are recomputed from the texts instead of being read from
# trainX/testX: this cell rebinds those two names, so reading them here would
# apply the TF-IDF weighting a second time whenever the cell is re-run.
trainX = transformer.fit_transform(vectorizer.transform(train_texts))
testX = transformer.transform(vectorizer.transform(test_texts))

## Shallow feed-forward network (FFN) with one linear layer and a sigmoid activation function

In [10]:
# define the model 

import torch
import torch.nn as nn

class Net(nn.Module):
    """Single linear layer followed by a sigmoid: one value in (0, 1) per input row.

    Args:
        in_features: Length of each input vector. When omitted, the
            module-level ``input_dim`` variable is read at construction time,
            so the existing ``Net()`` call keeps working.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible fallback to the notebook-level global.
            in_features = input_dim
        self.fc1 = nn.Linear(in_features, 1)

    def init_weights(self):
        """Overwrite the layer weights with values drawn uniformly from [-0.5, 0.5).

        The bias is left untouched. The constructor does not call this method,
        so nn.Linear's default initialisation applies unless it is invoked
        explicitly.
        """
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)

    def forward(self, x):
        """Apply the linear layer and squash the result with a sigmoid.

        Args:
            x: Tensor whose last dimension equals the layer's input size.

        Returns:
            Tensor with a last dimension of size 1 and values in (0, 1).
        """
        return torch.sigmoid(self.fc1(x))

# Number of feature columns. Reading it from the matrix shape avoids converting
# the whole sparse matrix to a dense array just to measure one row.
input_dim = trainX.shape[1]

In [11]:
from torch.utils.data import Dataset, DataLoader

class Data(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        data: Indexable collection of feature rows (the notebook passes a
            dense 2-D array).
        labels: Indexable collection of labels, aligned with ``data``.
    """

    def __init__(self, data, labels):
        self.labels = labels
        self.data = data

    def __len__(self):
        """Return the total number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return sample ``index`` as a ``(features, label)`` pair of float32 tensors."""
        X = self.data[index]
        y = self.labels[index]

        # The features are inputs, not trainable parameters, so they must not
        # track gradients: doing so only adds autograd work and turns every
        # collated batch into a non-leaf tensor.
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)
    
# Training dataset: densified feature rows paired with their labels.
data = Data(data=trainX.toarray(), labels=train_labels)

In [25]:
# training setup (data loader, model, optimizer, loss) followed by the training loop
import torch.optim as optim

# Seed PyTorch so that the weight initialisation and the batch shuffling, and
# with them the trained model, are the same on every run.
torch.manual_seed(7)

dataloader = DataLoader(dataset=data,batch_size=20,shuffle=True)

shallow_net=Net()

# define stochastic gradient descent as optimizer
optimizer = optim.SGD(shallow_net.parameters(), lr=0.01)

# define loss function: binary cross-entropy on the sigmoid outputs
loss_func = nn.BCELoss()

# training loop
for epoch in range(400):
    for batch_of_tensors, batch_labels in dataloader:
        optimizer.zero_grad()  # clear the gradients of the previous step

        outputs = shallow_net(batch_of_tensors) # forward pass, one column per row

        # the labels arrive as a flat vector; reshape them into a column so
        # they line up with the model outputs
        loss = loss_func(outputs, batch_labels.view(-1,1))

        loss.backward()
        optimizer.step()

In [15]:
def step(x, threshold=0.5):
    """Turn a score into a hard 0/1 decision.

    Args:
        x: Score to binarise.
        threshold: Cut-off value; defaults to 0.5.

    Returns:
        0 if ``x`` is strictly below ``threshold``, otherwise 1.
    """
    return 0 if x < threshold else 1

In [26]:
# use dataloader for predictions: batch_size = test set size

# The Dataset gets its own name so that `testdata` only ever refers to the
# feature tensor.
test_dataset = Data(testX.toarray(),test_labels)

testdataloader = DataLoader(dataset=test_dataset,batch_size=len(test_dataset))

testdata,_ = next(iter(testdataloader))  # only one batch; its labels are not needed, test_labels holds the gold labels

# Inference only: no autograd graph is needed for the forward pass.
with torch.no_grad():
    y_test_predict = shallow_net(testdata)     # make the prediction

# reshape(-1) always yields a 1-D vector, so the loop also works for a test set
# with a single example (squeeze() would collapse that to a 0-d array).
y_test_predict = [step(val) for val in y_test_predict.reshape(-1).numpy()]  # step function

print(y_test_predict)

accuracy_score(test_labels,y_test_predict)  # sklearn scorer

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


0.8716216216216216

In [27]:
# Per-class precision, recall and F1. zero_division=0 reports 0 instead of
# emitting a warning when a class is never predicted.
shallow_net_report = classification_report(
    y_true=test_labels,
    y_pred=y_test_predict,
    zero_division=0,
)
print(shallow_net_report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       258
           1       0.00      0.00      0.00        38

    accuracy                           0.87       296
   macro avg       0.44      0.50      0.47       296
weighted avg       0.76      0.87      0.81       296

