In [1]:
# Load the raw dev split; every element of raw_dev is one line of the TSV file.
# The encoding is pinned to UTF-8 so the Polish text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open("klej_polemo2.0-in/dev.tsv", "r", encoding="utf-8") as f:
    raw_dev = f.readlines()

In [2]:
def prepare_data(raw_data):
    """Split raw TSV lines into parallel lists of texts and integer labels.

    Every non-blank line must hold the text and the target separated by a tab.
    The target is mapped to a class id by substring match:
    0 if it contains "plus", 1 if it contains "minus", 2 otherwise.

    Args:
        raw_data: iterable of str lines (header line already removed).

    Returns:
        A ``(corpus, labels)`` tuple of two lists of equal length, where
        ``labels[i]`` is the class id of ``corpus[i]``.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    corpus = []
    labels = []
    for doc in raw_data:
        line = doc.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no example, so they are skipped instead of crashing.
            continue
        # Split on the LAST tab only: the target is the final field, so a tab
        # inside the text itself no longer breaks the two-value unpacking.
        text, target = line.rsplit("\t", 1)
        if "plus" in target:
            label = 0
        elif "minus" in target:
            label = 1
        else:
            label = 2
        corpus.append(text)
        labels.append(label)
    return corpus, labels

In [3]:
# Build the evaluation texts and label ids from every line except the first
# one (presumably the TSV header row - confirm against the data file).
test_corpus, test_labels = prepare_data(raw_dev[1:])

# CLARIN-PL - Analizator wydźwięku

https://ws.clarin-pl.eu/sentyment.shtml

http://nlp.pwr.wroc.pl/redmine/projects/nlprest2/wiki

http://nlp.pwr.wroc.pl/redmine/projects/nlprest2/wiki/Tools

In [4]:
import json
from urllib.request import urlopen, Request
import os
import time
from collections import Counter
import pandas as pd
from sklearn.metrics import classification_report

In [5]:
# Make sure the directory for the per-document result files exists.
# exist_ok=True makes the cell idempotent without a separate existence check
# (no window between checking and creating the directory).
os.makedirs("output", exist_ok=True)

In [6]:
# Value sent as the 'user' field of every task request.
# NOTE(review): looks like a placeholder address - confirm what the service expects.
user = "moj@adres.email"
# Processing pipeline description, sent as the 'lpmn' field of every task request.
task = 'any2txt|wcrft2({"morfeusz2":true})|wsd|sentiment|out("senti")|sentimerge({"split_paragraphs":"False"})'
# Base URL of the REST service; the endpoint paths (/upload/, /startTask/,
# /getStatus/, /download) are appended to it.
url = "http://ws.clarin-pl.eu/nlprest2/base"

# Prefix prepended to the name of every per-document result file.
out_path = 'output/'

In [7]:
def upload(doc, url):
    """Upload one document to the service and return the raw response body.

    Args:
        doc: document text; sent as the request body, encoded with
            ``str.encode()`` (UTF-8 by default).
        url: base URL of the service; "/upload/" is appended to it.

    Returns:
        bytes: the response body, undecoded.

    Raises:
        urllib.error.URLError: if the request fails.
    """
    request = Request(url + "/upload/", doc.encode(), {'Content-Type': 'binary/octet-stream'})
    # Use the response as a context manager so the connection is closed
    # right after reading instead of whenever the object is garbage-collected.
    with urlopen(request) as response:
        return response.read()

In [8]:
def process(data, url):
    """Start a task on the service and poll until it leaves the queue.

    Args:
        data: JSON-serialisable task description, posted to "/startTask/".
        url: base URL of the service.

    Returns:
        The "value" entry of the final status response, or ``None`` when the
        final status is "ERROR" (the error value is printed in that case).

    Raises:
        urllib.error.URLError: if any request fails.
    """
    payload = json.dumps(data).encode('utf-8')
    start_request = Request(url + '/startTask/', payload, {'Content-Type': 'application/json'})
    # Every response is used as a context manager so its connection is closed
    # immediately; the polling loop would otherwise leave one open per poll.
    with urlopen(start_request) as resp:
        task_id = resp.read().decode("utf-8")

    status_url = url + '/getStatus/' + task_id
    # Short wait before the first status check, longer wait between retries.
    delay = 0.2
    while True:
        time.sleep(delay)
        with urlopen(Request(status_url)) as resp:
            status = json.load(resp)
        if status["status"] not in ("QUEUE", "PROCESSING"):
            break
        delay = 0.5

    if status["status"] == "ERROR":
        # f-string instead of "+" so a non-string error value cannot raise
        # TypeError while the failure is being reported.
        print(f"Error {status['value']}")
        return None
    return status["value"]

In [9]:
# Send documents through the service one at a time and store each result as
# <out_path><idx>.csv, timing the whole run.
# NOTE(review): only the first 3 documents are processed here, while the later
# scoring cell reads output/{i}.csv for every document in test_corpus - widen
# the slice for a full run.
global_time = time.time()
for idx, doc in enumerate(test_corpus[:3]):
    doc_id = upload(doc, url)
    print(f"Processing: {idx}")
    task_request = {'lpmn' : task, 'user' : user, 'file' : doc_id.decode("utf-8")}
    task_result = process(task_request, url)
    if task_result is None:
        # process() has already printed the error; no file is written for this document.
        continue
    file_id = task_result["result"][0]["fileID"]
    # Close the download response as soon as its body has been read.
    with urlopen(Request(url + '/download' + file_id)) as response:
        content = response.read()
    # Write with an explicit encoding: the payload is decoded as UTF-8, so the
    # file must not fall back to the locale's default encoding.
    with open(out_path + str(idx) + '.csv', "w", encoding="utf-8") as f:
        f.write(content.decode("utf-8"))
print(f"GLOBAL {time.time() - global_time} seconds")

Processing: 0
Processing: 1
Processing: 2
GLOBAL 55.801297664642334 seconds


In [10]:
# Preview the first rows of the semicolon-separated result file saved for document 0.
pd.read_csv("output/0.csv", sep=";").head(12)

Unnamed: 0,Lemma,Syn_id,Polarity,Emotion_names,Emotion_valuations,Units in synset
0,pani,129,0,,,białogłowa.1(15:os) wenusjanka.3(15:os) pani.2...
1,doktor,231013,0,,,doktor.3(7:por) dr.1(7:por)
2,bardzo,460680,0,,,bardzo.1(48:48)
3,miła,227066,1,"radość,zaufanie","dobro,piękno,szczęście",luba.1(15:os) dulcynea.2(15:os) miła.1(15:os) ...
4,szkła,31312,0,,,szkła.1(3:wytw) patrzałki.1(3:wytw) binokle.2(...
5,dobrany,104472,1,"radość,zaufanie","dobro,szczęście,użyteczność",dobrany.1(42:jak)
6,mama,6298,0,,,matka.1(15:os) mama.1(15:os) rodzicielka.1(15:os)
7,źle,454949,0,,,negatywnie.2(48:48) niedobrze.2(48:48) źle.3(4...
8,gabinet,5583,0,,,biuro.2(12:msc) kancelaria.1(12:msc) gabinet.4...
9,być,250920,0,,,być.9(40:cst)


In [11]:
# Show the "Polarity" column of document 0 as a plain array (one value per row of the file).
pd.read_csv("output/0.csv", sep=";")["Polarity"].values

array([ 0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0])

In [12]:
# Gather the "Polarity" column of every document's result file;
# scores[i] holds the values read from output/{i}.csv.
scores = [
    pd.read_csv(f"output/{doc_idx}.csv", sep=";")["Polarity"].values
    for doc_idx in range(len(test_corpus))
]

In [13]:
# Total polarity of each document: the sum of its per-row polarity values.
scores_sum = list(map(sum, scores))

In [14]:
# Mean polarity of each document. A result file without rows yields an empty
# array; it is scored 0.0 (the same neutral value its sum has) instead of
# aborting the whole cell with ZeroDivisionError.
scores_avg = [sum(doc) / len(doc) if len(doc) else 0.0 for doc in scores]

In [15]:
# Range of the summed scores, as a reference when picking thresholds for them.
max(scores_sum), min(scores_sum)

(11, -10)

In [16]:
# Range of the averaged scores, as a reference when picking thresholds for them.
max(scores_avg), min(scores_avg)

(0.2571428571428571, -0.27586206896551724)

In [17]:
def predictor(val, pos_t, neg_t):
    """Turn a score into a class id using two thresholds.

    Args:
        val: score to classify.
        pos_t: scores strictly above this value give class 0.
        neg_t: scores strictly below this value give class 1.

    Returns:
        0 when ``val > pos_t``; otherwise 1 when ``val < neg_t``; otherwise 2.
        The ``pos_t`` comparison is made first, so it wins if both hold.
    """
    if val > pos_t:
        return 0
    return 1 if val < neg_t else 2

In [18]:
# Count how many documents carry each label id.
Counter(test_labels)

Counter({1: 271, 0: 209, 2: 243})

In [19]:
# Classify every document from its summed polarity and show how many
# documents fall into each predicted class.
sum_preds = [predictor(total, pos_t=2, neg_t=1) for total in scores_sum]
Counter(sum_preds)

Counter({1: 99, 2: 507, 0: 117})

In [20]:
# Classify every document from its mean polarity and show how many
# documents fall into each predicted class.
avg_preds = [predictor(mean, pos_t=0.03, neg_t=0.01) for mean in scores_avg]
Counter(avg_preds)

Counter({1: 99, 2: 466, 0: 158})

In [21]:
# Per-class precision/recall/F1 of the predictions made from summed scores.
print(classification_report(test_labels, sum_preds))

              precision    recall  f1-score   support

           0       0.50      0.28      0.36       209
           1       0.66      0.24      0.35       271
           2       0.33      0.70      0.45       243

    accuracy                           0.40       723
   macro avg       0.50      0.40      0.39       723
weighted avg       0.50      0.40      0.39       723



In [22]:
# Per-class precision/recall/F1 of the predictions made from averaged scores.
print(classification_report(test_labels, avg_preds))

              precision    recall  f1-score   support

           0       0.46      0.35      0.40       209
           1       0.66      0.24      0.35       271
           2       0.33      0.63      0.43       243

    accuracy                           0.40       723
   macro avg       0.48      0.41      0.39       723
weighted avg       0.49      0.40      0.39       723

