In [1]:
# Read the raw training split as a list of lines (the first line is sliced off
# later, before parsing). The encoding is pinned to UTF-8 so the Polish
# diacritics decode identically on every platform instead of depending on the
# locale's default encoding.
with open("klej_polemo2.0-in/train.tsv", "r", encoding="utf-8") as f:
    raw_train = f.readlines()

In [2]:
# Read the raw dev split as a list of lines (the first line is sliced off
# later, before parsing). The encoding is pinned to UTF-8 so the Polish
# diacritics decode identically on every platform instead of depending on the
# locale's default encoding.
with open("klej_polemo2.0-in/dev.tsv", "r", encoding="utf-8") as f:
    raw_dev = f.readlines()

In [3]:
def prepare_data(raw_data):
    """Split raw TSV lines into parallel lists of texts and integer labels.

    Every non-blank line is expected to look like ``<text><TAB><target>``.
    The label is derived from substrings of the target field:

    * ``0`` if the target contains ``"plus"``
    * ``1`` if the target contains ``"minus"``
    * ``2`` for anything else

    Args:
        raw_data: iterable of raw lines (trailing newlines allowed).

    Returns:
        ``(corpus, labels)`` - two lists of equal length, aligned by position.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    corpus = []
    labels = []
    for doc in raw_data:
        line = doc.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no sample; skip it instead of failing on the unpacking.
            continue
        # Split on the LAST tab only: the target is the final field, so a tab
        # that happens to occur inside the text must not break the unpacking.
        text, target = line.rsplit("\t", 1)
        if "plus" in target:
            label = 0
        elif "minus" in target:
            label = 1
        else:
            label = 2
        corpus.append(text)
        labels.append(label)
    return corpus, labels

In [4]:
# Parse the training lines into texts and integer labels. The [1:] slice drops
# the first line of the file (presumably the TSV header row - confirm).
train_corpus, train_labels = prepare_data(raw_train[1:])

In [5]:
# Parse the evaluation lines into texts and integer labels. Note that the
# "test" names used from here on hold the data read from dev.tsv.
# The [1:] slice drops the first line of the file (presumably the TSV header
# row - confirm).
test_corpus, test_labels = prepare_data(raw_dev[1:])

# SVM + słownik

In [6]:
import pandas as pd
import spacy
# csr_matrix comes from the public scipy.sparse namespace: the
# scipy.sparse.csr submodule path is deprecated in recent SciPy releases.
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# spaCy pipeline used below to tokenise and lemmatise the documents.
nlp = spacy.load('pl_core_news_md')

In [7]:
# Load the tab-separated lexicon file (no header row) and build a lookup from
# the values in column 0 to the values in column 4.
# NOTE(review): presumably column 0 is the word and column 4 its sentiment
# score - confirm against the lexicon's documentation.
senti_df = pd.read_csv("slownikWydzwieku01.csv", header=None, sep="\t")
senti_dict = senti_df.set_index(0)[4].to_dict()

In [8]:
# Stop-word list fetched from GitHub (needs network access), read as a
# single-column, header-less CSV and turned into a set for O(1) membership tests.
# keep_default_na=False stops pandas from converting entries that look like
# missing-value markers (e.g. "nan", "null", "NA") into float NaN, so every
# entry stays a string.
stopwords = set(
    pd.read_csv(
        "https://raw.githubusercontent.com/bieli/stopwords/master/polish.stopwords.txt",
        header=None,
        keep_default_na=False,
    ).values[:, 0]
)
stopwords

{'a',
 'aby',
 'ach',
 'acz',
 'aczkolwiek',
 'aj',
 'albo',
 'ale',
 'alez',
 'ależ',
 'ani',
 'az',
 'aż',
 'bardziej',
 'bardzo',
 'beda',
 'bede',
 'bedzie',
 'bez',
 'bo',
 'bowiem',
 'by',
 'byc',
 'byl',
 'byla',
 'byli',
 'bylo',
 'byly',
 'bynajmniej',
 'być',
 'był',
 'była',
 'było',
 'były',
 'będzie',
 'będą',
 'będę',
 'cala',
 'cali',
 'caly',
 'cała',
 'cały',
 'ci',
 'cie',
 'ciebie',
 'cię',
 'co',
 'cokolwiek',
 'cos',
 'coś',
 'czasami',
 'czasem',
 'czemu',
 'czy',
 'czyli',
 'daleko',
 'deda',
 'dla',
 'dlaczego',
 'dlatego',
 'do',
 'dobrze',
 'dokad',
 'dokąd',
 'dosc',
 'dość',
 'duzo',
 'dużo',
 'dwa',
 'dwaj',
 'dwie',
 'dwoje',
 'dzis',
 'dzisiaj',
 'dziś',
 'gdy',
 'gdyby',
 'gdyz',
 'gdyż',
 'gdzie',
 'gdziekolwiek',
 'gdzies',
 'gdzieś',
 'go',
 'i',
 'ich',
 'ile',
 'im',
 'inna',
 'inne',
 'inny',
 'innych',
 'iz',
 'iż',
 'ja',
 'jak',
 'jakas',
 'jakaś',
 'jakby',
 'jaki',
 'jakichs',
 'jakichś',
 'jakie',
 'jakis',
 'jakiz',
 'jakiś',
 'jakiż',
 'jak

In [9]:
# Run the spaCy pipeline over both splits (training first, then test) with the
# "ner" component disabled, materialising the resulting Doc objects as lists.
doc_train_corpus, doc_test_corpus = (
    list(nlp.pipe(texts, disable=["ner"]))
    for texts in (train_corpus, test_corpus)
)

In [10]:
# Normalise every parsed document into a list of lemmas, keeping only
# alphabetic tokens whose lemma is not in the stop-word set. The same filter is
# applied to the training and the test split.
norm_train_corpus, norm_test_corpus = (
    [
        [tok.lemma_ for tok in parsed if tok.is_alpha and tok.lemma_ not in stopwords]
        for parsed in parsed_docs
    ]
    for parsed_docs in (doc_train_corpus, doc_test_corpus)
)

In [11]:
# Sanity check: display the lemma list produced for the first training document.
norm_train_corpus[0]

['super',
 'lekarz',
 'człowiek',
 'duży',
 'c',
 'duży',
 'doświadczenie',
 'trafny',
 'diagnoza',
 'wielki',
 'cierpliwość',
 'człowiek',
 'stary',
 'rok',
 'opiekować',
 'mama',
 'staruszka',
 'twierdzić',
 'mieć',
 'duży',
 'szczęście',
 'mieć',
 'lekarz',
 'naprawdę',
 'wiedzieć',
 'cobyśmy',
 'zrobić',
 'doktor',
 'dzięki',
 'mama',
 'żyć',
 'wizyta',
 'specjalista',
 'konsultować',
 'uważać',
 'dobry',
 'mieć',
 'ograniczony',
 'zaufanie',
 'dobry',
 'doktór',
 'napisać',
 'niestety',
 'mieć',
 'pacjent',
 'przepracować',
 'powód',
 'obawiać',
 'zdrowie',
 'dostęp',
 'trudny',
 'możliwy']

In [12]:
def _identity(tokens):
    """Return the input unchanged.

    The documents handed to the vectorizer are already lists of lemmas, so both
    the preprocessing and the tokenising step must be no-ops.
    """
    return tokens


# TF-IDF over pre-tokenised documents.
# * A named function is used instead of lambdas because lambdas cannot be
#   pickled, which would prevent saving the fitted vectorizer.
# * token_pattern=None: the default pattern is unused once a custom tokenizer
#   is given, and leaving it set makes scikit-learn emit a UserWarning.
# * ngram_range=(1, 2) builds unigram and bigram features; max_df / min_df are
#   document-frequency cut-offs given as proportions of the corpus.
vect = TfidfVectorizer(
    preprocessor=_identity,
    tokenizer=_identity,
    token_pattern=None,
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=0.002,
)

In [13]:
# Fit the vectorizer on the training lemmas and transform them in one step.
X_train = vect.fit_transform(norm_train_corpus)

In [14]:
# Display the dimensions of the training feature matrix.
X_train.shape

(5783, 5223)

In [15]:
# Transform only: the vectorizer fitted on the training split is reused and is
# not refitted on the evaluation data.
X_test = vect.transform(norm_test_corpus)

In [16]:
def _mean_sentiment(tokens, lexicon):
    """Return the mean lexicon value over ``tokens``.

    Tokens missing from ``lexicon`` contribute 0 but still count towards the
    document length. An empty token list yields 0.0.
    """
    if not tokens:
        # A document can end up empty after the alphabetic / stop-word filter;
        # dividing by len(tokens) would then raise ZeroDivisionError.
        return 0.0
    return sum(lexicon.get(token, 0) for token in tokens) / len(tokens)


# One lexicon-based score per document, for each split.
scores_train = [_mean_sentiment(doc, senti_dict) for doc in norm_train_corpus]
scores_test = [_mean_sentiment(doc, senti_dict) for doc in norm_test_corpus]

In [17]:
# Display the concrete type of the TF-IDF feature matrix.
type(X_train)

scipy.sparse.csr.csr_matrix

In [18]:
# Turn each per-document score sequence into a sparse (n_documents, 1) column.
# reshape((-1, 1)) is used instead of .T because this cell rebinds its own
# inputs: with .T, running the cell a second time would transpose the column
# back into a (1, n_documents) row. With reshape the cell is idempotent.
scores_train = csr_matrix(scores_train).reshape((-1, 1))
scores_test = csr_matrix(scores_test).reshape((-1, 1))

In [19]:
# Display the dimensions of the training score column; its row count has to
# match X_train for the horizontal stacking that follows.
scores_train.shape

(5783, 1)

In [20]:
# Append the sentiment-score column to the right of the TF-IDF features,
# for the training split and for the test split.
X_train_with_scores, X_test_with_scores = (
    hstack(feature_blocks)
    for feature_blocks in ([X_train, scores_train], [X_test, scores_test])
)

In [21]:
# Linear SVM classifier with a fixed random_state for repeatable runs.
# NOTE(review): how C=0.06 was chosen is not shown in this notebook.
svm = LinearSVC(random_state=42, C=0.06)

In [22]:
# Train on the TF-IDF features plus the appended sentiment-score column.
svm.fit(X_train_with_scores, train_labels)

LinearSVC(C=0.06, random_state=42)

In [23]:
# Per-class metrics on the data the model was fitted on.
# Classes follow prepare_data: 0 = "plus", 1 = "minus", 2 = everything else.
print(classification_report(train_labels, svm.predict(X_train_with_scores)))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      1568
           1       0.82      0.96      0.89      2194
           2       0.92      0.75      0.83      2021

    accuracy                           0.87      5783
   macro avg       0.88      0.87      0.87      5783
weighted avg       0.87      0.87      0.87      5783



In [24]:
# Per-class metrics on the held-out split (the data loaded from dev.tsv).
# Classes follow prepare_data: 0 = "plus", 1 = "minus", 2 = everything else.
print(classification_report(test_labels, svm.predict(X_test_with_scores)))

              precision    recall  f1-score   support

           0       0.86      0.75      0.80       209
           1       0.76      0.91      0.83       271
           2       0.80      0.72      0.76       243

    accuracy                           0.80       723
   macro avg       0.81      0.79      0.80       723
weighted avg       0.80      0.80      0.80       723

