In [1]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import spacy

# Name of the spaCy pipeline package; it has to be installed in the
# environment before this cell can run.
SPACY_MODEL = "pl_core_news_md"
nlp = spacy.load(SPACY_MODEL)

In [2]:
# Read both tab-separated splits. Note that `df_test` is loaded from the
# *dev* file, so every "test" figure later in the notebook is a dev-set score.
df_train, df_test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("klej_polemo2.0-in/train.tsv", "klej_polemo2.0-in/dev.tsv")
)

# Raw texts come from the "sentence" column, string labels from "target".
train_corpus, train_labels = df_train["sentence"].tolist(), df_train["target"].tolist()
test_corpus, test_labels = df_test["sentence"].tolist(), df_test["target"].tolist()

In [3]:
# Collapse the four string labels into three integer classes:
#   0 = negative, 1 = neutral, 2 = positive.
# "zero" and "amb" deliberately share class 1.
mapping = {"__label__meta_minus_m": 0,
           "__label__meta_zero": 1,
           "__label__meta_amb": 1,
           "__label__meta_plus_m": 2}

# Encode straight from the DataFrame columns instead of from the previous value
# of train_labels/test_labels. Re-encoding the lists in place made this cell
# non-idempotent: a second run looked up integers in `mapping` and raised
# KeyError. An unknown string label still raises KeyError, as before.
train_labels = [mapping[label] for label in df_train["target"]]
test_labels = [mapping[label] for label in df_test["target"]]

In [4]:
def _lemmatize(parsed_docs):
    """Turn parsed spaCy docs into lists of normalised tokens.

    For every doc, keeps only tokens that are alphabetic and not stop words,
    and returns their lemmas in lower case.
    """
    return [
        [tok.lemma_.lower() for tok in parsed if tok.is_alpha and not tok.is_stop]
        for parsed in parsed_docs
    ]


# Run the spaCy pipeline over both corpora with the "ner" component disabled;
# the parsed docs are materialised as lists so they can be reused.
doc_train_corpus = list(nlp.pipe(train_corpus, disable=["ner"]))
doc_test_corpus = list(nlp.pipe(test_corpus, disable=["ner"]))

norm_train_corpus = _lemmatize(doc_train_corpus)
norm_test_corpus = _lemmatize(doc_test_corpus)

In [5]:
def _identity(tokens):
    """Return the input unchanged.

    The corpora are already lists of normalised tokens, so the vectorizer's
    own preprocessing and tokenisation steps must be no-ops. A named function
    is used instead of a lambda because lambdas cannot be pickled, which would
    prevent saving the fitted vectorizer with pickle/joblib.
    """
    return tokens


# token_pattern=None: the default regex is never used when a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a warning.
# max_df / min_df are document-frequency proportions: terms found in more than
# 95% or in fewer than 1% of the training documents are dropped.
vect = TfidfVectorizer(preprocessor=_identity, tokenizer=_identity,
                       token_pattern=None,
                       max_df=0.95, min_df=0.01)
# Vocabulary and IDF weights are learned on the training split only and then
# applied unchanged to the held-out split.
X_train = vect.fit_transform(norm_train_corpus)
X_test = vect.transform(norm_test_corpus)

# Klasyfikacja

## Support Vector Machines

<img src="svm.png" alt="svm" align="center" width=400/>

<img src="both.png" alt="both" align="center" width=800/>

In [10]:
from sklearn.svm import LinearSVC, SVC

In [11]:
# Baseline linear SVM: regularisation strength C is left at the library
# default; only the iteration cap and the seed are set explicitly.
svm_linear = LinearSVC(
    max_iter=2000,
    random_state=42,
)

In [12]:
# Train on the TF-IDF matrix; the returned estimator is the cell's output.
svm_linear.fit(X=X_train, y=train_labels)

LinearSVC(max_iter=2000, random_state=42)

In [13]:
# Predictions on the data the model was fitted on (for the training accuracy).
train_predictions = svm_linear.predict(X=X_train)

In [14]:
# Predictions on the held-out split.
test_predictions = svm_linear.predict(X=X_test)

In [15]:
# Training accuracy of the baseline linear SVM.
accuracy_score(y_true=train_labels, y_pred=train_predictions)

0.8842270194986073

In [16]:
# Held-out accuracy of the baseline linear SVM.
accuracy_score(y_true=test_labels, y_pred=test_predictions)

0.7786998616874136

In [17]:
# Second linear SVM with an explicit C; in scikit-learn a smaller C means
# stronger regularisation. This rebinds `svm_linear`, so the baseline model
# fitted earlier is no longer reachable under that name.
# NOTE(review): C=0.06 looks tuned against the same held-out split that is
# reported below - confirm how the value was chosen.
svm_linear = LinearSVC(
    C=0.06,
    max_iter=2000,
    random_state=42,
)

In [18]:
# Train the regularised model on the same TF-IDF features.
svm_linear.fit(X=X_train, y=train_labels)

LinearSVC(C=0.06, max_iter=2000, random_state=42)

In [19]:
# Overwrites the baseline's training predictions with the regularised model's.
train_predictions = svm_linear.predict(X=X_train)

In [20]:
# Overwrites the baseline's held-out predictions with the regularised model's.
test_predictions = svm_linear.predict(X=X_test)

In [21]:
# Training accuracy of the regularised linear SVM.
accuracy_score(y_true=train_labels, y_pred=train_predictions)

0.8260793871866295

In [22]:
# Held-out accuracy of the regularised linear SVM.
accuracy_score(y_true=test_labels, y_pred=test_predictions)

0.7883817427385892

In [23]:
# Display names for integer classes 0, 1, 2 in that order.
labels = ["negative", "neutral", "positive"]
# Rows are the true classes, columns the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_true=train_labels, y_pred=train_predictions),
    index=labels,
    columns=labels,
)

Unnamed: 0,negative,neutral,positive
negative,2026,116,42
neutral,366,1448,193
positive,151,131,1271


In [24]:
# Same table for the held-out split: rows are true classes, columns predicted.
pd.DataFrame(
    data=confusion_matrix(y_true=test_labels, y_pred=test_predictions),
    index=labels,
    columns=labels,
)

Unnamed: 0,negative,neutral,positive
negative,237,24,10
neutral,46,176,21
positive,22,30,157


In [25]:
import joblib

In [26]:
# Persist the fitted classifier. Only the SVM is written here; the fitted
# TF-IDF vectorizer `vect` is not saved, yet it is required to turn new text
# into features the classifier can consume.
joblib.dump(value=svm_linear, filename="svm.joblib")

['svm.joblib']

In [27]:
# Reload the classifier from disk. joblib files are pickle-based, so only load
# files that come from a trusted source.
saved_svm = joblib.load(filename="svm.joblib")

In [28]:
# Smoke test of the reloaded model on the first ten held-out rows.
saved_svm.predict(X=X_test[:10])

array([0, 0, 0, 1, 1, 2, 2, 0, 1, 0])

In [6]:
# Kernel SVM with an RBF kernel, for comparison with the linear models above.
svm_rbf = SVC(
    kernel="rbf",
    random_state=42,
)

In [7]:
# Train the RBF model on the same TF-IDF features as the linear models.
svm_rbf.fit(X=X_train, y=train_labels)

SVC(random_state=42)

In [8]:
# Stored under a separate name so the linear model's predictions are kept.
train_preds = svm_rbf.predict(X=X_train)

In [9]:
# Held-out predictions of the RBF model.
test_preds = svm_rbf.predict(X=X_test)

In [29]:
# Training accuracy of the RBF SVM; compare with the held-out score to judge
# how much the model overfits.
accuracy_score(y_true=train_labels, y_pred=train_preds)

0.9717966573816156

In [30]:
# Held-out accuracy of the RBF SVM.
accuracy_score(y_true=test_labels, y_pred=test_preds)

0.7925311203319502