# POS classification with SVM

frequency dictionary from [IBL](https://dcl.bas.bg/frequency.html)

3 labels:
* 1 - noun
* 2 - verb
* 3 - adjective

only one label - no captirung of ambiguity/polysemy *става* as noun or as verb, personal bias of the anotator

stop words, abbreviations and named entities removed

In [2]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
word_data = pd.read_csv('data/train_data.csv', sep='\t', names=['word', 'label'], index_col=False)

In [4]:
word_data

Unnamed: 0,word,label
0,година,1
1,кажа,2
2,време,1
3,страна,1
4,дейност,1
...,...,...
4995,отмъстя,2
4996,засилване,1
4997,прощавам,2
4998,вана,1


In [5]:
word_data.label.value_counts()

1    2642
2    1340
3    1018
Name: label, dtype: int64

In [6]:
train_words, validate_words, train_labels, validate_labels = model_selection.train_test_split(word_data['word'], word_data['label'], test_size=0.05)

In [7]:
Encoder = LabelEncoder()
train_labels = Encoder.fit_transform(train_labels)
validate_labels = Encoder.fit_transform(validate_labels)

In [8]:
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(word_data['word'])
train_word_tfidf = tfidf_vect.transform(train_words)
validate_word_tfidf = tfidf_vect.transform(validate_words)

In [9]:
print(tfidf_vect.vocabulary_)

{'година': 551, 'кажа': 1483, 'време': 421, 'страна': 4201, 'дейност': 651, 'ням': 2330, 'нов': 2306, 'нещо': 2298, 'път': 3568, 'знам': 1164, 'решение': 3780, 'имам': 1407, 'български': 256, 'дружество': 805, 'ден': 669, 'съд': 4288, 'голям': 556, 'част': 4872, 'работа': 3578, 'член': 4904, 'искам': 1464, 'под': 2885, 'лице': 1857, 'цял': 4862, 'ръка': 3830, 'име': 1408, 'връзка': 425, 'човек': 4907, 'място': 2092, 'цел': 4838, 'край': 1734, 'дума': 810, 'случай': 4014, 'основание': 2549, 'жена': 928, 'взема': 317, 'търговски': 4531, 'глава': 525, 'град': 578, 'последен': 3064, 'вид': 324, 'живот': 936, 'управление': 4625, 'око': 2467, 'закон': 1022, 'съвет': 4277, 'става': 4141, 'въпрос': 489, 'започна': 1090, 'капитал': 1508, 'използвам': 1334, 'видя': 330, 'сила': 3950, 'право': 3146, 'направя': 2162, 'дете': 678, 'държавен': 833, 'представлявам': 3235, 'мисля': 2008, 'адрес': 26, 'свят': 3897, 'хор': 4804, 'услуга': 4646, 'час': 4869, 'европейски': 848, 'дам': 624, 'група': 605, '

In [10]:
print(train_word_tfidf)

  (0, 3370)	1.0
  (1, 104)	1.0
  (2, 3824)	1.0
  (3, 1344)	1.0
  (4, 877)	1.0
  (5, 1123)	1.0
  (6, 173)	1.0
  (7, 3480)	1.0
  (8, 2287)	1.0
  (9, 2636)	1.0
  (10, 510)	1.0
  (11, 679)	1.0
  (12, 937)	1.0
  (13, 2985)	1.0
  (14, 4112)	1.0
  (15, 1928)	1.0
  (16, 2042)	1.0
  (17, 3287)	1.0
  (18, 4149)	1.0
  (19, 3990)	1.0
  (20, 848)	1.0
  (21, 2536)	1.0
  (22, 4780)	1.0
  (23, 711)	1.0
  (24, 4795)	1.0
  :	:
  (4725, 2693)	1.0
  (4726, 2418)	1.0
  (4727, 42)	1.0
  (4728, 3902)	1.0
  (4729, 3126)	1.0
  (4730, 97)	1.0
  (4731, 3843)	1.0
  (4732, 1575)	1.0
  (4733, 806)	1.0
  (4734, 1655)	1.0
  (4735, 1934)	1.0
  (4736, 2444)	1.0
  (4737, 2235)	1.0
  (4738, 4708)	1.0
  (4739, 1138)	1.0
  (4740, 3445)	1.0
  (4741, 1509)	1.0
  (4742, 3047)	1.0
  (4743, 2132)	1.0
  (4744, 654)	1.0
  (4745, 4174)	1.0
  (4746, 2164)	1.0
  (4747, 1169)	1.0
  (4748, 2558)	1.0
  (4749, 3571)	1.0


In [11]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_word_tfidf, train_labels)
# predict the labels on validation dataset
predictions_NB = Naive.predict(validate_word_tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, validate_labels)*100)
print("Naive Bayes Precision Score -> ", precision_score(predictions_NB, validate_labels, average='weighted')*100)
print("Naive Bayes Recall Score -> ", recall_score(predictions_NB, validate_labels, average='weighted')*100)
print("Naive Bayes F1 Score -> ", f1_score(predictions_NB, validate_labels, average='weighted')*100)

Naive Bayes Accuracy Score ->  57.99999999999999
Naive Bayes Precision Score ->  100.0
Naive Bayes Recall Score ->  57.99999999999999
Naive Bayes F1 Score ->  73.41772151898734


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_word_tfidf, train_labels)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(validate_word_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, validate_labels)*100)
print("SVM Precision Score -> ", precision_score(predictions_SVM, validate_labels, average='weighted')*100)
print("SVM Recall Score -> ", recall_score(predictions_SVM, validate_labels, average='weighted')*100)
print("SVM F1 Score -> ", f1_score(predictions_SVM, validate_labels, average='weighted')*100)

SVM Accuracy Score ->  57.99999999999999
SVM Precision Score ->  100.0
SVM Recall Score ->  57.99999999999999
SVM F1 Score ->  73.41772151898734


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
manual_eval = pd.read_csv('data/manual_eval.csv', sep='\t', names=['word', 'label'], index_col=False)

In [14]:
manual_eval

Unnamed: 0,word,label
0,златист,3
1,тумор,1
2,раздяла,1
3,заместник-министър,1
4,любител,1
...,...,...
95,напускане,1
96,ехо,1
97,оток,1
98,понасея,2


In [15]:
me_words = manual_eval.word

In [16]:
tfidf_vect.fit(manual_eval['word'])
predict_word_tfidf = tfidf_vect.transform(me_words)

In [17]:
new_predictions_SVM = SVM.predict(predict_word_tfidf)

ValueError: X has 101 features, but SVC is expecting 4997 features as input.