In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## install and load dataset
!pip install conllu

Collecting conllu
  Downloading https://files.pythonhosted.org/packages/e4/81/6a0166de55990d32ac4dd8375c83d918486ea2a3ef9036f39620f5bf7f94/conllu-4.3-py2.py3-none-any.whl
Installing collected packages: conllu
Successfully installed conllu-4.3


In [4]:
cd "/content/drive/MyDrive/POS_tagging"

/content/drive/MyDrive/POS_tagging


In [5]:
!wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz

--2021-01-26 04:26:18--  https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz
Resolving lindat.mff.cuni.cz (lindat.mff.cuni.cz)... 195.113.20.140
Connecting to lindat.mff.cuni.cz (lindat.mff.cuni.cz)|195.113.20.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 355216681 (339M) [application/x-gzip]
Saving to: ‘ud-treebanks-v2.5.tgz’


2021-01-26 04:26:37 (18.6 MB/s) - ‘ud-treebanks-v2.5.tgz’ saved [355216681/355216681]



In [6]:
!ls

'pos_tagger(Penn Treebank data).ipynb'	     ud-treebanks-v2.5.tgz
'pos_tagger(universal dependencies).ipynb'


In [10]:
!tar -zxvf ud-treebanks-v2.5.tgz

ud-treebanks-v2.5/
ud-treebanks-v2.5/UD_Portuguese-GSD/
ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.txt
ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-train.conllu
ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-test.conllu
ud-treebanks-v2.5/UD_Portuguese-GSD/README.md
ud-treebanks-v2.5/UD_Portuguese-GSD/stats.xml
ud-treebanks-v2.5/UD_Portuguese-GSD/LICENSE.txt
ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.conllu
ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-test.txt
ud-treebanks-v2.5/UD_Portuguese-GSD/pt_gsd-ud-dev.txt
ud-treebanks-v2.5/UD_Portuguese-PUD/
ud-treebanks-v2.5/UD_Portuguese-PUD/LICENSE.txt
ud-treebanks-v2.5/UD_Portuguese-PUD/stats.xml
ud-treebanks-v2.5/UD_Portuguese-PUD/README.md
ud-treebanks-v2.5/UD_Portuguese-PUD/pt_pud-ud-test.txt
ud-treebanks-v2.5/UD_Portuguese-PUD/pt_pud-ud-test.conllu
ud-treebanks-v2.5/UD_Telugu-MTG/
ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.txt
ud-treebanks-v2.5/UD_Telugu-MTG/stats.xml
ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-dev.tx

In [12]:
from io import open
from conllu import parse_incr

In [28]:
#Open the file and load the sentences to a list.
data_file = open("ud-treebanks-v2.5/UD_English-GUM/en_gum-ud-train.conllu", "r", encoding="utf-8")

ud_files = []
for tokenlist in parse_incr(data_file):
  ud_files.append(tokenlist)

print(f"Total number of token files :- {len(ud_files)}")
print(f"One example from the dataset :- \n {ud_files[9]}")

Total number of token files :- 3753
One example from the dataset :- 
 TokenList<The, viewing, experience, of, art, is, a, complex, one, ,, involving, issues, of, perception, ,, attention, ,, memory, ,, decision-making, ,, affect, ,, and, emotion, .>


In [33]:
## creating tokens and corresponding tags 
ud_treebank = []
for sentence in ud_files:
  tokens = []
  tags = []
  for token in sentence:
    tokens.append(token['form'])
    tags.append(token['upostag'])
  ud_treebank.append((tokens, tags))

In [35]:
print(f"First Sentence & associated tags")
print(f"Sentence :- \n {ud_treebank[0][0]}")
print(f"Tags :- \n {ud_treebank[0][1]}")

First Sentence & associated tags
Sentence :- 
 ['Aesthetic', 'Appreciation', 'and', 'Spanish', 'Art', ':']
Tags :- 
 ['ADJ', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PUNCT']


In [36]:
## split whole dataset into train and test datasets
train_dataset_size = int(0.8*len(ud_treebank)) ## taken 80% of data as traning dataset
ud_treebank_training_dataset = ud_treebank[:train_dataset_size]
ud_treebank_testing_dataset = ud_treebank[train_dataset_size:]

print(f"Size of Penn treebank training dataset :- {len(ud_treebank_training_dataset)}")
print(f"Size of Penn treebank testing dataset :- {len(ud_treebank_testing_dataset)}")

Size of Penn treebank training dataset :- 3002
Size of Penn treebank testing dataset :- 751


In [38]:
import re

In [37]:
## extract features from every word in the sentence
def extract_word_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
      }

In [39]:
## creating dataset from raw dataset with all word features
def transform_rawdata_dataset_format(tagged_sentences):
  X,y = [],[]
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [], []
    for index in range(len(sentence)):
      sent_word_features.append(extract_word_features(sentence=sentence, index=index))
      sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X,y

In [40]:
## create train, test datasets of features and target variables
X_ud_train, y_ud_train = transform_rawdata_dataset_format(ud_treebank_training_dataset)
X_ud_test, y_ud_test = transform_rawdata_dataset_format(ud_treebank_testing_dataset)

In [42]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 5.2MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [43]:
import warnings 
warnings.filterwarnings('ignore')
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics, scorers

In [44]:
## initialize CRF model
ud_treebank_crf = CRF(algorithm='lbfgs',
                        c1=0.01,
                        c2=0.1,
                        max_iterations= 100,
                        all_possible_transitions = True)


In [46]:
## training a model
print("Starting CRF model traing on Penn Treebank dataset ")
ud_treebank_crf.fit(X_ud_train, y_ud_train)
print("Completed CRF model traning on Penn Treebank Dataset sucessfully")

Starting CRF model traing on Penn Treebank dataset 
Completed CRF model traning on Penn Treebank Dataset sucessfully


In [47]:
## evaluate CRF model on test dataset
y_ud_predict_test_results =  ud_treebank_crf.predict(X_ud_test)
print("F1-score on test dataset ")
f1_score_testdata = metrics.flat_f1_score(y_ud_test, y_ud_predict_test_results, average='weighted', labels=ud_treebank_crf.classes_)
print(f1_score_testdata)

## evaluate CRF model on training dataset
y_ud_predict_train_results =  ud_treebank_crf.predict(X_ud_train)
print("F1-score on train dataset ")
f1_score_testdata = metrics.flat_f1_score(y_ud_train, y_ud_predict_train_results, average='weighted', labels=ud_treebank_crf.classes_)
print(f1_score_testdata)

F1-score on test dataset 
0.9042422401050215
F1-score on train dataset 
0.9889075355782778


In [48]:
## classification report
print("Class wise score:")
print(metrics.flat_classification_report(
    y_ud_test, y_ud_predict_test_results, labels=ud_treebank_crf.classes_, digits=3
))

Class wise score:
              precision    recall  f1-score   support

         ADJ      0.823     0.782     0.802       864
        NOUN      0.878     0.898     0.888      2338
       CCONJ      0.988     0.991     0.989       425
       PUNCT      0.998     0.992     0.995      1552
         ADP      0.907     0.932     0.919      1090
       PROPN      0.370     0.944     0.531       144
       SCONJ      0.841     0.782     0.810       385
         AUX      0.959     0.963     0.961       675
        VERB      0.910     0.786     0.843      1689
         DET      0.979     0.983     0.981       996
        PRON      0.970     0.965     0.967       990
         NUM      0.909     0.974     0.940       154
         ADV      0.826     0.880     0.852       560
           X      1.000     0.067     0.125        45
         SYM      0.545     0.600     0.571        10
        PART      0.943     0.929     0.936       408
        INTJ      0.800     0.500     0.615         8

    accu

Alphabetical listing


ADJ: adjective

1.   ADP: adposition
2. ADV: adverb
3. AUX: auxiliary
4. CCONJ: coordinating conjunction
5. DET: determiner
6. INTJ: interjection
7. NOUN: noun
8. NUM: numeral
9. PART: particle
10. PRON: pronoun
11. PROPN: proper noun
12. PUNCT: punctuation
13. SCONJ: subordinating conjunction
14. SYM: symbol
15. VERB: verb
16. X: other











In [49]:
## apply trained model on new text data
example_sentence = "The tagger produced good results"
word_features = [extract_word_features(example_sentence.split(), index) for index in range(len(example_sentence.split()))]
results = ud_treebank_crf.predict_single(word_features)
ud_tups = [(example_sentence.split()[index], results[index]) for index in range(len(example_sentence.split()))]
print(ud_tups)

[('The', 'DET'), ('tagger', 'NOUN'), ('produced', 'VERB'), ('good', 'ADJ'), ('results', 'NOUN')]
