<center><h2>Program for POS Tagging and Word Embeddings</h2></center>

# Importing Packages

In [None]:
# Import NLTK and fetch its data packages up front.
# NOTE(review): the 'all' identifier requests the complete NLTK data collection,
# which is a large download; the tokenizer/tagger resources alone may suffice — confirm.
import nltk
nltk.download('all')

In [None]:
# Tokenizer and tagger entry points exposed by NLTK.
from nltk import word_tokenize
from nltk.tag import pos_tag
# Explicitly request the perceptron tagger model and the tag-set documentation.
# NOTE(review): these are presumably already covered by the earlier
# nltk.download('all') call; kept so this cell also works when run on its own.
nltk.download( 'averaged_perceptron_tagger')
nltk.download( 'tagsets')

# POS Tagging

In [6]:
# POS-tag a short passage with NLTK's default tagger (nltk.pos_tag).
# The passage is assembled from adjacent string literals; note that the second
# fragment ends with "Records," and the third starts immediately with "it"
# (no space in between), exactly as in the original text.
text = (
    'Ramoji Film City. Ramoji Film City is India’s only thematic holiday destination with cine-magic. '
    'Certified as the World’s Largest Film Studio complex by Guinness World Records,'
    'it spreads across 2000 acres. Millions of tourists visit the amusement park to live their dream vacation.'
)

# Split the passage into word/punctuation tokens, then tag each token.
tokens = nltk.word_tokenize(text)
tagged_tokens = nltk.pos_tag(tokens)

print(f'Parts of Speech (POS) Tags are: \n{tagged_tokens}')

Parts of Speech (POS) Tags are: 
[('Ramoji', 'NNP'), ('Film', 'NNP'), ('City', 'NNP'), ('.', '.'), ('Ramoji', 'NNP'), ('Film', 'NNP'), ('City', 'NNP'), ('is', 'VBZ'), ('India', 'NNP'), ('’', 'NNP'), ('s', 'VBZ'), ('only', 'RB'), ('thematic', 'JJ'), ('holiday', 'NN'), ('destination', 'NN'), ('with', 'IN'), ('cine-magic', 'JJ'), ('.', '.'), ('Certified', 'VBN'), ('as', 'IN'), ('the', 'DT'), ('World', 'NNP'), ('’', 'NNP'), ('s', 'RB'), ('Largest', 'NNP'), ('Film', 'NNP'), ('Studio', 'NNP'), ('complex', 'NN'), ('by', 'IN'), ('Guinness', 'NNP'), ('World', 'NNP'), ('Records', 'NNP'), (',', ','), ('it', 'PRP'), ('spreads', 'VBZ'), ('across', 'IN'), ('2000', 'CD'), ('acres', 'NNS'), ('.', '.'), ('Millions', 'NNS'), ('of', 'IN'), ('tourists', 'NNS'), ('visit', 'VBP'), ('the', 'DT'), ('amusement', 'NN'), ('park', 'NN'), ('to', 'TO'), ('live', 'VB'), ('their', 'PRP$'), ('dream', 'NN'), ('vacation', 'NN'), ('.', '.')]


In [7]:
# Averaged perceptron tagger trained from scratch on a toy corpus.
from nltk.tag.perceptron import PerceptronTagger

# Two hand-tagged sentences; each sentence is a list of (word, tag) pairs.
toy_corpus = [
    [('today', 'NN'), ('is', 'VBZ'), ('good', 'JJ'), ('day', 'NN')],
    [('yes', 'NNS'), ('it', 'PRP'), ('beautiful', 'JJ')],
]

# load=False starts from an empty model instead of the pretrained weights.
tagger = PerceptronTagger(load=False)
tagger.train(toy_corpus)

# With so little training data the predicted tags are not reliable
# (see the tags assigned to 'is' and 'a' in the cell output).
tagger.tag(['today', 'is', 'a', 'beautiful', 'day'])

[('today', 'NN'),
 ('is', 'PRP'),
 ('a', 'PRP'),
 ('beautiful', 'JJ'),
 ('day', 'NN')]

In [8]:
# Perceptron tagger with its default (pretrained) model loaded.
pretrain = PerceptronTagger()

# Whitespace-split the pangram into words and tag them.
pangram_words = 'The quick brown fox jumps over the lazy dog'.split()
tag1 = pretrain.tag(pangram_words)
print(tag1)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


In [9]:
# POS tagging with spaCy's small English pipeline.
import spacy

nlp = spacy.load('en_core_web_sm')

# Same passage as in the NLTK example, built from adjacent string literals
# (there is intentionally no space between "Records," and "it").
text = (
    'Ramoji Film City. Ramoji Film City is India’s only thematic holiday destination with cine-magic. '
    'Certified as the World’s Largest Film Studio complex by Guinness World Records,'
    'it spreads across 2000 acres. Millions of tourists visit the amusement park to live their dream vacation.'
)

doc = nlp(text)

# Header row: fixed-width columns followed by the free-width tag explanation.
header = f"{'text':8} {'POS':6} {'TAG':6} {'Dep':6} {'POS explained':20} {'tag explained'} "
print(header)


# One row per token: surface text, coarse POS, fine-grained tag, dependency
# label, and spaCy's human-readable explanation of the POS and tag codes.
for token in doc:
    pos_meaning = spacy.explain(token.pos_)
    tag_meaning = spacy.explain(token.tag_)
    print(f'{token.text:8} {token.pos_:6} {token.tag_:6} {token.dep_:6} {pos_meaning:20} {tag_meaning}')

text     POS    TAG    Dep    POS explained        tag explained 
Ramoji   PROPN  NNP    compound proper noun          noun, proper singular
Film     PROPN  NNP    compound proper noun          noun, proper singular
City     PROPN  NNP    ROOT   proper noun          noun, proper singular
.        PUNCT  .      punct  punctuation          punctuation mark, sentence closer
Ramoji   PROPN  NNP    compound proper noun          noun, proper singular
Film     PROPN  NNP    compound proper noun          noun, proper singular
City     PROPN  NNP    nsubj  proper noun          noun, proper singular
is       AUX    VBZ    ROOT   auxiliary            verb, 3rd person singular present
India    PROPN  NNP    attr   proper noun          noun, proper singular
’s       PART   POS    punct  particle             possessive ending
only     ADJ    JJ     advmod adjective            adjective
thematic ADJ    JJ     amod   adjective            adjective
holiday  NOUN   NN     compound noun                 n

# Word Embedding

## Gensim

In [10]:
!pip install gensim



In [12]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Fit a Word2Vec model on gensim's bundled toy corpus and persist it to disk.
w2v_options = dict(window=5, min_count=1, workers=4)
model = Word2Vec(sentences=common_texts, **w2v_options)
model.save("word2vec.model")



In [13]:
# Reload the saved model and run one more training epoch on a single extra sentence.
model = Word2Vec.load("word2vec.model")

extra_sentences = [["hello", "world"]]
# The call's return value is shown as the cell output.
model.train(extra_sentences, total_examples=len(extra_sentences), epochs=1)



(0, 2)

In [15]:
query_word = 'computer'

# Embedding vector (numpy array) for the query word.
vector = model.wv[query_word]

# The ten nearest words to the query, with their similarity scores.
sims = model.wv.most_similar(query_word, topn=10)
sims

[('user', 0.20155365765094757),
 ('human', 0.10132145881652832),
 ('graph', 0.07621297240257263),
 ('response', 0.07165258377790451),
 ('trees', 0.02122265100479126),
 ('survey', 0.010687898844480515),
 ('eps', -0.03111916035413742),
 ('minors', -0.05052899569272995),
 ('interface', -0.05236271023750305),
 ('system', -0.06287829577922821)]

In [16]:
import gensim.downloader

In [17]:
# List the names of every pretrained model registered in gensim-data.
model_catalog = gensim.downloader.info()['models']
print(list(model_catalog.keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [21]:
# Imports for the corpus-based Word2Vec experiments (sentence/word tokenizers and gensim).
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
  
# Silence all warnings from this point on. Note that this also hides library
# deprecation warnings, so remove it while debugging.
warnings.filterwarnings(action = 'ignore')
  
import gensim
from gensim.models import Word2Vec

In [25]:
# Continuous Bag-of-Words (CBOW) model

# Read the whole corpus; the context manager guarantees the file handle is
# closed even if reading fails.
with open("/content/charminar.txt", "r") as sample:
    s = sample.read()

# Replace newlines with spaces so sentences that wrap across lines stay intact.
f = s.replace("\n", " ")

# Build the training data: one list of lower-cased word tokens per sentence.
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

# Create the CBOW model (the default architecture, sg=0).
# gensim 4 renamed the `size` argument to `vector_size`.
model1 = gensim.models.Word2Vec(data, min_count=1,
                                vector_size=100, window=5)

# Print results. Similarity queries live on the KeyedVectors object (`.wv`);
# the model-level `similarity` method was removed in gensim 4.
# Tokens were lower-cased above, so the lookup words must be lower-case too.
print("Cosine similarity between 'Charminar' " +
      "and 'Golconda' - CBOW : ",
      model1.wv.similarity('charminar', 'golconda'))



Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.008664893


In [26]:
# Skip-Gram model: sg=1 selects skip-gram instead of the default CBOW.
# Trains on `data`, the tokenized sentences built in the CBOW cell.
# gensim 4 renamed the `size` argument to `vector_size`.
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=100,
                                window=5, sg=1)

# Print results, labelled as Skip-Gram so the output is not mistaken for the
# CBOW figure. Similarity queries go through the KeyedVectors object (`.wv`);
# the model-level `similarity` method was removed in gensim 4.
print("Cosine similarity between 'Charminar' " +
      "and 'Golconda' - Skip-Gram : ",
      model2.wv.similarity('charminar', 'golconda'))



Cosine similarity between 'Charminar' and 'Golcanda' - CBOW :  0.017693968


# Inference

<h2> The skip-gram model shows a higher similarity than CBOW. Moreover, we can define our own models and refine them based on the requirement. In POS tagging with the perceptron tagger, we can change the POS tag assigned to a word. Using gensim, we can find the different contexts of a given word. </h2>