Part 1 : Exploring Gensim

In [1]:
import gensim
from gensim.utils import simple_preprocess

In [2]:
text="Machine learning is the study of computer algorithms that improve automatically through experience and by the use of data.It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks. A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning. In its application across business problems, machine learning is also referred to as predictive analytics."

In [3]:
# Tokenization of the corpus 
tokenized =[]
for sentence in text.split('.'):
  tokenized.append(simple_preprocess(sentence, deacc = True))
  
print(tokenized)

[['machine', 'learning', 'is', 'the', 'study', 'of', 'computer', 'algorithms', 'that', 'improve', 'automatically', 'through', 'experience', 'and', 'by', 'the', 'use', 'of', 'data'], ['it', 'is', 'seen', 'as', 'part', 'of', 'artificial', 'intelligence'], ['machine', 'learning', 'algorithms', 'build', 'model', 'based', 'on', 'sample', 'data', 'known', 'as', 'training', 'data', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so'], ['machine', 'learning', 'algorithms', 'are', 'used', 'in', 'wide', 'variety', 'of', 'applications', 'such', 'as', 'in', 'medicine', 'email', 'filtering', 'and', 'computer', 'vision', 'where', 'it', 'is', 'difficult', 'or', 'unfeasible', 'to', 'develop', 'conventional', 'algorithms', 'to', 'perform', 'the', 'needed', 'tasks'], ['subset', 'of', 'machine', 'learning', 'is', 'closely', 'related', 'to', 'computational', 'statistics', 'which', 'focuses', 'on', 'making', 'predictions', 'using',

In [4]:
# Created a dictionary out of the set of tokens
from gensim import corpora

my_dictionary = corpora.Dictionary(tokenized)
print(my_dictionary)

Dictionary(96 unique tokens: ['algorithms', 'and', 'automatically', 'by', 'computer']...)


In [5]:
# Converted dictionary into a Bag of Words corpus
BoW_corpus =[my_dictionary.doc2bow(doc, allow_update = True) for doc in tokenized]
print(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1)], [(8, 1), (11, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(0, 1), (5, 2), (9, 1), (10, 1), (18, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1)], [(0, 2), (1, 1), (4, 1), (8, 1), (9, 1), (10, 1), (11, 1), (14, 1), (18, 1), (20, 1), (29, 2), (34, 1), (40, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1)], [(8, 2), (9, 3), (10, 2), (11, 1), (33, 1), (36, 1), (40, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1)], [(1, 1), (9, 1), (10, 1), (11, 2), (12, 1), (14, 2), (40, 1), (75, 1), (76, 1), (77, 1), (78

In [6]:
# Printing every word's frequency
from gensim import models
import numpy as np
  
# Word weight in Bag of Words corpus
word_weight =[]
for doc in BoW_corpus:
  for id, freq in doc:
    word_weight.append([my_dictionary[id], freq])
print(word_weight)

[['algorithms', 1], ['and', 1], ['automatically', 1], ['by', 1], ['computer', 1], ['data', 1], ['experience', 1], ['improve', 1], ['is', 1], ['learning', 1], ['machine', 1], ['of', 2], ['study', 1], ['that', 1], ['the', 2], ['through', 1], ['use', 1], ['is', 1], ['of', 1], ['artificial', 1], ['as', 1], ['intelligence', 1], ['it', 1], ['part', 1], ['seen', 1], ['algorithms', 1], ['data', 2], ['learning', 1], ['machine', 1], ['as', 1], ['based', 1], ['being', 1], ['build', 1], ['decisions', 1], ['do', 1], ['explicitly', 1], ['in', 1], ['known', 1], ['make', 1], ['model', 1], ['on', 1], ['or', 1], ['order', 1], ['predictions', 1], ['programmed', 1], ['sample', 1], ['so', 1], ['to', 2], ['training', 1], ['without', 1], ['algorithms', 2], ['and', 1], ['computer', 1], ['is', 1], ['learning', 1], ['machine', 1], ['of', 1], ['the', 1], ['as', 1], ['it', 1], ['in', 2], ['or', 1], ['to', 2], ['applications', 1], ['are', 1], ['conventional', 1], ['develop', 1], ['difficult', 1], ['email', 1], ['f

In [7]:
# Create TF-IDF model
tfIdf = models.TfidfModel(BoW_corpus, smartirs ='ntc')
  
# TF-IDF Word Weight
weight_tfidf =[]
for doc in tfIdf[BoW_corpus]:
  for id, freq in doc:
    weight_tfidf.append([my_dictionary[id], np.around(freq, decimals = 3)])
print(weight_tfidf) 

[['algorithms', 0.165], ['and', 0.165], ['automatically', 0.331], ['by', 0.331], ['computer', 0.226], ['data', 0.165], ['experience', 0.331], ['improve', 0.331], ['is', 0.061], ['learning', 0.038], ['machine', 0.061], ['of', 0.122], ['study', 0.165], ['that', 0.331], ['the', 0.331], ['through', 0.226], ['use', 0.331], ['is', 0.085], ['of', 0.085], ['artificial', 0.463], ['as', 0.171], ['intelligence', 0.463], ['it', 0.317], ['part', 0.463], ['seen', 0.463], ['algorithms', 0.117], ['data', 0.235], ['learning', 0.027], ['machine', 0.043], ['as', 0.087], ['based', 0.235], ['being', 0.235], ['build', 0.235], ['decisions', 0.235], ['do', 0.235], ['explicitly', 0.235], ['in', 0.117], ['known', 0.235], ['make', 0.235], ['model', 0.235], ['on', 0.117], ['or', 0.161], ['order', 0.235], ['predictions', 0.161], ['programmed', 0.235], ['sample', 0.235], ['so', 0.235], ['to', 0.126], ['training', 0.235], ['without', 0.235], ['algorithms', 0.211], ['and', 0.106], ['computer', 0.144], ['is', 0.039], 

In [8]:
# Text summarization using gensim
from gensim.summarization import summarize, keywords
  
# Summarize the paragraph
print(summarize(text, word_count = 25))

The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.


Part 2: Exploring spaCy

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Create an nlp object
text = nlp("Artificial intelligence refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.")

In [10]:
# POS Tagging
for token in text:
    # Print the token and its part-of-speech tag
    print(token.text, "-->", token.pos_)

Artificial --> ADJ
intelligence --> NOUN
refers --> VERB
to --> ADP
the --> DET
simulation --> NOUN
of --> ADP
human --> ADJ
intelligence --> NOUN
in --> ADP
machines --> NOUN
that --> DET
are --> AUX
programmed --> VERB
to --> PART
think --> VERB
like --> SCONJ
humans --> NOUN
and --> CCONJ
mimic --> VERB
their --> DET
actions --> NOUN
. --> PUNCT


In [11]:
# Dependency parsing
for token in text:
    print(token.text, "-->", token.dep_)

Artificial --> amod
intelligence --> nsubj
refers --> ROOT
to --> prep
the --> det
simulation --> pobj
of --> prep
human --> amod
intelligence --> pobj
in --> prep
machines --> pobj
that --> nsubjpass
are --> auxpass
programmed --> relcl
to --> aux
think --> xcomp
like --> prep
humans --> pobj
and --> cc
mimic --> conj
their --> poss
actions --> dobj
. --> punct


In [12]:
# Named Entity Recognition

text2 = nlp("Indian population originated in three migration waves from Africa, Iran and Asia.")
 
for ent in text2.ents:
    print(ent.text, ent.label_)

Indian NORP
three CARDINAL
Africa LOC
Iran GPE
Asia LOC


Part 3: Exploring Wordnet with NLTK

In [13]:
from nltk.corpus import wordnet as wn

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Synsets of the word "good" in wordnet
wn.synsets('good')

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [16]:
print(wn.synset('good.n.03').definition())

that which is pleasing or valuable or useful


In [17]:
print(wn.synset('good.n.03').examples()[0])

weigh the good against the bad


In [18]:
wn.synset('good.n.03').lemmas()

[Lemma('good.n.03.good'), Lemma('good.n.03.goodness')]

In [19]:
cat = wn.synset('cat.n.01')
print("Hypernyms of cat-->")
print(cat.hypernyms())
print("Hyponyms of cat-->")
print(cat.hyponyms())
print("Lowest common hypernyms of cat and dog -->")
print(wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')))
good = wn.synset('good.a.01')
print("Antonyms of good -->")
print(good.lemmas()[0].antonyms())

Hypernyms of cat-->
[Synset('feline.n.01')]
Hyponyms of cat-->
[Synset('domestic_cat.n.01'), Synset('wildcat.n.03')]
Lowest common hypernyms of cat and dog -->
[Synset('carnivore.n.01')]
Antonyms of good -->
[Lemma('bad.a.01.bad')]


In [20]:
# Similarity between two synsets
hit = wn.synset('hit.v.01')
slap = wn.synset('slap.v.01')

hit.path_similarity(slap)

0.14285714285714285