# Text Mining

In [17]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [85]:
import nltk
string = 'At Waterloo, where there were many trains, \
we were fortunate in catching a train for Leatherhead.'

In [86]:
tokens = nltk.tokenize.word_tokenize(string)
print(tokens)

['At', 'Waterloo', ',', 'where', 'there', 'were', 'many', 'trains', ',', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead', '.']


In [87]:
from nltk.corpus import RegexpTokenizer as regextoken
tokenizer = regextoken(r'\w+')
tokens = tokenizer.tokenize(string)
print(tokens)

['At', 'Waterloo', 'where', 'there', 'were', 'many', 'trains', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead']


In [88]:
tokens = [token.lower() for token in tokens]
print(tokens,end=" ")

['at', 'waterloo', 'where', 'there', 'were', 'many', 'trains', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'leatherhead'] 

In [89]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]
print(tokens[:20])

['waterloo', 'many', 'trains', 'fortunate', 'catching', 'train', 'leatherhead']


In [90]:
from nltk.stem.wordnet import WordNetLemmatizer
WordNetLemmatizer().lemmatize('lanes')

'lane'

In [91]:
lmtzr = WordNetLemmatizer()
tokens = [lmtzr.lemmatize(token) for token in tokens]
print(tokens)

['waterloo', 'many', 'train', 'fortunate', 'catching', 'train', 'leatherhead']


In [102]:
for ngram in nltk.ngrams(tokens, 2):
    print(ngram)

('waterloo', 'many')
('many', 'train')
('train', 'fortunate')
('fortunate', 'catching')
('catching', 'train')
('train', 'leatherhead')


In [105]:
ngram_freq = nltk.FreqDist() 
ngram_sent=nltk.ngrams(tokens, 2, 
pad_right = True, right_pad_symbol='</s>',
pad_left=True, left_pad_symbol='<s>')
for ngram in ngram_sent:
    ngram_freq[ngram] += 1
ngram_freq

FreqDist({('<s>', 'waterloo'): 1,
          ('catching', 'train'): 1,
          ('fortunate', 'catching'): 1,
          ('leatherhead', '</s>'): 1,
          ('many', 'train'): 1,
          ('train', 'fortunate'): 1,
          ('train', 'leatherhead'): 1,
          ('waterloo', 'many'): 1})

In [106]:
ngram_freq.most_common(4)

[(('<s>', 'waterloo'), 1),
 (('waterloo', 'many'), 1),
 (('many', 'train'), 1),
 (('train', 'fortunate'), 1)]

In [111]:
from nltk.tokenize import sent_tokenize, word_tokenize
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing \
today? The weather is great, and Python is awesome. The sky \
is pinkish-blue. You shouldn't eat cardboard."
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [113]:
from nltk.corpus import stopwords
print(set(stopwords.words('english')), end="")

{'she', "weren't", 'for', 'more', "you've", 'or', "shan't", 'herself', 'a', 'these', 'out', 'other', 'our', 'under', 'to', 'he', 'his', 'off', 'm', 'those', 'there', 'me', 'above', 'but', "aren't", "haven't", 'having', 'how', "didn't", 'its', 'my', "you'll", 'where', 'myself', 'you', 'hasn', 'doesn', 'd', 'if', 've', 'have', 'is', 'themselves', 'by', 'hadn', 'now', 'will', 'that', 'not', 'won', 'in', 'such', 'why', 'were', 'through', 'any', "hasn't", 'during', 'ourselves', 'your', 'they', 'than', "that'll", 'about', 'and', 'shouldn', 'too', 'just', 'this', 'was', 'their', 'into', 'didn', "it's", 'theirs', 'i', 'what', 'all', 'had', 'as', 'needn', 'the', 'before', 'can', 'isn', 'same', 'so', "don't", 'wasn', 'o', 'below', 'it', 'when', "hadn't", 'while', 'down', 'from', 'own', "should've", 'after', 'nor', "wouldn't", 'did', 'here', "mightn't", 'mustn', 'should', 'being', 'whom', 'because', "you'd", 'am', 'been', "isn't", 'himself', 'weren', 'an', 'most', 'no', "won't", "couldn't", 'whic

In [178]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem("studying"))
print(lemmatizer.lemmatize("studying"))
print(lemmatizer.lemmatize("studying", pos="v"))
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(stemmer.stem("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

studi
studying
study
cat
cactus
cacti
goose
rock
python
good
best
run
run


In [125]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
s = "This is a simple sentence"
tokens = word_tokenize(s) 
tokens_pos = pos_tag(tokens)  
print(tokens_pos)

[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('simple', 'JJ'), ('sentence', 'NN')]


In [147]:
import nltk
from nltk.corpus import state_union
sample_text = "Narendra Modi met Rahul Gandhi in the Cubbon Park. Mark Douglas sat on the chair."
tokenized = sent_tokenize(sample_text)
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
    except Exception as e:
        print(str(e))

process_content()

(S
  (Chunk Narendra/NNP Modi/NNP)
  (Chunk met/VBD Rahul/NNP Gandhi/NNP)
  in/IN
  the/DT
  (Chunk Cubbon/NNP Park/NNP)
  ./.)
(Chunk Narendra/NNP Modi/NNP)
(Chunk met/VBD Rahul/NNP Gandhi/NNP)
(Chunk Cubbon/NNP Park/NNP)
(S (Chunk Mark/NNP Douglas/NNP) sat/VBD on/IN the/DT chair/NN ./.)
(Chunk Mark/NNP Douglas/NNP)


In [173]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
sample_text = "Narendra Modi met Barack Obama on 15th August 2017.\
 Mark Douglas works at Microsoft."
tokenized = sent_tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw()
            print(namedEnt)
    except Exception as e:
        print(str(e))


process_content()

(S
  (PERSON Narendra/NNP)
  (PERSON Modi/NNP)
  met/VBD
  (PERSON Barack/NNP Obama/NNP)
  on/IN
  15th/CD
  August/NNP
  2017/CD
  ./.)
(S
  (PERSON Mark/NNP)
  (PERSON Douglas/NNP)
  works/VBZ
  at/IN
  (ORGANIZATION Microsoft/NNP)
  ./.)


In [182]:
from nltk.corpus import wordnet
syns = wordnet.synsets("program")
print(syns[0].name())
print(syns[0].lemmas()[0].name())
print(syns[0].definition())
print(syns[0].examples())

plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [184]:
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))

{'full', 'serious', 'near', 'honest', 'estimable', 'skilful', 'secure', 'salutary', 'safe', 'respectable', 'beneficial', 'soundly', 'expert', 'right', 'unspoiled', 'adept', 'well', 'dependable', 'undecomposed', 'skillful', 'dear', 'goodness', 'practiced', 'in_effect', 'good', 'just', 'commodity', 'upright', 'sound', 'proficient', 'in_force', 'unspoilt', 'thoroughly', 'trade_good', 'honorable', 'effective', 'ripe'}
{'evil', 'bad', 'ill', 'evilness', 'badness'}


In [185]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [186]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [187]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


In [190]:
from pywsd.lesk import simple_lesk
sent = 'I went to the bank to deposit my money'
ambiguous = 'bank'
answer = simple_lesk(sent, ambiguous, pos='n')
print(answer)
print(answer.definition())

Synset('depository_financial_institution.n.01')
a financial institution that accepts deposits and channels the money into lending activities


In [191]:
sent = 'The cow was drinking some water from the bank.'
ambiguous = 'bank'
answer = simple_lesk(sent, ambiguous, pos='n')
print(answer)
print(answer.definition())

Synset('bank.n.01')
sloping land (especially the slope beside a body of water)


In [193]:
from pywsd import disambiguate
from pywsd.similarity import max_similarity as maxsim
disambiguate('I went to the bank to deposit my money')

[('I', None),
 ('went', Synset('run_low.v.01')),
 ('to', None),
 ('the', None),
 ('bank', Synset('depository_financial_institution.n.01')),
 ('to', None),
 ('deposit', Synset('deposit.v.02')),
 ('my', None),
 ('money', Synset('money.n.03'))]