### Tokenizing words and sentences

In [1]:
import nltk

In [2]:
paragraph = """Thank you all so very much. Thank you to the Academy. 
               Thank you to all of you in this room. I have to congratulate 
               the other incredible nominees this year. The Revenant was 
               the product of the tireless efforts of an unbelievable cast
               and crew. First off, to my brother in this endeavor, Mr. Tom 
               Hardy. Tom, your talent on screen can only be surpassed by 
               your friendship off screen … thank you for creating a t
               ranscendent cinematic experience. Thank you to everybody at 
               Fox and New Regency … my entire team. I have to thank 
               everyone from the very onset of my career … To my parents; 
               none of this would be possible without you. And to my 
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the 
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and 
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and 
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this 
               amazing award tonight. Let us not take this planet for 
               granted. I do not take tonight for granted. Thank you so very much."""

In [7]:
sentences = nltk.sent_tokenize(paragraph)
print("total sentences :",len(sentences))
sentences

total sentences : 21


['Thank you all so very much.',
 'Thank you to the Academy.',
 'Thank you to all of you in this room.',
 'I have to congratulate \n               the other incredible nominees this year.',
 'The Revenant was \n               the product of the tireless efforts of an unbelievable cast\n               and crew.',
 'First off, to my brother in this endeavor, Mr. Tom \n               Hardy.',
 'Tom, your talent on screen can only be surpassed by \n               your friendship off screen … thank you for creating a t\n               ranscendent cinematic experience.',
 'Thank you to everybody at \n               Fox and New Regency … my entire team.',
 'I have to thank \n               everyone from the very onset of my career … To my parents; \n               none of this would be possible without you.',
 'And to my \n               friends, I love you dearly; you know who you are.',
 "And lastly,\n               I just want to say this: Making The Revenant was about\n               man's

In [8]:
words = nltk.word_tokenize(paragraph)
print("Total words :",len(words))
words

Total words : 347


['Thank',
 'you',
 'all',
 'so',
 'very',
 'much',
 '.',
 'Thank',
 'you',
 'to',
 'the',
 'Academy',
 '.',
 'Thank',
 'you',
 'to',
 'all',
 'of',
 'you',
 'in',
 'this',
 'room',
 '.',
 'I',
 'have',
 'to',
 'congratulate',
 'the',
 'other',
 'incredible',
 'nominees',
 'this',
 'year',
 '.',
 'The',
 'Revenant',
 'was',
 'the',
 'product',
 'of',
 'the',
 'tireless',
 'efforts',
 'of',
 'an',
 'unbelievable',
 'cast',
 'and',
 'crew',
 '.',
 'First',
 'off',
 ',',
 'to',
 'my',
 'brother',
 'in',
 'this',
 'endeavor',
 ',',
 'Mr.',
 'Tom',
 'Hardy',
 '.',
 'Tom',
 ',',
 'your',
 'talent',
 'on',
 'screen',
 'can',
 'only',
 'be',
 'surpassed',
 'by',
 'your',
 'friendship',
 'off',
 'screen',
 '…',
 'thank',
 'you',
 'for',
 'creating',
 'a',
 't',
 'ranscendent',
 'cinematic',
 'experience',
 '.',
 'Thank',
 'you',
 'to',
 'everybody',
 'at',
 'Fox',
 'and',
 'New',
 'Regency',
 '…',
 'my',
 'entire',
 'team',
 '.',
 'I',
 'have',
 'to',
 'thank',
 'everyone',
 'from',
 'the',
 've

### Stemming and Lemmatization

In [9]:
#Stemming : process of reducing infected/derived words to their word stem, base or root form.
#Lemmatization : same as stemming but intermediate representation/root form has a meaning.

#### stemming

In [11]:
from nltk.stem import PorterStemmer

sentences = nltk.sent_tokenize(paragraph)
stemmer = PorterStemmer()

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    newwords = [stemmer.stem(word) for word in words]
    sentences[i] = ' '.join(newwords)

sentences #useful in spam classifications

['thank you all so veri much .',
 'thank you to the academi .',
 'thank you to all of you in thi room .',
 'I have to congratul the other incred nomine thi year .',
 'the reven wa the product of the tireless effort of an unbeliev cast and crew .',
 'first off , to my brother in thi endeavor , mr. tom hardi .',
 'tom , your talent on screen can onli be surpass by your friendship off screen … thank you for creat a t ranscend cinemat experi .',
 'thank you to everybodi at fox and new regenc … my entir team .',
 'I have to thank everyon from the veri onset of my career … To my parent ; none of thi would be possibl without you .',
 'and to my friend , I love you dearli ; you know who you are .',
 "and lastli , I just want to say thi : make the reven wa about man 's relationship to the natur world .",
 'A world that we collect felt in 2015 as the hottest year in record histori .',
 'our product need to move to the southern tip of thi planet just to be abl to find snow .',
 'climat chang is r

#### lemmatization

In [26]:
from nltk.stem import WordNetLemmatizer

sentences = nltk.sent_tokenize(paragraph)
lemmatizer = WordNetLemmatizer()

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    newwords = [lemmatizer.lemmatize(word) for word in words]
    sentences[i] = ' '.join(newwords)

sentences #uesful in chatbots

['Thank you all so very much .',
 'Thank you to the Academy .',
 'Thank you to all of you in this room .',
 'I have to congratulate the other incredible nominee this year .',
 'The Revenant wa the product of the tireless effort of an unbelievable cast and crew .',
 'First off , to my brother in this endeavor , Mr. Tom Hardy .',
 'Tom , your talent on screen can only be surpassed by your friendship off screen … thank you for creating a t ranscendent cinematic experience .',
 'Thank you to everybody at Fox and New Regency … my entire team .',
 'I have to thank everyone from the very onset of my career … To my parent ; none of this would be possible without you .',
 'And to my friend , I love you dearly ; you know who you are .',
 "And lastly , I just want to say this : Making The Revenant wa about man 's relationship to the natural world .",
 'A world that we collectively felt in 2015 a the hottest year in recorded history .',
 'Our production needed to move to the southern tip of this p

### Stop word removal

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Shashank
[nltk_data]     Prakash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
from nltk.corpus import stopwords

sentences = nltk.sent_tokenize(paragraph)

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    newwords = [word for word in words if word not in stopwords.words('english')]
    sentences[i] = ' '.join(newwords)

sentences #useful mainly in text classifications, sentiment analysis

['Thank much .',
 'Thank Academy .',
 'Thank room .',
 'I congratulate incredible nominees year .',
 'The Revenant product tireless efforts unbelievable cast crew .',
 'First , brother endeavor , Mr. Tom Hardy .',
 'Tom , talent screen surpassed friendship screen … thank creating ranscendent cinematic experience .',
 'Thank everybody Fox New Regency … entire team .',
 'I thank everyone onset career … To parents ; none would possible without .',
 'And friends , I love dearly ; know .',
 "And lastly , I want say : Making The Revenant man 's relationship natural world .",
 'A world collectively felt 2015 hottest year recorded history .',
 'Our production needed move southern tip planet able find snow .',
 'Climate change real , happening right .',
 'It urgent threat facing entire species , need work collectively together stop procrastinating .',
 'We need support leaders around world speak big polluters , speak humanity , indigenous people world , billions billions underprivileged people 

### Parts of speech Tagging

In [31]:
words = nltk.word_tokenize(paragraph)
tagged_words= nltk.pos_tag(words)
tagged_words

[('Thank', 'NNP'),
 ('you', 'PRP'),
 ('all', 'DT'),
 ('so', 'RB'),
 ('very', 'RB'),
 ('much', 'JJ'),
 ('.', '.'),
 ('Thank', 'VB'),
 ('you', 'PRP'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('Academy', 'NNP'),
 ('.', '.'),
 ('Thank', 'NNP'),
 ('you', 'PRP'),
 ('to', 'TO'),
 ('all', 'DT'),
 ('of', 'IN'),
 ('you', 'PRP'),
 ('in', 'IN'),
 ('this', 'DT'),
 ('room', 'NN'),
 ('.', '.'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('to', 'TO'),
 ('congratulate', 'VB'),
 ('the', 'DT'),
 ('other', 'JJ'),
 ('incredible', 'JJ'),
 ('nominees', 'NNS'),
 ('this', 'DT'),
 ('year', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('Revenant', 'NNP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('product', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('tireless', 'NN'),
 ('efforts', 'NNS'),
 ('of', 'IN'),
 ('an', 'DT'),
 ('unbelievable', 'JJ'),
 ('cast', 'NN'),
 ('and', 'CC'),
 ('crew', 'NN'),
 ('.', '.'),
 ('First', 'NNP'),
 ('off', 'RB'),
 (',', ','),
 ('to', 'TO'),
 ('my', 'PRP$'),
 ('brother', 'NN'),
 ('in', 'IN'),
 ('this', 'DT'),
 ('endeavor'

In [32]:
word_tags = []
for tw in tagged_words:
    word_tags.append(tw[0]+"_"+tw[1])

tagged_paragraph=' '.join(word_tags)
tagged_paragraph

"Thank_NNP you_PRP all_DT so_RB very_RB much_JJ ._. Thank_VB you_PRP to_TO the_DT Academy_NNP ._. Thank_NNP you_PRP to_TO all_DT of_IN you_PRP in_IN this_DT room_NN ._. I_PRP have_VBP to_TO congratulate_VB the_DT other_JJ incredible_JJ nominees_NNS this_DT year_NN ._. The_DT Revenant_NNP was_VBD the_DT product_NN of_IN the_DT tireless_NN efforts_NNS of_IN an_DT unbelievable_JJ cast_NN and_CC crew_NN ._. First_NNP off_RB ,_, to_TO my_PRP$ brother_NN in_IN this_DT endeavor_NN ,_, Mr._NNP Tom_NNP Hardy_NNP ._. Tom_NNP ,_, your_PRP$ talent_NN on_IN screen_NN can_MD only_RB be_VB surpassed_VBN by_IN your_PRP$ friendship_NN off_IN screen_JJ …_NNP thank_NN you_PRP for_IN creating_VBG a_DT t_JJ ranscendent_NN cinematic_JJ experience_NN ._. Thank_NNP you_PRP to_TO everybody_VB at_IN Fox_NNP and_CC New_NNP Regency_NNP …_NNP my_PRP$ entire_JJ team_NN ._. I_PRP have_VBP to_TO thank_VB everyone_NN from_IN the_DT very_RB onset_NN of_IN my_PRP$ career_NN …_NN To_TO my_PRP$ parents_NNS ;_: none_NN of_

### Named entry recognition

In [8]:
"""
Some examples of Named entities :
ORGANIZATION	Georgia-Pacific Corp., WHO
PERSON	Eddy Bonte, President Obama
LOCATION	Murray River, Mount Everest
DATE	June, 2008-06-29
TIME	two fifty a m, 1:30 p.m.
MONEY	175 million Canadian Dollars, GBP 10.40
PERCENT	twenty pct, 18.75 %
FACILITY	Washington Monument, Stonehenge
GPE	South East Asia, Midlothian
"""

paragraph2 = "The Taj Mahal was built by Emperor Shah Jahan"

words = nltk.word_tokenize(paragraph2)
tagged_words = nltk.pos_tag(words)
namedEnt = nltk.ne_chunk(tagged_words)
namedEnt.draw() #draws tree in a new window

### Bag of Words Model

In [3]:
import re
import heapq
import numpy as np

In [9]:
#Preprocessing
dataset = nltk.sent_tokenize(paragraph)

for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r"\W",' ',dataset[i])
    dataset[i] = re.sub(r"\s+",' ',dataset[i])

#Creating the histogram
word2count={}
for data in dataset:
    words=nltk.word_tokenize(data)
    for word in words:
        word2count[word] = word2count.get(word,0) + 1

print("Total number of words :",len(word2count))
word2count

Total number of words : 157


{'thank': 8,
 'you': 12,
 'all': 4,
 'so': 2,
 'very': 3,
 'much': 2,
 'to': 16,
 'the': 17,
 'academy': 1,
 'of': 10,
 'in': 4,
 'this': 9,
 'room': 1,
 'i': 6,
 'have': 3,
 'congratulate': 1,
 'other': 1,
 'incredible': 1,
 'nominees': 1,
 'year': 2,
 'revenant': 2,
 'was': 2,
 'product': 1,
 'tireless': 1,
 'efforts': 1,
 'an': 1,
 'unbelievable': 1,
 'cast': 1,
 'and': 8,
 'crew': 1,
 'first': 1,
 'off': 2,
 'my': 5,
 'brother': 1,
 'endeavor': 1,
 'mr': 1,
 'tom': 2,
 'hardy': 1,
 'your': 2,
 'talent': 1,
 'on': 1,
 'screen': 2,
 'can': 1,
 'only': 1,
 'be': 4,
 'surpassed': 1,
 'by': 3,
 'friendship': 1,
 'for': 10,
 'creating': 1,
 'a': 2,
 't': 1,
 'ranscendent': 1,
 'cinematic': 1,
 'experience': 1,
 'everybody': 1,
 'at': 1,
 'fox': 1,
 'new': 1,
 'regency': 1,
 'entire': 2,
 'team': 1,
 'everyone': 1,
 'from': 1,
 'onset': 1,
 'career': 1,
 'parents': 1,
 'none': 1,
 'would': 2,
 'possible': 1,
 'without': 1,
 'friends': 1,
 'love': 1,
 'dearly': 1,
 'know': 1,
 'who': 4,
 '

In [13]:
freq_words = heapq.nlargest(100,word2count,word2count.get) #100 most frequent words

x=[]
for data in dataset:
    vector=[]
    for word in freq_words:
        if word in nltk.word_tokenize(data):
            vector.append(1)
        else:
            vector.append(0)
        x.append(vector)

x=np.asarray(x)
x #bag of words model - 21 documents(sentences in our small example),100 columns

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

### TF-IDF Model

In [14]:
#Unlike bag of words model, some semantic information is preserved
#as uncommon words are given more importance than common words.

#TF=Term Frequency  IDF=Inverse Document Frequency 
#TF=(# of occurrences of a word in a doc.)/(total # of words in that doc.)
#IDF=ln((# of doc. in whole corpus)/(# of doc. containing word))
#TFIDF(word)=TF(doc.,word)*IDF(word)

#useful in text classification, opinion mining

In [15]:
#IDF matrix/dictionary
word_idfs={}
for word in freq_words:
    doc_count=0
    for data in dataset:
        if word in nltk.word_tokenize(data):
            doc_count+=1
    word_idfs[word] = np.log((len(dataset)/doc_count)+1) #1 is the bias(a standard followed in most libraries)
word_idfs

{'the': 1.1314021114911006,
 'to': 1.067840630001356,
 'you': 1.2039728043259361,
 'of': 1.5040773967762742,
 'for': 1.5040773967762742,
 'this': 1.2039728043259361,
 'thank': 1.2878542883066382,
 'and': 1.3862943611198906,
 'i': 1.5040773967762742,
 'my': 1.8325814637483102,
 'all': 1.8325814637483102,
 'in': 2.0794415416798357,
 'be': 1.8325814637483102,
 'who': 2.4423470353692043,
 'world': 2.0794415416798357,
 'very': 2.0794415416798357,
 'have': 2.0794415416798357,
 'by': 2.0794415416798357,
 'we': 2.0794415416798357,
 'our': 2.0794415416798357,
 'is': 2.4423470353692043,
 'not': 2.0794415416798357,
 'people': 2.4423470353692043,
 'out': 2.4423470353692043,
 'so': 2.4423470353692043,
 'much': 2.4423470353692043,
 'year': 2.4423470353692043,
 'revenant': 2.4423470353692043,
 'was': 2.4423470353692043,
 'off': 2.4423470353692043,
 'tom': 2.4423470353692043,
 'your': 3.091042453358316,
 'screen': 3.091042453358316,
 'a': 2.4423470353692043,
 'entire': 2.4423470353692043,
 'would': 2.

In [17]:
#TF matrix/dictionary
tf_matrix={}
for word in freq_words:
    doc_tf=[]
    for data in dataset:
        freq=0
        for w in nltk.word_tokenize(data):
            if w==word:
                freq+=1
        tf_word = freq/len(nltk.word_tokenize(data))
        doc_tf.append(tf_word)
    tf_matrix[word] = doc_tf
tf_matrix

{'the': [0.0,
  0.2,
  0.0,
  0.1,
  0.2,
  0.0,
  0.0,
  0.0,
  0.043478260869565216,
  0.0,
  0.1,
  0.06666666666666667,
  0.05263157894736842,
  0.0,
  0.05,
  0.10638297872340426,
  0.045454545454545456,
  0.0,
  0.0,
  0.0,
  0.0],
 'to': [0.0,
  0.2,
  0.1111111111111111,
  0.1,
  0.0,
  0.09090909090909091,
  0.0,
  0.08333333333333333,
  0.08695652173913043,
  0.07692307692307693,
  0.1,
  0.0,
  0.21052631578947367,
  0.0,
  0.05,
  0.02127659574468085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'you': [0.16666666666666666,
  0.2,
  0.2222222222222222,
  0.0,
  0.0,
  0.0,
  0.043478260869565216,
  0.08333333333333333,
  0.043478260869565216,
  0.23076923076923078,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.1111111111111111,
  0.0,
  0.0,
  0.2],
 'of': [0.0,
  0.0,
  0.1111111111111111,
  0.0,
  0.13333333333333333,
  0.0,
  0.0,
  0.0,
  0.08695652173913043,
  0.0,
  0.0,
  0.0,
  0.05263157894736842,
  0.0,
  0.0,
  0.06382978723404255,
  0.045454545454545456,
  0.0,
 

In [18]:
#TF-IDF Calculation
tfidf_matrix=[]
for word in tf_matrix.keys():
    tfidf=[]
    for value in tf_matrix[word]:
        score = value*word_idfs[word]
        tfidf.append(score)
    tfidf_matrix.append(tfidf)
tfidf_matrix

[[0.0,
  0.22628042229822012,
  0.0,
  0.11314021114911006,
  0.22628042229822012,
  0.0,
  0.0,
  0.0,
  0.049191396151786984,
  0.0,
  0.11314021114911006,
  0.07542680743274004,
  0.059547479552163184,
  0.0,
  0.05657010557455503,
  0.1203619267543724,
  0.051427368704140934,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.21356812600027122,
  0.11864895888903956,
  0.10678406300013561,
  0.0,
  0.09707642090921419,
  0.0,
  0.08898671916677967,
  0.09285570695663965,
  0.08214158692318124,
  0.10678406300013561,
  0.0,
  0.224808553684496,
  0.0,
  0.053392031500067806,
  0.022720013404284173,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.20066213405432268,
  0.24079456086518725,
  0.2675495120724302,
  0.0,
  0.0,
  0.0,
  0.05234664366634505,
  0.10033106702716134,
  0.05234664366634505,
  0.2778398779213699,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.1337747560362151,
  0.0,
  0.0,
  0.24079456086518725],
 [0.0,
  0.0,
  0.16711971075291934,
  0.0,
  0.2005436529035032,
  0.0,
  0.

In [20]:
X = np.asarray(tfidf_matrix)
X = np.transpose(X)
X #TFIDF model(an extension of binary bag of words model) - 21rows, 100columns

array([[0.        , 0.22628042, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.21356813, 0.11864896, ..., 0.        , 0.        ,
        0.        ],
       [0.20066213, 0.24079456, 0.26754951, ..., 0.        , 0.        ,
        0.24079456],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### N-Gram Model

#### Character N-Gram model

In [3]:
import random

In [5]:
#sample data
text = """Global warming or climate change has become a worldwide concern. It is gradually developing into an unprecedented environmental crisis evident in melting glaciers, changing weather patterns, rising sea levels, floods, cyclones and droughts. Global warming implies an increase in the average temperature of the Earth due to entrapment of greenhouse gases in the earth’s atmosphere."""

In [7]:
n=3 #so it will be trigrams
ngrams={}

#create the n-grams
for i in range(len(text)-n):
    gram = text[i:i+n]
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(text[i+n])
ngrams

{'Glo': ['b', 'b'],
 'lob': ['a', 'a'],
 'oba': ['l', 'l'],
 'bal': [' ', ' '],
 'al ': ['w', 'c', 'w'],
 'l w': ['a', 'a'],
 ' wa': ['r', 'r'],
 'war': ['m', 'm'],
 'arm': ['i', 'i'],
 'rmi': ['n', 'n'],
 'min': ['g', 'g'],
 'ing': [' ', ' ', ' ', ' ', ' ', ' '],
 'ng ': ['o', 'i', 'g', 'w', 's', 'i'],
 'g o': ['r'],
 ' or': [' '],
 'or ': ['c'],
 'r c': ['l'],
 ' cl': ['i'],
 'cli': ['m'],
 'lim': ['a'],
 'ima': ['t'],
 'mat': ['e'],
 'ate': [' '],
 'te ': ['c'],
 'e c': ['h', 'o'],
 ' ch': ['a', 'a'],
 'cha': ['n', 'n'],
 'han': ['g', 'g'],
 'ang': ['e', 'i'],
 'nge': [' '],
 'ge ': ['h', 't'],
 'e h': ['a'],
 ' ha': ['s'],
 'has': [' '],
 'as ': ['b'],
 's b': ['e'],
 ' be': ['c'],
 'bec': ['o'],
 'eco': ['m'],
 'com': ['e'],
 'ome': [' '],
 'me ': ['a'],
 'e a': [' ', 'v'],
 ' a ': ['w'],
 'a w': ['o'],
 ' wo': ['r'],
 'wor': ['l'],
 'orl': ['d'],
 'rld': ['w'],
 'ldw': ['i'],
 'dwi': ['d'],
 'wid': ['e'],
 'ide': [' ', 'n'],
 'de ': ['c'],
 ' co': ['n'],
 'con': ['c'],
 'onc': ['

In [8]:
#Testing our N-Gram model(autocomlete application)
currentGram = text[0:n] #we are setting first 3 grams as initial input
result = currentGram
for i in range(100):
    if currentGram not in ngrams.keys():
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))] #so next item will be a random item from possibilities
    result += nextItem
    currentGram = result[len(result)-n:len(result)]
print(result) #clearly its returning gibberish

Global crising in the Earth due temperage has become a worldwide concern. It is evidentrapment is evide


In [11]:
#same code as above with n=6 gives a much better prediction.

n=6
ngrams={}

#create the n-grams
for i in range(len(text)-n):
    gram = text[i:i+n]
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(text[i+n])

#Testing our N-Gram model
currentGram = text[0:n] #we are setting first 3 grams as initial input
result = currentGram
for i in range(100):
    if currentGram not in ngrams.keys():
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))] #so next item will be a random item from possibilities
    result += nextItem
    currentGram = result[len(result)-n:len(result)]
print(result) #although, due to randrange, run this cell again and it might go gibberish

Global warming implies an increase in the earth’s atmosphere.


#### Word N-Gram model

In [12]:
n=3
ngrams={}

words = nltk.word_tokenize(text)
for i in range(len(words)-n):
    gram = ' '.join(words[i:i+n])
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(words[i+n])
ngrams

{'Global warming or': ['climate'],
 'warming or climate': ['change'],
 'or climate change': ['has'],
 'climate change has': ['become'],
 'change has become': ['a'],
 'has become a': ['worldwide'],
 'become a worldwide': ['concern'],
 'a worldwide concern': ['.'],
 'worldwide concern .': ['It'],
 'concern . It': ['is'],
 '. It is': ['gradually'],
 'It is gradually': ['developing'],
 'is gradually developing': ['into'],
 'gradually developing into': ['an'],
 'developing into an': ['unprecedented'],
 'into an unprecedented': ['environmental'],
 'an unprecedented environmental': ['crisis'],
 'unprecedented environmental crisis': ['evident'],
 'environmental crisis evident': ['in'],
 'crisis evident in': ['melting'],
 'evident in melting': ['glaciers'],
 'in melting glaciers': [','],
 'melting glaciers ,': ['changing'],
 'glaciers , changing': ['weather'],
 ', changing weather': ['patterns'],
 'changing weather patterns': [','],
 'weather patterns ,': ['rising'],
 'patterns , rising': ['sea

In [13]:
currentGram = ' '.join(words[0:n])
result = currentGram
for i in range(30):
    if currentGram not in ngrams.keys():
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))]
    result += ' '+nextItem
    rwords = nltk.word_tokenize(result)
    currentGram = ' '.join(rwords[len(rwords)-n:len(rwords)])
result #Perfectly working, but only because we have a small single sentence

'Global warming or climate change has become a worldwide concern . It is gradually developing into an unprecedented environmental crisis evident in melting glaciers , changing weather patterns , rising sea levels ,'

### Latent Semantic Analysis

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
dataset = ["The amount of polution is increasing day by day",
           "The concert was just great",
           "I love to see Gordon Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaigns to stop pollution and global warming"]

In [5]:
dataset = [line.lower() for line in dataset]
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [6]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset) #TF-IDF model
print(X[0]) #check the tfidf values for document 0

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


In [7]:
lsa = TruncatedSVD(n_components=4, n_iter=100) #so,it will search for existence of 4 different concepts
lsa.fit(X) #SVD done; creates the Vtranspose matrix of SVD(U*S*Vtr.) for tfidf matrix
row1 = lsa.components_[0] #first concept/row,all the words; there will be total of 4 such components
row1

array([ 1.24191973e-01,  1.78240252e-01,  1.14460798e-01, -1.34360133e-16,
        1.24191973e-01,  1.14460798e-01, -1.34360133e-16,  3.44988739e-01,
       -2.32221620e-16,  2.28921595e-01,  1.24191973e-01, -1.34360133e-16,
        9.72770950e-02, -2.32221620e-16,  3.00124026e-01, -1.34360133e-16,
        1.78240252e-01,  1.14460798e-01,  9.72770950e-02,  1.75760635e-01,
        2.37365829e-01, -1.34360133e-16, -2.32221620e-16,  9.72770950e-02,
        2.95798061e-01, -1.34360133e-16,  1.14460798e-01,  1.24191973e-01,
       -2.32221620e-16,  1.24191973e-01, -2.32221620e-16,  1.78240252e-01,
       -1.34360133e-16,  1.83838346e-01,  3.76098295e-01, -3.04294337e-16,
        1.24191973e-01,  1.78240252e-01, -1.34360133e-16,  2.37365829e-01,
       -1.34360133e-16,  1.78240252e-01])

In [8]:
terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x:x[1],reverse=True)
    sortedTerms = sortedTerms[:10] #10 most important terms in every concept, sorted by concept value of words
    print("\nConcept",i,":")
    for term in sortedTerms:
        print(term)


Concept 0 :
('the', 0.37609829529263766)
('concert', 0.34498873923306633)
('great', 0.300124025894874)
('of', 0.29579806095266653)
('just', 0.2373658292979126)
('was', 0.2373658292979126)
('day', 0.228921595415045)
('technology', 0.18383834567413418)
('all', 0.17824025175628966)
('in', 0.17824025175628966)

Concept 1 :
('to', 0.41578844396700665)
('cook', 0.2835916579351063)
('gordon', 0.2835916579351063)
('love', 0.2835916579351063)
('ramsay', 0.2835916579351063)
('see', 0.2835916579351063)
('and', 0.2173064471129254)
('campaigns', 0.2173064471129254)
('global', 0.2173064471129254)
('have', 0.2173064471129254)

Concept 2 :
('technology', 0.3779180676714406)
('is', 0.3419614380631984)
('google', 0.3413969441909748)
('introducing', 0.3413969441909748)
('new', 0.3413969441909748)
('day', 0.14112432680994666)
('are', 0.11387892195373071)
('examples', 0.11387892195373071)
('present', 0.11387892195373071)
('robots', 0.11387892195373071)

Concept 3 :
('day', 0.46542676790411297)
('amount', 

In [9]:
# Now, we want to classify all sentences in diff concepts
# same code as above, we are just storing the concept words now.
concept_words = {}

terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x:x[1],reverse=True)
    sortedTerms = sortedTerms[:10] #10 most important terms in every concept, sorted by concept value of words
    concept_words["Concept"+str(i)] = sortedTerms
concept_words
# The four concepts seem to be music, food, tech and perhaps pollution/weather.

{'Concept0': [('the', 0.37609829529263766),
  ('concert', 0.34498873923306633),
  ('great', 0.300124025894874),
  ('of', 0.29579806095266653),
  ('just', 0.2373658292979126),
  ('was', 0.2373658292979126),
  ('day', 0.228921595415045),
  ('technology', 0.18383834567413418),
  ('all', 0.17824025175628966),
  ('in', 0.17824025175628966)],
 'Concept1': [('to', 0.41578844396700665),
  ('cook', 0.2835916579351063),
  ('gordon', 0.2835916579351063),
  ('love', 0.2835916579351063),
  ('ramsay', 0.2835916579351063),
  ('see', 0.2835916579351063),
  ('and', 0.2173064471129254),
  ('campaigns', 0.2173064471129254),
  ('global', 0.2173064471129254),
  ('have', 0.2173064471129254)],
 'Concept2': [('technology', 0.3779180676714406),
  ('is', 0.3419614380631984),
  ('google', 0.3413969441909748),
  ('introducing', 0.3413969441909748),
  ('new', 0.3413969441909748),
  ('day', 0.14112432680994666),
  ('are', 0.11387892195373071),
  ('examples', 0.11387892195373071),
  ('present', 0.11387892195373071),

In [11]:
#checking concept values for each sentences
for key in concept_words.keys():
    sentence_scores = []
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0]:
                    score += word_with_score[1]
        sentence_scores.append(score)
    print("\n" + key + ":")
    for sentence_score in sentence_scores:
        print(sentence_score)


Concept0:
1.1297395470753941
1.4959427190164032
0
0.18383834567413418
0.7797604325216747
1.37336559899095
0

Concept1:
0
0
1.8337467336425384
0
0
0
1.2850142324187082

Concept2:
0.6242100916830917
0
0
1.7440703383075635
0.8334337554863634
0
0

Concept3:
2.2015937554478953
0.1272421318069429
0
0.21264455202450072
0
0.29658207438873596
0


### Synonyms and Antonyms (wordnet)

In [12]:
from nltk.corpus import wordnet

In [32]:
synonyms = []
antonyms = []

wordnet.synsets("good")

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [29]:
for syn in wordnet.synsets("good"):
    for s in syn.lemmas():
        synonyms.append(s.name())
synonyms

['good',
 'good',
 'goodness',
 'good',
 'goodness',
 'commodity',
 'trade_good',
 'good',
 'good',
 'full',
 'good',
 'good',
 'estimable',
 'good',
 'honorable',
 'respectable',
 'beneficial',
 'good',
 'good',
 'good',
 'just',
 'upright',
 'adept',
 'expert',
 'good',
 'practiced',
 'proficient',
 'skillful',
 'skilful',
 'good',
 'dear',
 'good',
 'near',
 'dependable',
 'good',
 'safe',
 'secure',
 'good',
 'right',
 'ripe',
 'good',
 'well',
 'effective',
 'good',
 'in_effect',
 'in_force',
 'good',
 'good',
 'serious',
 'good',
 'sound',
 'good',
 'salutary',
 'good',
 'honest',
 'good',
 'undecomposed',
 'unspoiled',
 'unspoilt',
 'good',
 'well',
 'good',
 'thoroughly',
 'soundly',
 'good']

In [30]:
print(set(synonyms))

{'effective', 'soundly', 'expert', 'skilful', 'near', 'respectable', 'well', 'in_effect', 'salutary', 'skillful', 'dependable', 'dear', 'commodity', 'upright', 'honest', 'sound', 'unspoiled', 'goodness', 'undecomposed', 'full', 'just', 'estimable', 'thoroughly', 'secure', 'unspoilt', 'serious', 'ripe', 'proficient', 'honorable', 'safe', 'good', 'right', 'adept', 'practiced', 'beneficial', 'trade_good', 'in_force'}


In [33]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for s in syn.lemmas():
        synonyms.append(s.name())
        for a in s.antonyms():
            antonyms.append(a.name())
print("Synonyms :",set(synonyms))
print("Antonyms :",set(antonyms))

Synonyms : {'effective', 'soundly', 'expert', 'skilful', 'near', 'respectable', 'well', 'in_effect', 'salutary', 'skillful', 'dependable', 'dear', 'commodity', 'upright', 'honest', 'sound', 'unspoiled', 'goodness', 'undecomposed', 'full', 'just', 'estimable', 'thoroughly', 'secure', 'unspoilt', 'serious', 'ripe', 'proficient', 'honorable', 'safe', 'good', 'right', 'adept', 'practiced', 'beneficial', 'trade_good', 'in_force'}
Antonyms : {'bad', 'ill', 'badness', 'evilness', 'evil'}


### Word negation tracking

In [38]:
sentence = "I was not happy with the team's performance"

In [36]:
words = nltk.word_tokenize(sentence)
new_words = []
temp_word=""
for word in words:
    if word == "not":
        temp_word = "not_"
    elif temp_word == "not_":
        word = temp_word + word
        temp_word = ""
    if word != "not":
        new_words.append(word)
sentence = ' '.join(new_words)
sentence

"I was not_happy with the team 's performance"

In [41]:
sentence = "I was not happy with the team's performance"
words = nltk.word_tokenize(sentence)
new_words = []
temp_word=""
for word in words:
    antonyms=[]
    if word == "not":
        temp_word = "not_"
    elif temp_word == "not_":
        for syn in wordnet.synsets(word):
            for s in syn.lemmas():
                for a in s.antonyms():
                    antonyms.append(a.name())
        if len(antonyms) >= 1:
            word = antonyms[0]
        else:
            word = temp_word +word
        temp_word = ""
    if word != "not":
        new_words.append(word)
sentence = ' '.join(new_words)
sentence

"I was unhappy with the team 's performance"