In [1]:
import nltk
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer

In [2]:
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

In [3]:
sentences = sent_tokenize(EXAMPLE_TEXT)

In [4]:
sentences

['Hello Mr. Smith, how are you doing today?',
 'The weather is great, and Python is awesome.',
 'The sky is pinkish-blue.',
 "You shouldn't eat cardboard."]

In [5]:
# POS tag list:

# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent\'s
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-abverb	where, when

In [6]:
tokenized = []

In [7]:
def token():
    try:
        for i in sentences:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            tokenized.append(tagged)

    except Exception as e:
        print(str(e))


token()

In [8]:
tokenized

[[('Hello', 'NNP'),
  ('Mr.', 'NNP'),
  ('Smith', 'NNP'),
  (',', ','),
  ('how', 'WRB'),
  ('are', 'VBP'),
  ('you', 'PRP'),
  ('doing', 'VBG'),
  ('today', 'NN'),
  ('?', '.')],
 [('The', 'DT'),
  ('weather', 'NN'),
  ('is', 'VBZ'),
  ('great', 'JJ'),
  (',', ','),
  ('and', 'CC'),
  ('Python', 'NNP'),
  ('is', 'VBZ'),
  ('awesome', 'JJ'),
  ('.', '.')],
 [('The', 'DT'),
  ('sky', 'NN'),
  ('is', 'VBZ'),
  ('pinkish-blue', 'JJ'),
  ('.', '.')],
 [('You', 'PRP'),
  ('should', 'MD'),
  ("n't", 'RB'),
  ('eat', 'VB'),
  ('cardboard', 'NN'),
  ('.', '.')]]

In [9]:
# chunking -- NOT USED --

for i in tokenized:
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(i)
    print(chunked)
    
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        print(subtree)
        
    chunked.draw()
    
## NOT USED 

(S
  (Chunk Hello/NNP Mr./NNP Smith/NNP)
  ,/,
  how/WRB
  are/VBP
  you/PRP
  doing/VBG
  today/NN
  ?/.)
(Chunk Hello/NNP Mr./NNP Smith/NNP)
(S
  The/DT
  weather/NN
  is/VBZ
  great/JJ
  ,/,
  and/CC
  (Chunk Python/NNP)
  is/VBZ
  awesome/JJ
  ./.)
(Chunk Python/NNP)
(S The/DT sky/NN is/VBZ pinkish-blue/JJ ./.)
(S You/PRP should/MD n't/RB eat/VB cardboard/NN ./.)


In [10]:
# Named Entity recognition 

for i in tokenized:
    namedEnt = nltk.ne_chunk(i, binary=True)
    namedEnt.draw()

In [11]:
# Lammatizing -- for finding root word of noun

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

for i in tokenized:
    for word in i:
        print(lemmatizer.lemmatize(word[0]))


Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
,
and
Python
is
awesome
.
The
sky
is
pinkish-blue
.
You
should
n't
eat
cardboard
.


In [12]:
from nltk.corpus import wordnet

In [13]:
# wordnet
syns = wordnet.synsets("program")
print(syns[2].name)
print(syns[0].lemmas()[0].name())

<bound method Synset.name of Synset('broadcast.n.02')>
plan


In [14]:
# wornet for synonyms

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for i in syn.lemmas():
        synonyms.append(i.name())
        if i.antonyms():
            antonyms.append(i.antonyms()[0].name())

In [15]:
while synonyms.count('good') > 0:
    synonyms.remove('good')

In [16]:
synonyms

['goodness',
 'goodness',
 'commodity',
 'trade_good',
 'full',
 'estimable',
 'honorable',
 'respectable',
 'beneficial',
 'just',
 'upright',
 'adept',
 'expert',
 'practiced',
 'proficient',
 'skillful',
 'skilful',
 'dear',
 'near',
 'dependable',
 'safe',
 'secure',
 'right',
 'ripe',
 'well',
 'effective',
 'in_effect',
 'in_force',
 'serious',
 'sound',
 'salutary',
 'honest',
 'undecomposed',
 'unspoiled',
 'unspoilt',
 'well',
 'thoroughly',
 'soundly']

In [17]:
# similarity using wordnet
w1 = wordnet.synset('ship.n.1')
w2 = wordnet.synset('boat.n.1')
print(w1.wup_similarity(w2))

0.9090909090909091


In [18]:
list_of_triple_strings = []
for sent in tokenized:  
    pos = [ i[1] for i in sent ]
    n = len(pos)
    for i in range(0,n-3):
        t = "-".join(pos[i:i+3]) # pull out 3 list item from counter, convert to string
        list_of_triple_strings.append(t)

    print("sequences of triples:", list_of_triple_strings)
    print()

sequences of triples: ['NNP-NNP-NNP', 'NNP-NNP-,', 'NNP-,-WRB', ',-WRB-VBP', 'WRB-VBP-PRP', 'VBP-PRP-VBG', 'PRP-VBG-NN']

sequences of triples: ['NNP-NNP-NNP', 'NNP-NNP-,', 'NNP-,-WRB', ',-WRB-VBP', 'WRB-VBP-PRP', 'VBP-PRP-VBG', 'PRP-VBG-NN', 'DT-NN-VBZ', 'NN-VBZ-JJ', 'VBZ-JJ-,', 'JJ-,-CC', ',-CC-NNP', 'CC-NNP-VBZ', 'NNP-VBZ-JJ']

sequences of triples: ['NNP-NNP-NNP', 'NNP-NNP-,', 'NNP-,-WRB', ',-WRB-VBP', 'WRB-VBP-PRP', 'VBP-PRP-VBG', 'PRP-VBG-NN', 'DT-NN-VBZ', 'NN-VBZ-JJ', 'VBZ-JJ-,', 'JJ-,-CC', ',-CC-NNP', 'CC-NNP-VBZ', 'NNP-VBZ-JJ', 'DT-NN-VBZ', 'NN-VBZ-JJ']

sequences of triples: ['NNP-NNP-NNP', 'NNP-NNP-,', 'NNP-,-WRB', ',-WRB-VBP', 'WRB-VBP-PRP', 'VBP-PRP-VBG', 'PRP-VBG-NN', 'DT-NN-VBZ', 'NN-VBZ-JJ', 'VBZ-JJ-,', 'JJ-,-CC', ',-CC-NNP', 'CC-NNP-VBZ', 'NNP-VBZ-JJ', 'DT-NN-VBZ', 'NN-VBZ-JJ', 'PRP-MD-RB', 'MD-RB-VB', 'RB-VB-NN']



In [19]:
# Its might be better to strip "," and other punctuation mark.. 

# list_of_triple_strings = []
# for sent in tokenized:
#     msent
#     pos = [ i[1] for i in msent ]
#     n = len(pos)
#     for i in range(0,n-3):
#         t = "-".join(pos[i:i+3]) # pull out 3 list item from counter, convert to string
#         list_of_triple_strings.append(t)

#     print("sequences of triples:", list_of_triple_strings)
#     print()


In [20]:
# read senctence.csv

import pandas as pd
df = pd.read_csv('sentences.csv')
df.head()

Unnamed: 0,SENTENCE,CLASS
0,"Sorry, I don't know about the weather.",S
1,That is a tricky question to answer.,C
2,What does OCM stand for,Q
3,MAX is a Mobile Application Accelerator,S
4,Can a dog see in colour?,Q


In [21]:
def triple(tokenize):
    list_of_triple = []
    triple_strings = []
    for sent in tokenize:  
        pos = [ i[1] for i in sent ]
        n = len(pos)
        for i in range(0,n-3):
            t = "-".join(pos[i:i+3]) # pull out 3 list item from counter, convert to string
            triple_strings.append(t)

        list_of_triple.append(triple_strings)
    return list_of_triple

In [22]:
triple(tokenized)

[['NNP-NNP-NNP',
  'NNP-NNP-,',
  'NNP-,-WRB',
  ',-WRB-VBP',
  'WRB-VBP-PRP',
  'VBP-PRP-VBG',
  'PRP-VBG-NN',
  'DT-NN-VBZ',
  'NN-VBZ-JJ',
  'VBZ-JJ-,',
  'JJ-,-CC',
  ',-CC-NNP',
  'CC-NNP-VBZ',
  'NNP-VBZ-JJ',
  'DT-NN-VBZ',
  'NN-VBZ-JJ',
  'PRP-MD-RB',
  'MD-RB-VB',
  'RB-VB-NN'],
 ['NNP-NNP-NNP',
  'NNP-NNP-,',
  'NNP-,-WRB',
  ',-WRB-VBP',
  'WRB-VBP-PRP',
  'VBP-PRP-VBG',
  'PRP-VBG-NN',
  'DT-NN-VBZ',
  'NN-VBZ-JJ',
  'VBZ-JJ-,',
  'JJ-,-CC',
  ',-CC-NNP',
  'CC-NNP-VBZ',
  'NNP-VBZ-JJ',
  'DT-NN-VBZ',
  'NN-VBZ-JJ',
  'PRP-MD-RB',
  'MD-RB-VB',
  'RB-VB-NN'],
 ['NNP-NNP-NNP',
  'NNP-NNP-,',
  'NNP-,-WRB',
  ',-WRB-VBP',
  'WRB-VBP-PRP',
  'VBP-PRP-VBG',
  'PRP-VBG-NN',
  'DT-NN-VBZ',
  'NN-VBZ-JJ',
  'VBZ-JJ-,',
  'JJ-,-CC',
  ',-CC-NNP',
  'CC-NNP-VBZ',
  'NNP-VBZ-JJ',
  'DT-NN-VBZ',
  'NN-VBZ-JJ',
  'PRP-MD-RB',
  'MD-RB-VB',
  'RB-VB-NN'],
 ['NNP-NNP-NNP',
  'NNP-NNP-,',
  'NNP-,-WRB',
  ',-WRB-VBP',
  'WRB-VBP-PRP',
  'VBP-PRP-VBG',
  'PRP-VBG-NN',
  'DT-NN-VBZ',
  'NN-

In [23]:
t = df.SENTENCE.head()
for i in t:
    print(triple(i))

IndexError: string index out of range

In [24]:
feature = pd.read_csv('featuresDump.csv')

In [25]:
feature.head()

Unnamed: 0,id,wordCount,stemmedCount,stemmedEndNN,CD,NN,NNP,NNPS,NNS,PRP,...,startTuple0,endTuple0,endTuple1,endTuple2,verbBeforeNoun,qMark,qVerbCombo,qTripleScore,sTripleScore,class
0,44d8a78d2ca66b1b,7,5,0,0,1,1,0,0,1,...,0,0,0,0,0,0,1,0,1,S
1,a9133770c79b2c43,7,4,1,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,2,C
2,246cf41a55627762,5,3,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,1,0,Q
3,53ac5757399632e8,6,4,0,0,0,3,0,0,0,...,0,0,0,0,0,0,1,0,2,S
4,78e580bde0b4396e,6,4,0,0,3,0,0,0,0,...,0,1,0,0,1,1,1,0,0,Q


In [1]:
import databaseOperation

<sqlite3.Cursor object at 0x0000023D03AB7340>


In [2]:
databaseOperation.insert_answer('hi', 'thei a')

In [3]:
print(databaseOperation.no_of_rows())

<sqlite3.Cursor object at 0x0000023D03AB7420>
<sqlite3.Cursor object at 0x0000023D03AB7420>
