In [None]:
                                       ### 2.1 SENTENCE SEGMENTATION ###

In [None]:
'''Sentence segmentation can be viewed as a classification task for punctuation: 
whenever we encounter a symbol that could possibly end a sentence, such as a period or a question mark, 
we have to decide whether it terminates the preceding sentence'''

In [1]:
import nltk

In [2]:
# Flatten the pre-segmented Treebank sentences into one token stream while
# remembering the index of the last token of every sentence.
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens += sent
    # After appending, the running length is the offset just past this
    # sentence, so its final token sits one position earlier.
    offset = len(tokens)
    boundaries.add(offset - 1)

In [3]:
tokens[1:15]

['START',
 'Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a']

In [None]:
# Next, we need to specify the features of the data that will be used in order 
# to decide whether punctuation indicates a sentence-boundary:

In [4]:
def punct_features(tokens, i):
    """Describe the token at position ``i`` by itself and its two neighbours.

    Parameters
    ----------
    tokens : sequence of str
        The flat token stream.
    i : int
        Index of the (punctuation) token to describe. ``tokens[i - 1]`` and
        ``tokens[i + 1]`` are both read, so callers should pass an interior
        index.

    Returns
    -------
    dict
        Four features: whether the following token starts with an uppercase
        letter, the lowercased preceding token, the token itself, and whether
        the preceding token is exactly one character long.
    """
    features = {}
    features['next-word-capitalized'] = tokens[i + 1][0].isupper()
    previous = tokens[i - 1]
    features['prev-word'] = previous.lower()
    features['punct'] = tokens[i]
    features['prev-word-is-one-char'] = len(previous) == 1
    return features

In [5]:
# One (features, is_boundary) pair per candidate punctuation token. The first
# and last positions are skipped because punct_features reads both neighbours.
# Note: `in '.?!'` is a substring test, so a multi-character token such as
# '?!' also counts as a candidate.
featuresets = [
    (punct_features(tokens, i), i in boundaries)
    for i in range(1, len(tokens) - 1)
    if tokens[i] in '.?!'
]

In [6]:
featuresets[1:5]

[({'next-word-capitalized': True,
   'prev-word': '29',
   'punct': '.',
   'prev-word-is-one-char': False},
  True),
 ({'next-word-capitalized': True,
   'prev-word': 'mr',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': 'n',
   'punct': '.',
   'prev-word-is-one-char': True},
  False),
 ({'next-word-capitalized': False,
   'prev-word': 'group',
   'punct': '.',
   'prev-word-is-one-char': False},
  True)]

In [None]:
# Using these featuresets, we can train and evaluate a punctuation classifier:

In [7]:
# Hold out the first 10% of the feature sets for testing and train a
# Naive Bayes classifier on the remaining 90%.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Last expression of the cell: accuracy on the held-out slice.
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [None]:
                                    ### 2.2 IDENTIFYING DIALOGUE ACT TYPE ###

In [None]:
#  When processing dialogue, it can be useful to think of utterances as a type of action performed by the speaker.
# This interpretation is most straightforward for performative statements such as 
# "I forgive you" or "I bet you can't climb that hill." 
# Each post in the NPS Chat Corpus used below is labeled with a dialogue act type,
# such as "Statement," "Emotion," "ynQuestion", and "Continuer."
# We can therefore use this data to build a classifier that can identify the dialogue act types for new instant messaging posts.

In [55]:
# Take at most the first 10,000 posts from the NPS Chat corpus. Each post is
# later read through `post.text` (the message) and `post.get('class')`
# (its dialogue-act label).
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [57]:
def dialogue_act_features(post):
    """Build bag-of-words presence features for one chat post.

    Parameters
    ----------
    post : str
        Raw text of the post; it is tokenized with ``nltk.word_tokenize``.

    Returns
    -------
    dict
        Maps ``'contains(<token>)'`` to ``True`` for every distinct
        lowercased token in the post. A post with no tokens yields an empty
        dict.
    """
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    # The return must sit outside the loop: returning from inside it kept only
    # the first token and produced None for posts without any tokens.
    return features

In [10]:
# Pair each post's word-presence features with its annotated dialogue-act label.
featuresets = [
    (dialogue_act_features(post.text), post.get('class'))
    for post in posts
]
# Hold out the first 10% for testing; train Naive Bayes on the rest.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.749


In [61]:
featuresets[1:20]

[({'contains(:)': True, 'contains(p)': True}, 'Emotion'),
 ({'contains(part)': True}, 'System'),
 ({'contains(hey)': True, 'contains(everyone)': True}, 'Greet'),
 ({'contains(ah)': True, 'contains(well)': True}, 'Statement'),
 ({'contains(nick)': True, 'contains(:10-19-20suser7)': True}, 'System'),
 ({'contains(10-19-20suser7)': True,
   'contains(is)': True,
   'contains(a)': True,
   'contains(gay)': True,
   'contains(name)': True,
   'contains(.)': True},
  'Accept'),
 ({'contains(.action)': True,
   'contains(gives)': True,
   'contains(10-19-20suser121)': True,
   'contains(a)': True,
   'contains(golf)': True,
   'contains(clap)': True,
   'contains(.)': True},
  'System'),
 ({'contains(:)': True, 'contains())': True}, 'Emotion'),
 ({'contains(join)': True}, 'System'),
 ({'contains(hi)': True, 'contains(10-19-20suser59)': True}, 'Greet'),
 ({'contains(26/)': True,
   'contains(m/)': True,
   'contains(ky)': True,
   'contains(women)': True,
   'contains(that)': True,
   'contain

In [62]:
classifier.classify(dialogue_act_features('how are you?'))


'whQuestion'

In [50]:
classifier.classify(dialogue_act_features('hii'))

'Greet'

In [None]:
                                ### 2.3 RECOGNIZING TEXT ENTAILMENT ###

In [None]:
# RTE is the task of determining whether a given piece of text T entails another text called the "hypothesis".
# Here are a couple of examples:
#    An example of a True TEXT ENTAILMENT (text entails hypothesis) is:
        
#    text: If you help the needy, God will reward you.
#    hypothesis: Giving money to a poor man has good consequences.

#    An example of a False TEXT ENTAILMENT (text contradicts hypothesis) is:
        
#    text: If you help the needy, God will reward you.
#    hypothesis: Giving money to a poor man has no consequences.
    
#  We can treat RTE as a classification task, in which we try to predict the True/False label for each pair.
#  It seems likely that successful approaches to this task will involve a combination of parsing, semantics and real-world knowledge.
#  Nevertheless, reasonably good results have been achieved on RTE with shallow analysis, based on similarity between the text and hypothesis at the word level.


In [37]:
def _rte_features(rtepair):
    """Summarise one RTE text/hypothesis pair as four count features.

    The pair is handed to ``nltk.RTEFeatureExtractor``; each feature is the
    size of one of the sets it reports for plain words (``"word"``) and for
    named entities (``"ne"``).

    Returns
    -------
    dict
        ``word_overlap``, ``word_hyp_extra``, ``ne_overlap`` and
        ``ne_hyp_overlap`` mapped to integer counts.
    """
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features["word_overlap"] = len(extractor.overlap("word"))
    features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
    features["ne_overlap"] = len(extractor.overlap("ne"))
    # NOTE(review): despite its name, this key stores the hyp_extra("ne")
    # count; the key is left unchanged so existing consumers keep working.
    features["ne_hyp_overlap"] = len(extractor.hyp_extra("ne"))
    return features

In [52]:
def recognize_text_entailment(pair_index=33, fileid="rte3_dev.xml"):
    """Print the word-level evidence NLTK extracts for one RTE pair.

    Parameters
    ----------
    pair_index : int, optional
        Position of the pair within the list loaded from ``fileid``.
        Defaults to 33, the pair this notebook originally hard-coded.
    fileid : str, optional
        RTE corpus file to read. Defaults to ``"rte3_dev.xml"``.

    Returns
    -------
    None
        The text words, hypothesis words, named-entity overlap and the
        hypothesis-only words / named entities are printed, one per line.

    Raises
    ------
    IndexError
        If ``pair_index`` is outside the list of pairs loaded from ``fileid``.
    """
    rtepair = nltk.corpus.rte.pairs([fileid])[pair_index]
    extractor = nltk.RTEFeatureExtractor(rtepair)
    # Heuristic being illustrated: when every important hypothesis word also
    # occurs in the text, the pair is likely an entailment.
    print("text-words=", extractor.text_words)
    print("hyp-words=", extractor.hyp_words)
    print("overlap(ne)=", extractor.overlap("ne"))
    print("hyp_extra(word)=", extractor.hyp_extra("word"))
    print("hyp_extra(ne)=", extractor.hyp_extra("ne"))

In [53]:
recognize_text_entailment()

text-words= {'together', 'central', 'that', 'Parviz', 'Asia', 'was', 'SCO', 'fight', 'association', 'Davudi', 'Soviet', 'meeting', 'China', 'representing', 'terrorism.', 'Shanghai', 'operation', 'Russia', 'binds', 'fledgling', 'Organisation', 'former', 'four', 'republics', 'Iran', 'at', 'Co'}
hyp-words= {'member', 'China', 'SCO.'}
overlap(ne)= {'China'}
hyp_extra(word)= {'member'}
hyp_extra(ne)= {'SCO.'}


In [None]:
                                ### 2.4  SCALING UP TO LARGE DATASETS ###

In [None]:
''' Python provides an excellent environment for performing basic text processing and feature extraction. 
    However, it is not able to perform the numerically intensive calculations required by machine learning methods 
    nearly as quickly as lower-level languages such as C.
    
     Thus, if you attempt to use the pure-Python machine learning implementations (such as nltk.NaiveBayesClassifier) 
     on large datasets, you may find that the learning algorithm takes an unreasonable amount of time and memory to complete.'''