# NLTK Exploration

In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# change this to read in your data
finder = BigramCollocationFinder.from_words(
   nltk.corpus.genesis.words('english-web.txt'))

# only bigrams that appear 3+ times
finder.apply_freq_filter(3) 

# return the 10 n-grams with the highest PMI
finder.nbest(bigram_measures.pmi, 10)  

[(u'Beer', u'Lahai'),
 (u'Lahai', u'Roi'),
 (u'gray', u'hairs'),
 (u'Most', u'High'),
 (u'ewe', u'lambs'),
 (u'many', u'colors'),
 (u'burnt', u'offering'),
 (u'Paddan', u'Aram'),
 (u'east', u'wind'),
 (u'living', u'creature')]

In [2]:
import time

In [3]:
import pandas as pd

In [5]:
df = pd.read_json('./TrainingData/2013b_TrainingData.json')

In [6]:
# LOOK AT TOPIC DICTIONARY AND GET A TOPIC COUNT
topic_file = open('./topicDictionary.txt', 'r')
topics = topic_file.read().split('\r\n')

# SPLITTING THE ELEMENTS OF THE JSON INTO TEXT, PUBLICATION DATE AND TOPICS
df['text'] = df.TrainingData.apply(lambda x: x['bodyText'])
df['pubdate'] = df.TrainingData.apply(lambda x: x['webPublicationDate'])
df['topics'] = df.TrainingData.apply(lambda x: x['topics'])

# DROP FIRST TWO COLUMNS
df.reset_index(inplace=True, drop=True)
df.drop('TrainingData', axis=1, inplace=True)

# DEFINE FUNCTION TO CREATE OUR DATAFRAME
def topic_col(x):
    a = 0
    for elem in x:
        if elem == topic:
            a = 1
    return a

# RUN TOPIC COL FUNCTION ON ALL DATA
for topic in topics:
    time1 = time.time()
    df[topic] = df['topics'].map(topic_col)
    print topic
    time2 = time.time()
    time_in_s = (time2-time1)
    print 'Function takes around %0.3f seconds to run' % (time_in_s)

activism
Function takes around 0.062 seconds to run
afghanistan
Function takes around 0.064 seconds to run
aid
Function takes around 0.050 seconds to run
algerianhostagecrisis
Function takes around 0.060 seconds to run
alqaida
Function takes around 0.055 seconds to run
alshabaab
Function takes around 0.065 seconds to run
antiwar
Function takes around 0.049 seconds to run
arabandmiddleeastprotests
Function takes around 0.068 seconds to run
armstrade
Function takes around 0.063 seconds to run
australianguncontrol
Function takes around 0.077 seconds to run
australiansecurityandcounterterrorism
Function takes around 0.075 seconds to run
bastilledaytruckattack
Function takes around 0.072 seconds to run
belgium
Function takes around 0.054 seconds to run
berlinchristmasmarketattack
Function takes around 0.060 seconds to run
bigdata
Function takes around 0.048 seconds to run
biometrics
Function takes around 0.054 seconds to run
bokoharam
Function takes around 0.072 seconds to run
bostonmaratho

undercoverpoliceandpolicing
Function takes around 0.063 seconds to run
unitednations
Function takes around 0.070 seconds to run
usguncontrol
Function takes around 0.064 seconds to run
values
Function takes around 0.058 seconds to run
warcrimes
Function takes around 0.058 seconds to run
warreporting
Function takes around 0.062 seconds to run
weaponstechnology
Function takes around 0.068 seconds to run
womeninbusiness
Function takes around 0.094 seconds to run
woolwichattack
Function takes around 0.072 seconds to run
worldmigration
Function takes around 0.058 seconds to run
zikavirus
Function takes around 0.055 seconds to run


In [9]:
afghantext = df[df['biometrics'] == 1]['text']

In [10]:
afghantext

25190    The rumour mill around Apple's California head...
48889    Privacy and security are seriously hot topics ...
48945    A German security company says spoofing the iP...
53000    A German security company says spoofing the iP...
53833    Privacy and security are seriously hot topics ...
53929    A German security company says spoofing the iP...
Name: text, dtype: object

In [12]:
# change this to read in your data
finder = BigramCollocationFinder.from_words(afghantext[25190])

# only bigrams that appear 3+ times
finder.apply_freq_filter(3) 

# return the 10 n-grams with the highest PMI
finder.nbest(bigram_measures.pmi, 10)  

[(u'q', u'u'),
 (u'T', u'h'),
 (u'A', u'p'),
 (u'"', u','),
 (u'H', u'e'),
 (u'G', u'i'),
 (u'e', u'x'),
 (u'I', u'n'),
 (u"'", u's'),
 (u't', u'h')]

In [13]:
tokenizer = RegexpTokenizer(r'\w+')
tokens2 = tokenizer.tokenize(afghantext[25190])
tokens = nltk.wordpunct_tokenize(afghantext[25190])
finder = BigramCollocationFinder.from_words(tokens2)
scored = finder.score_ngrams(bigram_measures.raw_freq)
bigrams1 = sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

In [14]:
scored

[((u'in', u'the'), 0.005112474437627812),
 ((u'such', u'as'), 0.005112474437627812),
 ((u'Giles', u'is'), 0.003067484662576687),
 ((u'and', u'other'), 0.003067484662576687),
 ((u'and', u'so'), 0.003067484662576687),
 ((u'for', u'the'), 0.003067484662576687),
 ((u'how', u'they'), 0.003067484662576687),
 ((u'Apps', u'such'), 0.002044989775051125),
 ((u'a', u'city'), 0.002044989775051125),
 ((u'and', u'the'), 0.002044989775051125),
 ((u'as', u'Endomondo'), 0.002044989775051125),
 ((u'as', u'the'), 0.002044989775051125),
 ((u'as', u'well'), 0.002044989775051125),
 ((u'city', u'and'), 0.002044989775051125),
 ((u'data', u'providers'), 0.002044989775051125),
 ((u'data', u'to'), 0.002044989775051125),
 ((u'enabled', u'scales'), 0.002044989775051125),
 ((u'for', u'example'), 0.002044989775051125),
 ((u'gold', u'dust'), 0.002044989775051125),
 ((u'he', u'says'), 0.002044989775051125),
 ((u'information', u'economy'), 0.002044989775051125),
 ((u'is', u'a'), 0.002044989775051125),
 ((u'of', u'publi

In [15]:
ukcrime = df[df['ukcrime'] == 1]['text']

In [17]:
tokens3 = [tokenizer.tokenize(article) for article in ukcrime]

In [47]:
# trialtokens = [item for sublist in l for item in sublist]
trialtokens = [item.lower() for sublist in tokens3 for item in sublist]

In [48]:
finder = BigramCollocationFinder.from_words(trialtokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
bigrams1 = sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

In [49]:
scored

[((u'of', u'the'), 0.005996605170158889),
 ((u'in', u'the'), 0.004925285917021454),
 ((u'to', u'the'), 0.0030108244984278043),
 ((u'he', u'was'), 0.002095333500292178),
 ((u'to', u'be'), 0.0019395052452903692),
 ((u'on', u'the'), 0.0019228093608258897),
 ((u'the', u'police'), 0.0016306313826974985),
 ((u'that', u'the'), 0.001586109024125553),
 ((u'by', u'the'), 0.0015833263767148064),
 ((u'in', u'a'), 0.0015694131396610735),
 ((u'of', u'a'), 0.0015221081336783816),
 ((u'at', u'the'), 0.0015137601914461418),
 ((u'have', u'been'), 0.0014497593009989704),
 ((u'for', u'the'), 0.001385758410551799),
 ((u'and', u'the'), 0.0013607145838550798),
 ((u'with', u'the'), 0.0013356707571583605),
 ((u'had', u'been'), 0.0013106269304616412),
 ((u'it', u'was'), 0.0012021036814425245),
 ((u'has', u'been'), 0.0011130589642986337),
 ((u'it', u'is'), 0.0011130589642986337),
 ((u'from', u'the'), 0.0011074936694771406),
 ((u'as', u'a'), 0.0010629713109051953),
 ((u'he', u'had'), 0.0010212315997439965),
 ((u'

In [50]:
bigrams1

[(u'0', u'00001667'),
 (u'0', u'06'),
 (u'0', u'1ml'),
 (u'0', u'2'),
 (u'0', u'22'),
 (u'0', u'3'),
 (u'0', u'5'),
 (u'0', u'7'),
 (u'0', u'72'),
 (u'0', u'8'),
 (u'0', u'8g'),
 (u'0', u'9'),
 (u'00', u'on'),
 (u'000', u'154'),
 (u'000', u'34'),
 (u'000', u'43'),
 (u'000', u'50'),
 (u'000', u'a'),
 (u'000', u'according'),
 (u'000', u'african'),
 (u'000', u'and'),
 (u'000', u'animals'),
 (u'000', u'annual'),
 (u'000', u'around'),
 (u'000', u'at'),
 (u'000', u'being'),
 (u'000', u'between'),
 (u'000', u'bill'),
 (u'000', u'bmw'),
 (u'000', u'bridging'),
 (u'000', u'bullets'),
 (u'000', u'but'),
 (u'000', u'by'),
 (u'000', u'calls'),
 (u'000', u'cases'),
 (u'000', u'casualties'),
 (u'000', u'cautions'),
 (u'000', u'cctv'),
 (u'000', u'children'),
 (u'000', u'credit'),
 (u'000', u'detective'),
 (u'000', u'did'),
 (u'000', u'donation'),
 (u'000', u'dvds'),
 (u'000', u'each'),
 (u'000', u'fewer'),
 (u'000', u'flat'),
 (u'000', u'followers'),
 (u'000', u'for'),
 (u'000', u'francesca'),
 (u'0

In [19]:
finder = BigramCollocationFinder.from_words(tokens3)
scored = finder.score_ngrams(bigram_measures.raw_freq)
bigrams2 = sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_

TypeError: unhashable type: 'list'

In [33]:
tokens3 = tokenizer.tokenize(afghantext[397])
finder = BigramCollocationFinder.from_words(tokens3)
scored = finder.score_ngrams(bigram_measures.raw_freq)
bigrams2 = sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

In [34]:
tokens4 = tokenizer.tokenize(afghantext[50382])
finder = BigramCollocationFinder.from_words(tokens4)
scored = finder.score_ngrams(bigram_measures.raw_freq)
bigrams3 = sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

In [36]:
allbigrams = bigrams1 + bigrams2 + bigrams3

In [135]:
from collections import Counter

In [136]:
boom = Counter(allbigrams).most_common()

In [137]:
testdf = pd.DataFrame(afghantext)
testdf['tokenized'] = testdf.apply(lambda row: tokenizer.tokenize(row['text']), axis=1)

In [138]:
tokenaf = [item for sublist in testdf['tokenized'] for item in sublist]

In [139]:
# tokens4 = tokenizer.tokenize(afghantext[50382])
finder = BigramCollocationFinder.from_words(tokenaf)
scored = finder.score_ngrams(bigram_measures.raw_freq)
# bigrams3 = sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

In [153]:
removelist = [(u'of', u'the'), (u'in', u'the'), (u'to', u'the'), 
              (u'on', u'the'), (u'and', u'the'), (u'the', u'US'),
             (u'for', u'the'), (u'that', u'the'), (u'by', u'the'),
             (u'by', u'the'), (u'to', u'be'), (u'in', u'a'),
             (u'of', u'a'), (u'with', u'the'), (u'at', u'the'),
             (u'have', u'been'), (u'had', u'been'), (u'as', u'a'),
             (u'has', u'been'), (u'from', u'the'), (u'is', u'a'),
             (u'on', u'a')]

In [163]:
bigramsafghan = pd.DataFrame(scored)

In [173]:
bigramsafghan[50:100] # NOTE: ADDED "in Afghanistan" and "the Taliban" and "the Afghan"

Unnamed: 0,0,1
50,"(It, s)",0.000517
51,"(and, a)",0.000517
52,"(to, have)",0.000505
53,"(about, the)",0.000499
54,"(Marine, A)",0.000494
55,"(with, a)",0.000476
56,"(of, his)",0.00047
57,"(of, Afghanistan)",0.000453
58,"(there, is)",0.000453
59,"(they, are)",0.000453


It seems "In Afghanistan" is a very common bigram for topics pertaining to Afghanistan.

Testing with just these two phrases, we get a score of 0.84 so it's very predictive.

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   19.4s finished
F1 score: 0.844444444444
afghanistan
Function takes around 37.133 seconds to run

We will use these bigrams to augment our vocabulary in problem topics.

Let's try to unpack these tuples.

In [180]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stopset = set(stopwords.words('english'))

text = afghantext[155]
tokens=word_tokenize(str(text))
tokens = [w for w in tokens if not w in stopset]
finder = BigramCollocationFinder.from_words(tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)

In [182]:
test = ['scored', 'asdf', 'hello', 'hello']
finder2 = BigramCollocationFinder.from_words(test)
# DOES NOT WORK WITH A TEST LIST

In [185]:
from spacy.en import English
parser = English()
import pickle

In [188]:
def bigramfinder(topic, df):
    # CREATE A VOCABULARY LIST
    vocabulary = []
    
    # CREATE A NEW DATAFRAME FOR TOPIC TEXT
    topicdf = df[df[topic] == 1]['text']
    topicdf.reset_index(drop=True, inplace=True)
    
#     # USE SPACY TO CREATE VOCAB
#     nouns = Counter()
#     entities = Counter()
    
    for doc in parser.pipe(topicdf, n_threads=16, batch_size=10000):
#         for chunk in doc.noun_chunks:
#             if parser.vocab[chunk.lemma_].prob < -19.5:
#                     nouns[chunk.lemma_] += 1
#         for entity in doc.ents:
#             if parser.vocab[entity.lemma_].prob < -19.5:
#                     entities[entity.lemma_] += 1
        for tok3 in doc.token:
            print tok3

#     # APPEND NOUNS AND ENTITIES TO VOCAB
#     for noun, count in nouns.most_common(10):
#         vocabulary.append(noun)
    
#     for entity, count in entities.most_common(10):
#         vocabulary.append(entity)
        
    vocabulary = list(set(vocabulary))
    
    if vocabulary == set():
        vocabulary = []
        
    return vocabulary

In [189]:
bigramfinder('afghanistan', df)

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'token'