# Class 14 - Starter Code

Natural Language Processing and Topic Modeling

In [1]:
# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English
nlp_toolkit = English()

# Gensim is used for LDA and word2vec
from gensim.models.word2vec import Word2Vec

# Twitter Lab

In this exercise, we will compare some of the classical NLP tools from the last class with these more modern latent variable techniques.  We will do this by comparing information extraction on Twitter using two different methods.

> NOTE:  There is a pre-existing file of captured tweets you can use.  It is located in the class repo for lesson-14.  However, you can also collect your own tweets following the instructions in twitter-instructions.md.

In [2]:
# Loading the twitter data
tweets = [unicode(tweet, errors='ignore') for tweet in \
          open('../../assets/dataset/captured-tweets.txt', 'r')]

# Part 1: Using `spacy`

Use `spacy` to write a function to filter tweets down to those where Google is announcing a product. How might we do this? One way might be to identify verbs, where 'Google' is the noun and there is some action like 'announcing'

In [3]:
# Use spacy to parse each tweet
parsed_tweets = []
for tweet in tweets:
    parsed_tweets.append(nlp_toolkit(tweet))

### 1.a
Write a function that can take a sentence parsed by spacy and identify if it mentions a company named 'Google'. Remember,spacy can find entities and code them as ORG if they are a company. 

### 1.b
BONUS: Make this function work for any company.

Hint: https://spacy.io/docs#examples-entities

In [4]:
# Write a function that can take a take a sentence parsed by `spacy` and 
# identify if it mentions a company named 'Google'. 
# Remember, `spacy` can find entities and code them `ORG` if they are a company.
def mentions_company(parsed, company='Google'):
    for entity in parsed.ents:
        if entity.text == company and entity.label_ == 'ORG':
            return True
    return False

In [5]:
google_tweets = []

# For each tweet, use parsed tweet to check your function
for i, parsed_tweet in enumerate(parsed_tweets):
    if mentions_company(parsed_tweet, 'Google'):
        google_tweets.append(parsed_tweet)

print len(google_tweets)

1328


### 1.c
Write a function that can take a sentence parsed by spacy and return the verbs of the sentence (preferably lemmatized).

Hint: https://spacy.io/docs#examples-pos-tags

In [6]:
def is_verb(token):
    return token.pos == spacy.parts_of_speech.VERB

# Write a function that can take a sentence parsed by `spacy` 
# and return the verbs of the sentence (preferably lemmatized)
def get_actions(parsed):
    actions = [el.lemma_ 
                for el in parsed
                if is_verb(el)
               ]
    return set(actions)

In [7]:
tweets_w_verbs = []

# For each tweet, use parsed tweet to check your function
for i, parsed_tweet in enumerate(parsed_tweets):
    tweets_w_verbs.append(get_actions(parsed_tweet))

print len(filter(None, tweets_w_verbs))

3544


### 1.d

For each tweet that mentions Google, parse it using spacy and print it out if the tweet has 'release' or 'announce' as a verb.

In [8]:
wanted_verbs = set(['release', 'announce'])

filtered_tweets = []

for i, parsed_tweet in enumerate(parsed_tweets):
    try:
        if mentions_company(parsed_tweet, 'Google') and \
            (wanted_verbs & get_actions(parsed_tweet)):
            filtered_tweets.append(parsed_tweet)
    except Exception as e:
        print e
        continue
        
print len(filtered_tweets)

8


### 1.e
Write a function that identifies countries.  HINT: the entity label for countries is GPE (or "GeoPolitical Entity").

Hint: https://spacy.io/docs#annotation-ner

In [9]:
# Write a function that identifies countries - HINT: the entity label for 
# countries is GPE (or GeoPolitical Entity)
def mentions_country(parsed, country):
    for entity in parsed.ents:
        if entity.text == country and entity.label_ == 'GPE':
            return True
    return False

In [10]:
country_tweets = []

for i, parsed_tweet in enumerate(parsed_tweets):
    if mentions_country(parsed_tweet, 'Iran'):
        country_tweets.append(parsed_tweet)

print len(country_tweets)

567


### 1.f
Re-run to find country tweets that discuss 'Iran' announcing or releasing.

In [11]:
filtered_tweets = []

for i, parsed_tweet in enumerate(parsed_tweets):
    try:
        if mentions_country(parsed_tweet, 'Iran') and \
            (wanted_verbs & get_actions(parsed_tweet)):
            filtered_tweets.append(parsed_tweet)
    except Exception as e:
        print e
        continue
        
print len(filtered_tweets)

10


# Part 2: Using `gensim`

Build a `word2vec` model of the tweets we have collected using `gensim`.

### 2.a
First take the collection of tweets and tokenize them using spacy.
Think about how this should be done. 
Should you only use upper-case or lower-case? 
Should you remove punctuations or symbols? 

In [12]:
# Lemmatize the verbs for easier searching and keep symbols and punctuations
split_tweets = [[x.text if x.pos != spacy.parts_of_speech.VERB else x.lemma_ 
                 for x in nlp_toolkit(t)] for t in tweets]

In [13]:
print tweets[0]

I made a(n) Small Tourmaline in Paradise Island! https://t.co/cAoW1b6DRc #Gameinsight #Androidgames #Android



In [14]:
print split_tweets[0]

[u'I', u'make', u'a(n', u')', u'Small', u'Tourmaline', u'in', u'Paradise', u'Island', u'!', u'https://t.co/cAoW1b6DRc', u'#', u'Gameinsight', u'#', u'Androidgames', u'#', u'Android', u'\n']


### 2.b
Build a word2vec model.  
Test the window size as well - this is how many surrounding words need to be used to model a word. What do you think is appropriate for Twitter? 


In [15]:
# Build a `word2vec` model
model = Word2Vec(split_tweets, size=100, window=4, min_count=5, workers=4)

### 2.c
Test your word2vec model with a few similarity functions.  
Find words similar to 'Syria'.  
Find words similar to 'war'.  
Find words similar to 'Iran'.  
Find words similar to 'Verizon'. 

In [16]:
model.most_similar(positive=['Verizon'])

[(u'Microsoft', 0.9997069835662842),
 (u'after', 0.9997051954269409),
 (u'3', 0.9996882677078247),
 (u'Xbox', 0.9996806979179382),
 (u'people', 0.9996719360351562),
 (u'Black', 0.9996699094772339),
 (u'its', 0.9996615648269653),
 (u'should', 0.9996509552001953),
 (u'follow', 0.9996484518051147),
 (u'their', 0.9996421933174133)]

In [17]:
model.most_similar(positive=['Syria'])

[(u'opposition', 0.9992985725402832),
 (u'Paris', 0.9991042613983154),
 (u'UK', 0.9989204406738281),
 (u'StopExecutionsIran', 0.9986642599105835),
 (u'SaudiArabia', 0.9985948801040649),
 (u'More', 0.998586893081665),
 (u'FNS', 0.9985278248786926),
 (u' ', 0.9985249638557434),
 (u'MTP', 0.9985093474388123),
 (u'JACKSON', 0.9985016584396362)]

In [18]:
model.most_similar(positive=['War'])

[(u'Quiz', 0.9985278248786926),
 (u'flights', 0.9985193014144897),
 (u'Muslim', 0.9984932541847229),
 (u'v.', 0.9984868764877319),
 (u'help', 0.9984837174415588),
 (u'drive', 0.9984773397445679),
 (u'no', 0.9984731674194336),
 (u'..', 0.9984666109085083),
 (u'say', 0.9984651803970337),
 (u'fun', 0.9984633922576904)]

In [19]:
model.most_similar(positive=['Iran'])

[(u'regime', 0.9988666772842407),
 (u'News', 0.9979665875434875),
 (u'Syria', 0.9975413084030151),
 (u'Paris', 0.9974744915962219),
 (u'opposition', 0.997397780418396),
 (u'France', 0.9972906708717346),
 (u'UK', 0.9969097971916199),
 (u'StopExecutionsIran', 0.996848464012146),
 (u'Tech', 0.9966347217559814),
 (u'No2Rouhani', 0.9965595602989197)]

In [20]:
model.most_similar(positive=['war', 'Iraq'])

[(u'2015', 0.999721109867096),
 (u'forces', 0.9997110366821289),
 (u'their', 0.9996823072433472),
 (u'relations', 0.9996620416641235),
 (u'Afghanistan', 0.9996528625488281),
 (u'Assad', 0.9996514320373535),
 (u'6', 0.9996485710144043),
 (u'=', 0.9996464848518372),
 (u'its', 0.9996417760848999),
 (u'2016', 0.9996405839920044)]

# Part 3: Comparing `spacy` and `gensim`
Filter tweets to those that mention 'Iran' or similar entities and 'war' or similar entities.

### 3.a
Using `spacy`

In [21]:
# Using spacy
for i, parsed_tweet in enumerate(parsed_tweets):
    if mentions_country(parsed_tweet, 'Iran') \
    or mentions_country(parsed_tweet, 'Iraq'):
        if 'attack' in get_actions(parsed_tweet) \
        or 'war' in parsed_tweet.text:
            print(parsed_tweet)

RT @f396: 'Iran has a long record in attacking foreign diplomatic missions,' Saudi ... - https://t.co/3gaSRB3osT via https://t.co/UjStGmTT2f

#Iran provoked by U.S. into reacting angrily, then we claim they r evil. Iran not attacked another country for 400  https://t.co/3lTO2hgFPr

RT @Mojahedineng: #Iran #News Starvation has become tool of war in #Syria https://t.co/F0NnT87DMc https://t.co/f3J70v47aL

Iran-Saudi sectarian proxy wars set to explode, Israeli experts say - Middle East - Jerusalem Post

RT @iran_policy: Saleh Hamid: Right now many differences exist between #Iran regime + Russia in #Syria war. Iran feels it gives casualties 

RT @Mojahedineng: #Iran #News EUs foreign policy chief warned Iran on renewed tension with Saudi Arabia https://t.co/ifzIVcWwOV https://t.

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont

#Iran #News EUs foreign policy chief warned Iran on renewed tension with

### 3.b
Using `gensim`

In [22]:
# Using gensim
for i, split_tweet in enumerate(split_tweets):
    similarity_to_iran = max(
        [model.similarity('Iran', tok) for tok in split_tweet if tok in model.vocab]+[0]
    )
    similarity_to_war = max(
        [model.similarity('war', tok) for tok in split_tweet if tok in model.vocab]+[0]
    )
    if similarity_to_iran > 0.999 and similarity_to_war > 0.999:
        print (similarity_to_iran, similarity_to_war)
        print ' '.join(split_tweet)

(0.99999999999999989, 0.99942523127944716)
RT @f396 : Saudi Arabia sever diplomatic ties with Iran over embassy fire - https://t.co/r0iZugJa3v via https://t.co/UjStGmTT2f 

(0.99999999999999989, 0.99942523127944716)
RT @f396 : ' Iran have a long record in attack foreign diplomatic missions , ' Saudi ... - https://t.co/3gaSRB3osT via https://t.co/UjStGmTT2f 

(0.99999999999999989, 0.99925087724220607)
# # # # Saudi Arabia cut ties with Iran - Mail &amp ; Guardian Online   https://t.co/vxCisN0Hrh 

(0.99999999999999989, 0.99925087724220607)
# # # # Saudi Arabia cut ties with Iran - Mail &amp ; Guardian Online   https://t.co/9s0dtpAJnl 

(0.99999999999999989, 0.99954354026005854)
Iran : 4 prisoners in Gohadasht Prison begin their second week of hunger Strike https://t.co/qldbF6bv3D # iraq # LeMonde # google 

(0.99999999999999989, 0.99930986053666127)
MarketWatch : grow tension between Saudi Arabia and Iran be fuel a climb in oil prices   https://t.co/N4ueWWIG6C 

(0.99999999999999989, 0.

# Part 4: [Bonus] Your Own Analysis
Build your own analysis using the above twitter data.
Alternatively, collect your own tweets to analyze following the instructions in `twitter-instructions.md`

In [23]:
# Loading the twitter data
tweets = [unicode(tweet, errors='ignore').lower() for tweet in \
          open('../../assets/dataset/captured-tweets.txt', 'r')]

In [24]:
# Lemmatize the verbs for easier searching and keep symbols and punctuations
split_tweets = [[x.text if x.pos != spacy.parts_of_speech.VERB else x.lemma_ 
                 for x in nlp_toolkit(t)] for t in tweets]

In [25]:
# Build a `word2vec` model
model = Word2Vec(split_tweets, size=100, window=4, min_count=5, workers=4)

In [26]:
model.most_similar(positive=['verizon'])

[(u'2016', 0.9996863007545471),
 (u'3', 0.9996826648712158),
 (u'black', 0.9996769428253174),
 (u'without', 0.9996188879013062),
 (u'call', 0.9996151924133301),
 (u'us', 0.9995951652526855),
 (u'warn', 0.9995946884155273),
 (u'man', 0.9995759725570679),
 (u'israel', 0.99957275390625),
 (u'7', 0.9995619654655457)]

In [27]:
model.most_similar(positive=['syria'])

[(u'opposition', 0.9990849494934082),
 (u'casualties', 0.9988429546356201),
 (u'paris', 0.9984347820281982),
 (u'tech', 0.9983500838279724),
 (u'stopexecutionsiran', 0.9982234835624695),
 (u"'s", 0.9981861710548401),
 (u'in', 0.9981005787849426),
 (u'movements', 0.9980751276016235),
 (u'uk', 0.9980130195617676),
 (u'france', 0.997966468334198)]

In [28]:
model.most_similar(positive=['war'])

[(u'+', 0.999679684638977),
 (u'children', 0.99967360496521),
 (u'marketing', 0.9996545910835266),
 (u'=', 0.9996134042739868),
 (u'seo', 0.9996041059494019),
 (u'afghanistan', 0.9995942115783691),
 (u'facebook', 0.9995781779289246),
 (u'end', 0.9995607137680054),
 (u'relations', 0.9995331168174744),
 (u'under', 0.9995311498641968)]

In [29]:
model.most_similar(positive=['iran'])

[(u'regime', 0.9974987506866455),
 (u'syria', 0.9965398907661438),
 (u'opposition', 0.9957485198974609),
 (u'news', 0.995090126991272),
 (u'humanrights', 0.9950154423713684),
 (u'casualties', 0.9946978092193604),
 (u'stopexecutionsiran', 0.9946436285972595),
 (u'france', 0.9944775104522705),
 (u'paris', 0.9943448901176453),
 (u'democratic', 0.9943297505378723)]

In [30]:
# Using gensim
for i, split_tweet in enumerate(split_tweets):
    similarity_to_iran = max(
        [model.similarity('iran', tok) for tok in split_tweet if tok in model.vocab]+[0]
    )
    similarity_to_war = max(
        [model.similarity('war', tok) for tok in split_tweet if tok in model.vocab]+[0]
    )
    if similarity_to_iran > 0.999 and similarity_to_war > 0.999:
        print (similarity_to_iran, similarity_to_war)
        print ' '.join(split_tweet)

(1.0000000000000002, 0.99916527126767618)
rt @f396 : iran blame america , britain and ' zionists ' for nimr execution - https://t.co/bwxeicgaoa via https://t.co/ujstgmtt2f 

(1.0000000000000002, 0.99916527126767618)
rt @f396 : saudi arabia sever diplomatic ties with iran over embassy fire - https://t.co/r0izugja3v via https://t.co/ujstgmtt2f 

(1.0000000000000002, 0.99949220886738066)
rt @f396 : ' iran have a long record in attack foreign diplomatic missions , ' saudi ... - https://t.co/3gasrb3ost via https://t.co/ujstgmtt2f 

(1.0000000000000002, 0.99935009075166037)
# # # # saudi arabia cut ties with iran - mail &amp ; guardian online   https://t.co/vxcisn0hrh 

(1.0000000000000002, 0.99935009075166037)
# # # # saudi arabia cut ties with iran - mail &amp ; guardian online   https://t.co/9s0dtpajnl 

(1.0000000000000002, 0.99952670822463152)
iran : 4 prisoners in gohadasht prison begin their second week of hunger strike https://t.co/qldbf6bv3d # iraq # lemonde # google 

(1.0000000000