# NLP :

## Part of Speech Tagging
## Named Entity Recognition

In [1]:
import pandas as pd
import praw
from cytoolz import take
from datetime import datetime
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stopWords = set(stopwords.words('english'))
lt = WordNetLemmatizer()
import re

import spacy
from spacy import displacy
from spacy.matcher import Matcher
nlp = spacy.load('en')
#nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)



import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smiley/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import spacy

In [None]:
#!python -m spacy download en

In [None]:
#python -m spacy download en_core_web_md

In [4]:
# Global Dict for NLP
nlp_dict={'submission_id': list(),
             'top_destination': list(),
             'top_organization': list(),
              'top_product': list()}

## 1. Load Data

In [5]:
# Text Corpus
text_corpus=pd.read_csv("text_corpus.csv")
# NLP Processed, TODO: move this to 1st step of data
#nlp_df=pd.read_csv("./nlp_processed_corpus.csv")

In [6]:
text_corpus.shape

(100, 3)

## 2. Data Pre-processing
1. Removing white space from start and end of string 
2. Removing URL and special character
3. Lemmatization
4. Removing stop words


In [7]:
def tokenize(text,lower=True):
    if lower:
        return [token.strip().lower() for token in text.split()]
    else:
        return [token.strip() for token in text.split()]

def remove_stop_words(text):
     return [word for word in text if word not in stopWords]
    
def lemmatize(text):
    return [(lt.lemmatize(x)) for x in text]

def clean_text(text):
    #regex to remove URL from string 
    text=re.sub('http://\S+|https://\S+', '', text)    
    #regex to remove special characters from string except [: ? ! . , ']
    text=re.sub(r"[^a-zA-Z0-9?!',.]"," ",text)
    
    tokenized_text=tokenize(text,False)
    lemmatized_text=lemmatize(tokenized_text)
    cleaned_text=remove_stop_words(lemmatized_text)

    return ' '.join(token for token in cleaned_text)

## 3. NLP Actions

### Feature 1: Named Entity Recognition

 1. GPE :	Countries, cities, states.
 2. LOC : Non-GPE locations, mountain ranges, bodies of water.
 3. ORG : Companies, agencies, institutions, etc.
 4. FAC : Buildings, airports, highways, bridges, etc.
 5. PRODUCT : Objects, vehicles, foods, etc. (Not services.)
 6. EVENT : Named hurricanes, battles, wars, sports events, etc.


In [8]:
def ner_processing(text_nlp):
    #this function identifies labels from NER on Spacy doc
    GPE=[]
    ORG_FAC=[]
    PRODUCT=[]
    for ent in text_nlp.ents:
            if ent.label_=="GPE" or ent.label_=="LOC":
                GPE.append(str.lower(str(ent)))
            elif ent.label_=="FAC" or ent.label_=="ORG":
                ORG_FAC.append(str.lower(str(ent)))
            elif ent.label_=="PRODUCT":
                PRODUCT.append(str.lower(str(ent)))

    return GPE, ORG_FAC, PRODUCT

Approach 1: Using spacy to identify - Org , destinations, brands. Uses default library

#### Expected:
- Top_destination: Name of places, countries 
- Top_organization: hotel names, brands, airport name
- Top product: travel type, activities


#### Observation:
- Top_destination: moderate result 
- Top_organization: bad result - gives org name but not related to travel
- Top: product: bad - gives product/org name but not really relevant

#### Work to improve NER recognition
- Approach 1: From the received results, apply Word2Vec
- Approach 2: Rule based parsing
- Approach 3: Apply LDA 

TODO: 
- Extract Aspects/Nouns from Submission topics's text to generate tags for this submission.
- Include submission column

>`eg: I would like to go to Switzerland for my marriage` - Switzerland, Marriage

In [9]:
def nlp_processing(submission_id, text):
    # This will perform all NLP processing on text. Inclued following stages:
    # 1. creates chunk of text before processing through spacy as max character limit 
    # for spacy NLP processing is 100000
    # 2. generate top-K  gpe(destinations), orgs(organizations), products (PRODUCT), 
    
    tokens_list =text.split()
    chunk_counter=0
    gpe, org, product = list(), list(), list()    
    chunk_size=80000
    
    while chunk_counter < len(tokens_list)//chunk_size:
        chunk=tokens_list[chunk_counter*chunk_size: chunk_counter*chunk_size+chunk_size]
        # transforming into Spacy Doc format
        text_nlp=nlp(' '.join(chunk))
        _gpe, _org, _product = ner_processing(text_nlp)                
        gpe.extend(_gpe)
        org.extend(_org)
        product.extend(_product)
        chunk_counter+=1
    
    # handle if any remaining chunks
    if chunk_size*chunk_counter < len(tokens_list):
        chunk=tokens_list[chunk_size*chunk_counter:]
        text_nlp=nlp(' '.join(chunk))
        _gpe, _org, _product=ner_processing(text_nlp)
        gpe.extend(_gpe)
        org.extend(_org)
        product.extend(_product)        
    
    # adding to global nlp_dict
    nlp_dict['submission_id'].append(submission_id)  
    nlp_dict['top_destination'].append(Counter(gpe).most_common(5))
    nlp_dict['top_organization'].append(Counter(org).most_common(5))
    nlp_dict['top_product'].append(Counter(product).most_common(5))

In [None]:
import pickle

In [None]:
from sklearn.externals import joblib
joblib.dump(clf, 'NB_spam_model.pkl')

In [None]:
NB_spam_model = open('NB_spam_model.pkl','rb')
clf = joblib.load(NB_spam_model)

## 4. Data Processing

In [10]:
# clean data before processing : TO
text_corpus['cleaned_text']=text_corpus['text'].apply(clean_text)

#removing previous "text" column
text_corpus=text_corpus.iloc[:,[0,1,3]]

# performing NLP on all: TODO this has to be done after data gathering stage 
text_corpus.apply(lambda x : nlp_processing(x.submission_id, x.cleaned_text), axis=1)
nlp_df=pd.DataFrame.from_dict(nlp_dict)

In [52]:
nlp_df=pd.DataFrame.from_dict(nlp_dict)

In [55]:
nlp_df.head()

Unnamed: 0,submission_id,top_destination,top_organization,top_product
0,8h6aao,"[(switzerland, 22), (italy, 11), (finland, 10)...","[(aus, 12), (sf giants, 7), (wengen, 6), (saim...","[(phi phi island, 2)]"
1,95l2e6,"[(nk, 31), (japan, 28), (north korea, 26), (in...","[(cia, 50), (mcd, 24), (sk, 20), (eurojunk, 17...","[(run, 4), (gotta, 3), (fucking, 1), (tuk tuks..."
2,8yj2tg,"[(us, 233), (kyoto, 209), (japan, 208), (europ...","[(eu, 125), (google maps, 10), (love europe, 9...","[(ng, 6), (better fluid, 1)]"
3,8i4939,"[(switzerland, 56), (belgium, 15), (england, 8...","[(interlaken, 20), (ga, 11), (wengen, 10), (au...","[(usd 7, 1), (usd 6, 1)]"
4,85awza,"[(paris, 30), (france, 7), (seine, 6), (versai...","[(eiffel tower, 14), (buy metro passes, 2), (c...",[]


In [56]:
nlp_df['top_destination']=nlp_df['top_destination'].map(lambda x: ",".join([i[0] for i in x]))
nlp_df['top_organization']=nlp_df['top_organization'].map(lambda x: ",".join([i[0] for i in x]))
nlp_df['top_product']=nlp_df['top_product'].map(lambda x: ",".join([i[0] for i in x]))

In [71]:
list(nlp_df[nlp_df['submission_id'] == '85awza']['top_destination'])[0]

'paris,france,seine,versailles,eiffel'

In [106]:
nlp_df.query('submission_id=="85awza"')['top_destination'].values[0]

'paris,france,seine,versailles,eiffel'

In [99]:
destination = list(nlp_df[nlp_df['submission_id'] == input_submission_id]['top_destination'])[0]

In [138]:
nlp_df['submission_id'].isin(['4344']).any()

False

In [74]:
list(nlp_df[nlp_df['submission_id'] == '85awza']['top_product'])[0]

''

In [114]:
out=float('nan')
#out="abc"
import math

if not math.isnan(out):#float('nan')):
    print("in out")
else:
    print("nan cant be printed")

nan cant be printed


In [120]:
out="adte"
#out=float('nan')

try:
    print(out.isalpha())
except Exception as E:
    print("no result")

True


In [57]:
nlp_df.head()

Unnamed: 0,submission_id,top_destination,top_organization,top_product
0,8h6aao,"switzerland,italy,finland,australia,colorado","aus,sf giants,wengen,saimaa lapland,hahah",phi phi island
1,95l2e6,"nk,japan,north korea,india,sk","cia,mcd,sk,eurojunk,basilica","run,gotta,fucking,tuk tuks,darjeeling express ..."
2,8yj2tg,"us,kyoto,japan,europe,tokyo","eu,google maps,love europe,visit.,well,chance.,dc","ng,better fluid"
3,8i4939,"switzerland,belgium,england,op,halbtax","interlaken,ga,wengen,audi,lotr","usd 7,usd 6"
4,85awza,"paris,france,seine,versailles,eiffel","eiffel tower,buy metro passes,cafe,crepes,steps",


In [58]:
# writing the nlp_df content to csv file
nlp_df.to_csv("./nlp_processed_corpus.csv",index=False)

In [59]:
!pwd

/Users/smiley/PycharmProjects/Travel_Insights/notebooks


In [None]:
nlp_df=pd.read_csv("./nlp_processed_corpus.csv")

### merging nlp data with text data

In [13]:
merged_nlp_df=pd.concat([text_corpus, nlp_df], axis=1)
merged_nlp_df=merged_nlp_df.iloc[:, [0,1,2,4,5,6]]

In [14]:
# Tokenize the text
merged_nlp_df['tokens']=merged_nlp_df['cleaned_text'].apply(tokenize)

In [15]:
merged_nlp_df.head()

Unnamed: 0,submission_id,submission_title,cleaned_text,top_destination,top_organization,top_product,tokens
0,8h6aao,Wife and I hate big social events and love tra...,Wife I hate big social event love traveling. S...,"[(switzerland, 22), (italy, 11), (finland, 10)...","[(aus, 12), (sf giants, 7), (wengen, 6), (saim...","[(phi phi island, 2)]","[wife, i, hate, big, social, event, love, trav..."
1,95l2e6,The exact moment I took a step too close to th...,The exact moment I took step close border Nort...,"[(nk, 31), (japan, 28), (north korea, 26), (in...","[(cia, 50), (mcd, 24), (sk, 20), (eurojunk, 17...","[(run, 4), (gotta, 3), (fucking, 1), (tuk tuks...","[the, exact, moment, i, took, step, close, bor..."
2,8yj2tg,Wandering around Kyoto at night,Wandering around Kyoto nightKyoto amazing. One...,"[(us, 233), (kyoto, 209), (japan, 208), (europ...","[(eu, 125), (google maps, 10), (love europe, 9...","[(ng, 6), (better fluid, 1)]","[wandering, around, kyoto, nightkyoto, amazing..."
3,8i4939,I heard this place had stunning views but I ju...,I heard place stunning view I prepared this. M...,"[(switzerland, 56), (belgium, 15), (england, 8...","[(interlaken, 20), (ga, 11), (wengen, 10), (au...","[(usd 7, 1), (usd 6, 1)]","[i, heard, place, stunning, view, i, prepared,..."
4,85awza,Went to the top of the Eiffel Tower and there ...,Went top Eiffel Tower happened rainbow ParisLo...,"[(paris, 30), (france, 7), (seine, 6), (versai...","[(eiffel tower, 14), (buy metro passes, 2), (c...",[],"[went, top, eiffel, tower, happened, rainbow, ..."


In [16]:
#writing merged nlp df to csv file
merged_nlp_df.to_csv("./merged_nlp_df.csv",index=False)

In [None]:
#merged_nlp_df=pd.read_csv("./merged_nlp_df.csv")
#make sure to make tokens as list of str 

In [17]:
merged_nlp_df.head()

Unnamed: 0,submission_id,submission_title,cleaned_text,top_destination,top_organization,top_product,tokens
0,8h6aao,Wife and I hate big social events and love tra...,Wife I hate big social event love traveling. S...,"[(switzerland, 22), (italy, 11), (finland, 10)...","[(aus, 12), (sf giants, 7), (wengen, 6), (saim...","[(phi phi island, 2)]","[wife, i, hate, big, social, event, love, trav..."
1,95l2e6,The exact moment I took a step too close to th...,The exact moment I took step close border Nort...,"[(nk, 31), (japan, 28), (north korea, 26), (in...","[(cia, 50), (mcd, 24), (sk, 20), (eurojunk, 17...","[(run, 4), (gotta, 3), (fucking, 1), (tuk tuks...","[the, exact, moment, i, took, step, close, bor..."
2,8yj2tg,Wandering around Kyoto at night,Wandering around Kyoto nightKyoto amazing. One...,"[(us, 233), (kyoto, 209), (japan, 208), (europ...","[(eu, 125), (google maps, 10), (love europe, 9...","[(ng, 6), (better fluid, 1)]","[wandering, around, kyoto, nightkyoto, amazing..."
3,8i4939,I heard this place had stunning views but I ju...,I heard place stunning view I prepared this. M...,"[(switzerland, 56), (belgium, 15), (england, 8...","[(interlaken, 20), (ga, 11), (wengen, 10), (au...","[(usd 7, 1), (usd 6, 1)]","[i, heard, place, stunning, view, i, prepared,..."
4,85awza,Went to the top of the Eiffel Tower and there ...,Went top Eiffel Tower happened rainbow ParisLo...,"[(paris, 30), (france, 7), (seine, 6), (versai...","[(eiffel tower, 14), (buy metro passes, 2), (c...",[],"[went, top, eiffel, tower, happened, rainbow, ..."


### Top_destination

In [18]:
merged_nlp_df['top_destination'][0:5]

0    [(switzerland, 22), (italy, 11), (finland, 10)...
1    [(nk, 31), (japan, 28), (north korea, 26), (in...
2    [(us, 233), (kyoto, 209), (japan, 208), (europ...
3    [(switzerland, 56), (belgium, 15), (england, 8...
4    [(paris, 30), (france, 7), (seine, 6), (versai...
Name: top_destination, dtype: object

### Top_organization

In [19]:
merged_nlp_df['top_organization'][0:5]

0    [(aus, 12), (sf giants, 7), (wengen, 6), (saim...
1    [(cia, 50), (mcd, 24), (sk, 20), (eurojunk, 17...
2    [(eu, 125), (google maps, 10), (love europe, 9...
3    [(interlaken, 20), (ga, 11), (wengen, 10), (au...
4    [(eiffel tower, 14), (buy metro passes, 2), (c...
Name: top_organization, dtype: object

### Top_product

In [20]:
merged_nlp_df['top_product'][0:5]

0                                [(phi phi island, 2)]
1    [(run, 4), (gotta, 3), (fucking, 1), (tuk tuks...
2                         [(ng, 6), (better fluid, 1)]
3                             [(usd 7, 1), (usd 6, 1)]
4                                                   []
Name: top_product, dtype: object

### Visualization NER using Displacy

In [21]:
#displaCy to view a beautiful visualization of the Named Entity annotated sentence:
#from spacy import displacy
text_nlp=nlp(merged_nlp_df['cleaned_text'][0])

doc = text_nlp
displacy.render(doc, style='ent', jupyter=True)

## Results

Approach 1: Using spacy to identify - Org , destinations, brands. Uses default library

#### Expected:
- Top_destination: Name of places, countries 
- Top_organization: hotel names, brands, airport name
- Top product: travel type, activities


#### Observation:
- Top_destination: moderate result 
- Top_organization: bad result - gives org name but not related to travel
- Top: product: bad - gives product/org name but not really relevant

#### Work to improve NER recognition
- Approach 1: From the received results, apply Word2Vec
- Approach 2: Rule based parsing
- Approach 3: Apply LDA 

TODO: 
- Extract Aspects/Nouns from Submission topics's text to generate tags for this submission.
- Include submission column

>`eg: I would like to go to Switzerland for my marriage` - Switzerland, Marriage

#  Word Cloud

In [None]:
import pandas as pd
from wordcloud import WordCloud
pd.set_option('precision', 2)
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', 250)
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
#vn= dict(result)
g = WordCloud(width=2000,height=2000).generate(' '.join(str(i) for i in grouped_locations_reviews['Positive_doc_phrases'][0])).to_image()

plt.figure(figsize=(8,10))
#plt.imshow(g,interpolation='bilinear')
plt.imshow(g)
plt.axis('off')
plt.show()

#for location 3737


# removing words in good cloud and from list and plotting in negative 

A=["doctor", "n't" ,"good" ," great","professional", " knowledgeable", "happy" , "pleasant"]
g = WordCloud(width=1000,height=1000).generate(' '.join(str(i) for i in [item for item in grouped_locations_reviews['Negative_doc_phrases'][2] if item not in (grouped_locations_reviews['Positive_doc_phrases'][2] & A )])).to_image()

plt.figure(figsize=(50,20))
plt.imshow(g,interpolation='bilinear')
plt.imshow(g)
plt.axis('off')
plt.show()



In [None]:
#doc = nlp(u'Hello, world! Hello world!')
doc=' '.join(str(i) for i in result)
doc=nlp(doc)
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id] # # get string representation
    span = doc[start:end]  # the matched span
    print(match_id, string_id, start, end, span.text)

In [None]:
# creating a dictionary with relevant words
# filtering only for following (try to lemmatize to remove plurals or tenses)
# Nouns - NN, NNP, NNS, NNPS
# Verb - VB, VBD, VBG, VBN, VBP
# Adjective - JJ, JJR, JJS
# adverb - 'RB', 'RBR', 'RBS'
tags=[ 'JJ','NN', 'NNS', 'RB','RBR','RBS', 'VBD','VBN']
relation2=dict()
for token in doc2:
    # checking if the tokens belong to pos categories
    if token.tag_ in tags:
        # checking if the tokens belong to pos categories
        if token.head.tag_ in tags:
            if token.head.text in relation2.keys():
                relation2[token.head.text].append((token.text,token.tag_,token.head.text,token.head.tag_))
            else:
                relation2[token.head.text]=[(token.text,token.tag_,token.head.text,token.head.tag_)]