##  Pipeline and Properties

In [1]:
import spacy 
# Load the English pipeline by its package name, the same model name the
# word-vector section of this notebook loads. The 'en' shortcut link is a
# spaCy v2 convenience that spaCy v3 no longer supports.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Read the raw review text inside a context manager so the file handle is
# closed as soon as the text has been read, then parse it with the pipeline.
# `document` holds the parsed result; the raw text keeps its own name.
with open("hotelreviews.txt") as reviews_file:
    raw_text = reviews_file.read()
document = nlp(raw_text)

In [3]:
# List the names of every attribute and method available on the parsed document.
dir(document)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_py_tokens',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_bytes',
 'to_disk',
 'user_data',
 'user_hooks',
 'u

## Tokenization

In [4]:
# First token of the doc (the parsed document is indexed by token position)
document[0]

Nice

In [5]:
# Fifth token from the end of the doc (index len - 5), not the very last token.
# NOTE(review): presumably the offset skips trailing whitespace/punctuation
# tokens at the end of the input file; confirm, or use document[-1] for the true last token.
document[len(document)-5]

boston

In [6]:
# Collect every sentence of the doc into a list so the notebook displays them
list(document.sents)

[Nice place Better than some reviews give it credit for.,
 Overall, the rooms were a bit small but nice.,
 Everything was clean, the view was wonderful and it is very well located (the Prudential Center makes shopping and eating easy and the T is nearby for jaunts out and about the city).,
 Overall, it was a good experience and the staff was quite friendly. ,
 what a surprise What a surprise the Sheraton was after reading some of the reviews.,
 it would appear there is a massive difference in the rooms, the South tower being the best.,
 Check in was very efficient and the room was lovely, very large with the most comfortable beds ever.,
 The hotel as stated is in a fantastic location and the Wrentham Village outlet is well worth a visit for bargain shopping ( the bus picks up outside).,
 The hotel bar is a little pricey ( not helped by the current dollar rate) but is a nice place to relax after a busy day shopping.,
 There is a number of restaurants close by.,
 A cab from the airport t

## Part of Speech Tagging

In [7]:
# Build a lookup from each token's `pos` value to its `pos_` label,
# covering every token in the document.
all_tags = dict((token.pos, token.pos_) for token in document)

In [8]:
# Print each word of the first sentence next to its `tag_` value.
# Only the first sentence is pulled from the sentence iterator, instead of
# building a list of every sentence in the document just to index element 0.
first_sentence = next(iter(document.sents))
for word in first_sentence:
    print(word, word.tag_)

Nice JJ
place NN
Better JJR
than IN
some DT
reviews NNS
give VBP
it PRP
credit NN
for IN
. .


In [9]:
# Noise-filter parameters.
# NOTE(review): "PROP" is not one of the coarse-grained part-of-speech labels
# listed in the spaCy documentation, so this entry presumably never matches.
# "PROPN" (proper noun) or "PRON" (pronoun) may have been intended; confirm
# before changing it, because either choice alters the unigram counts.
noisy_pos_tags = ["PROP"]
min_token_length = 2

def isNoise(token):
    """Return True when ``token`` should be dropped before counting words.

    A token is treated as noise when any of the following holds:
      * its ``pos_`` label is listed in ``noisy_pos_tags``;
      * it is flagged as a stop word (``token.is_stop``);
      * its text is at most ``min_token_length`` characters long.

    The length check reads ``token.text`` rather than the deprecated
    ``token.string``: the latter includes the whitespace that follows the
    token, which inflated the measured length and let short words through.
    """
    return (
        token.pos_ in noisy_pos_tags
        or bool(token.is_stop)
        or len(token.text) <= min_token_length
    )
def cleanup(token, lower = True):
    """Normalise a token string.

    Args:
        token: the string to normalise.
        lower: when true (the default) the string is lower-cased first.

    Returns:
        The (optionally lower-cased) string with leading and trailing
        whitespace removed.
    """
    text = token.lower() if lower else token
    return text.strip()

In [10]:
# top unigrams used in the reviews
from collections import Counter

# Token.text replaces the deprecated Token.string accessor (removed in spaCy v3).
# cleanup() strips surrounding whitespace, so the resulting words are the same.
cleaned_list = [cleanup(word.text) for word in document if not isNoise(word)]
Counter(cleaned_list).most_common(5)

[('the', 709),
 ('hotel', 685),
 ('room', 653),
 ('great', 300),
 ('sheraton', 286)]

##  Dependency Parsing

In [11]:
# extract all review sentences that contain the term - hotel (case-insensitive).
# Span.text replaces the deprecated Span.string accessor (removed in spaCy v3);
# the substring test gives the same result because only trailing whitespace differs.
hotel = [sent for sent in document.sents if 'hotel' in sent.text.lower()]

# show the dependency structure of the third matching sentence:
# one line per word, followed by the list of that word's direct children
sentence = hotel[2]
print(sentence)
print("-"*110)
for word in sentence:
    print(word, ': ', str(list(word.children)))

A cab from the airport to the hotel can be cheaper than the shuttles depending what time of the day you go.
--------------------------------------------------------------------------------------------------------------
A :  []
cab :  [A, from]
from :  [airport, to]
the :  []
airport :  [the]
to :  [hotel]
the :  []
hotel :  [the]
can :  []
be :  [cab, can, cheaper, .]
cheaper :  [than]
than :  [shuttles]
the :  []
shuttles :  [the, depending]
depending :  [time]
what :  []
time :  [what, of]
of :  [day]
the :  []
day :  [the, go]
you :  []
go :  [you]
. :  []


In [12]:
# check all words of a given part of speech (e.g. adjectives) used with a word
def pos_words(sentence, token, ptag, top_n=10):
    """Count the children of matching words that carry a given POS label.

    Args:
        sentence: parsed document exposing ``.sents`` (despite the parameter
            name, the whole document is passed in, not a single sentence).
        token: substring to look for in each word's text; case-sensitive.
        ptag: POS label compared against ``child.pos_``, e.g. ``"ADJ"``.
        top_n: how many of the most frequent words to return (default 10,
            the value that used to be hard-coded).

    Returns:
        A list of ``(word, count)`` pairs, most frequent first.

    Uses the ``.text`` accessor instead of the deprecated ``.string`` one,
    which spaCy v3 removed.
    """
    matches = []
    for sent in sentence.sents:
        # Cheap pre-filter: a sentence whose text lacks the substring cannot
        # contain a word that has it.
        if token not in sent.text:
            continue
        for word in sent:
            if token in word.text:
                matches.extend(child.text.strip() for child in word.children
                               if child.pos_ == ptag)
    return Counter(matches).most_common(top_n)

# Most frequent adjective children of words containing 'hotel' across the whole document
pos_words(document, 'hotel', "ADJ")

[('other', 20),
 ('great', 10),
 ('good', 7),
 ('nice', 6),
 ('better', 6),
 ('different', 5),
 ('many', 5),
 ('my', 4),
 ('best', 4),
 ('wonderful', 3)]

## Noun Phrases

In [13]:
# Generate noun phrases: for each chunk print its text, the dependency label
# of its root word, and the text of the word that root attaches to.
doc = nlp(u'I love data science on analytics vidhya')
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.dep_, root.head.text)

I nsubj love
data science dobj love
analytics vidhya pobj on


## Word to Vectors Integration

In [14]:
from numpy import dot
from numpy.linalg import norm

# NOTE(review): the recorded output of this cell lists no similar words, i.e.
# no vocabulary entry passed the has_vector filter below. The small English
# models are documented as shipping without static word vectors, so a model
# that includes them (e.g. en_core_web_md / en_core_web_lg) is presumably
# needed for a meaningful result; confirm which models are installed.
parser = spacy.load('en_core_web_sm')

# Generate word vector of the word - apple
apple = parser.vocab[u'apple']
apple_vector = apple.vector


def cosine(v1, v2):
    """Cosine similarity of two vectors: dot product over the product of norms."""
    return dot(v1, v2) / (norm(v1) * norm(v2))


# Candidate words: lower-case vocabulary entries that have a vector, excluding
# "apple" itself. A plain string literal replaces the Python 2-only unicode()
# call, which raises NameError on Python 3 as soon as any entry reaches it.
others = [w for w in parser.vocab
          if w.has_vector and w.orth_.islower() and w.lower_ != "apple"]

# sort by similarity score, most similar first
others.sort(key=lambda w: cosine(w.vector, apple_vector), reverse=True)

# print only the ten best matches rather than the entire sorted vocabulary
print ("top most similar words to apple:" )
for word in others[:10]:
    print( word.orth_)

top most similar words to apple:


## Machine Learning with Text Using spaCy

In [15]:
# ENGLISH_STOP_WORDS is imported from its public home in
# sklearn.feature_extraction.text; the sklearn.feature_extraction.stop_words
# module path was deprecated and later removed from scikit-learn.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string
# Punctuation characters; the tokenizer below uses them to drop punctuation tokens
punctuations = string.punctuation

from spacy.lang.en import English
# English language object created directly (not via spacy.load). From here on
# `parser` refers to this object, replacing the model bound to the same name
# in the word-vector section.
parser = English()

# Custom text-cleaning transformer, used as the first step of the pipeline
class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every input text."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for each text in ``X``."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def get_params(self, deep=True):
        """Report an empty parameter dict (the transformer has no settings)."""
        return {}

# Basic utility function to normalise the text
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed, in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [16]:
# spaCy-based tokenizer: parses a sentence and returns its normalised tokens
# (these could also be replaced by word vectors)
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` and return the list of terms to keep.

    Each token contributes its lemma, lower-cased and stripped; tokens whose
    lemma is the placeholder "-PRON-" contribute their lower-cased text
    instead. Terms found in ``stopwords`` or in ``punctuations`` are dropped.
    ``punctuations`` is a string, so that second check is a substring test,
    which also discards empty terms.
    """
    kept = []
    for tok in parser(sentence):
        if tok.lemma_ == "-PRON-":
            term = tok.lower_
        else:
            term = tok.lemma_.lower().strip()
        if term in stopwords or term in punctuations:
            continue
        kept.append(term)
    return kept

# Count-based feature vectorizer driven by the custom spaCy tokenizer
# (single-word n-grams only), and the linear SVM classifier used with it.
vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=spacy_tokenizer)
classifier = LinearSVC()

In [17]:
# Assemble the pipeline: clean the text -> tokenize and vectorize -> classify
pipeline_steps = [
    ("cleaner", predictors()),
    ("vectorizer", vectorizer),
    ("classifier", classifier),
]
pipe = Pipeline(pipeline_steps)

# Sample data: each entry is a (text, label) pair, with label 'pos' or 'neg'.
# `train` is used to fit the pipeline.
train = [('I love this sandwich.', 'pos'),          
         ('this is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('this is my best work.', 'pos'),
         ("what an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'),
         ('he is my sworn enemy!', 'neg'),          
         ('my boss is horrible.', 'neg')] 
# `test` is only used for prediction and scoring.
test =   [('the beer was good.', 'pos'),     
         ('I do not enjoy my job', 'neg'),
         ("I ain't feelin dandy today.", 'neg'),
         ("I feel amazing!", 'pos'),
         ('Gary is a good friend of mine.', 'pos'),
         ("I can't believe I'm doing this.", 'neg')]

# Fit the pipeline on the training pairs, predict the test texts, then print
# each test sample beside its prediction followed by the overall accuracy.
train_texts = [text for text, _ in train]
train_labels = [label for _, label in train]
test_texts = [text for text, _ in test]
test_labels = [label for _, label in test]

pipe.fit(train_texts, train_labels)
pred_data = pipe.predict(test_texts)
for sample, pred in zip(test, pred_data):
    print(sample, pred)
print("Accuracy:", accuracy_score(test_labels, pred_data))

('the beer was good.', 'pos') pos
('I do not enjoy my job', 'neg') neg
("I ain't feelin dandy today.", 'neg') neg
('I feel amazing!', 'pos') pos
('Gary is a good friend of mine.', 'pos') pos
("I can't believe I'm doing this.", 'neg') neg
Accuracy: 1.0
