In [1]:
import pandas as pd
import numpy as np
import spacy

### Preprocessing with Spacy with Examples

In [2]:
# Load the preprocessed dataset from a Feather file (path is relative to the working directory).
data = pd.read_feather('data/preprocessed_data')
# Keep only the 'text' column; the spaCy examples below work on its first entry.
text = data['text']

In [3]:
# Load the spaCy pipeline registered under the 'en' shortcut.
# NOTE(review): the 'en' shortcut link is presumably spaCy v2-only; newer releases expect a full
# package name such as 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [4]:
# Pick the entry labelled 0 in `text` as the running example and display the raw string.
# NOTE(review): `text[0]` is a label lookup — assumes the index contains the label 0; confirm.
doc = text[0]
doc

"We originally wanted to eat next door, but there was a 30 minute wait so we popped over to Slice of Vegas.  Friendly staff, fun setting and really good food!  Almost forgot to check in and unlock the free garlic parm fries and that would have been a travesty because they were delicious and not something I would normally order with a pizza.  We also got the garlic bread with cheese.  The roasted garlic bulb that came with it was amazing.  The pizza was very good as well.  I'm all about the crust and it was just the right amount of thick and thin (in fact, I was eating the leftover crusts off my husbands plate).  Overall, great lunch spot."

In [5]:
# Run the spaCy pipeline on the example text.
# Parse `text[0]` directly rather than `nlp(doc)`: once `doc` holds the parsed result,
# re-running `nlp(doc)` would feed a parsed document back into the pipeline instead of a
# string, so the cell could only be executed once per kernel session.
doc = nlp(text[0])

In [6]:
# Sentence segmentation: materialize the sentence spans spaCy found in the example document.
list(doc.sents)

[We originally wanted to eat next door, but there was a 30 minute wait so we popped over to Slice of Vegas.  ,
 Friendly staff, fun setting and really good food!  ,
 Almost forgot to check in and unlock the free garlic parm fries and that would have been a travesty because they were delicious and not something I would normally order with a pizza.  ,
 We also got the garlic bread with cheese.  ,
 The roasted garlic bulb that came with it was amazing.  ,
 The pizza was very good as well.  ,
 I'm all about the crust,
 and it was just the right amount of thick and thin,
 (in fact, I was eating the leftover crusts off my husbands plate).  ,
 Overall, great lunch spot.]

In [37]:
# Part-of-speech tag that spaCy assigned to each token of the example document
pos = {token: token.pos_ for token in doc}
pos

{We: 'PRON',
 originally: 'ADV',
 wanted: 'VERB',
 to: 'PART',
 eat: 'VERB',
 next: 'ADJ',
 door: 'NOUN',
 ,: 'PUNCT',
 but: 'CCONJ',
 there: 'ADV',
 was: 'VERB',
 a: 'DET',
 30: 'NUM',
 minute: 'NOUN',
 wait: 'NOUN',
 so: 'ADV',
 we: 'PRON',
 popped: 'VERB',
 over: 'PART',
 to: 'ADP',
 Slice: 'PROPN',
 of: 'ADP',
 Vegas: 'PROPN',
 .: 'PUNCT',
  : 'SPACE',
 Friendly: 'ADJ',
 staff: 'NOUN',
 ,: 'PUNCT',
 fun: 'NOUN',
 setting: 'NOUN',
 and: 'CCONJ',
 really: 'ADV',
 good: 'ADJ',
 food: 'NOUN',
 !: 'PUNCT',
  : 'SPACE',
 Almost: 'ADV',
 forgot: 'VERB',
 to: 'PART',
 check: 'VERB',
 in: 'PART',
 and: 'CCONJ',
 unlock: 'VERB',
 the: 'DET',
 free: 'ADJ',
 garlic: 'ADJ',
 parm: 'ADJ',
 fries: 'NOUN',
 and: 'CCONJ',
 that: 'DET',
 would: 'VERB',
 have: 'VERB',
 been: 'VERB',
 a: 'DET',
 travesty: 'NOUN',
 because: 'ADP',
 they: 'PRON',
 were: 'VERB',
 delicious: 'ADJ',
 and: 'CCONJ',
 not: 'ADV',
 something: 'NOUN',
 I: 'PRON',
 would: 'VERB',
 normally: 'ADV',
 order: 'VERB',
 with: 'ADP',
 

In [27]:
# The keys are spaCy tokens (not plain strings), so a word that occurs several times
# in the text shows up as several distinct keys.
pos.keys()

dict_keys([We, originally, wanted, to, eat, next, door, ,, but, there, was, a, 30, minute, wait, so, we, popped, over, to, Slice, of, Vegas, .,  , Friendly, staff, ,, fun, setting, and, really, good, food, !,  , Almost, forgot, to, check, in, and, unlock, the, free, garlic, parm, fries, and, that, would, have, been, a, travesty, because, they, were, delicious, and, not, something, I, would, normally, order, with, a, pizza, .,  , We, also, got, the, garlic, bread, with, cheese, .,  , The, roasted, garlic, bulb, that, came, with, it, was, amazing, .,  , The, pizza, was, very, good, as, well, .,  , I, 'm, all, about, the, crust, and, it, was, just, the, right, amount, of, thick, and, thin, (, in, fact, ,, I, was, eating, the, leftover, crusts, off, my, husbands, plate, ), .,  , Overall, ,, great, lunch, spot, .])

In [36]:
# Print every token tagged as a pronoun, followed by its tag
for token, tag in pos.items():
    if tag == 'PRON':
        print(token, tag)

We PRON
we PRON
they PRON
I PRON
We PRON
it PRON
I PRON
it PRON
I PRON


In [33]:
# Coarse part-of-speech tags to filter out (the same tags are listed inside clean_up_spacy)
removal = [
    'ADV', 'PRON', 'CCONJ', 'PUNCT',
    'PART', 'DET', 'ADP', 'SPACE',
]

In [39]:
# The noun chunks identified by spaCy.
# The loop variable is deliberately not named `np`: that would rebind the numpy alias
# imported at the top of the notebook and break any later `np.` call.
for chunk in doc.noun_chunks:
    print(chunk)

We
next door
a 30 minute
we
Slice
Vegas
Friendly staff
fun setting
really good food
the free garlic parm fries
a travesty
they
I
a pizza
We
the garlic bread
cheese
The roasted garlic bulb
it
The pizza
I
the crust
it
just the right amount
fact
I
my husbands plate
Overall, great lunch spot


### Final code 

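Here we apply part-of-speech (POS) tagging and drop the tokens that carry little meaning: stop words, non-alphabetic tokens, tokens of two characters or fewer, and tokens whose POS tag is in the removal list. The remaining tokens are returned as lemmas.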

In [16]:
# Summary of preprocessing with spaCy
# Reloads the dataset and the pipeline; both were already loaded in the exploration cells.
data = pd.read_feather('data/preprocessed_data')

# NOTE(review): the 'en' shortcut is presumably spaCy v2-only; newer releases expect a full
# package name such as 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')

def clean_up_spacy(text):
    """Return the lemmas of the informative tokens in ``text``.

    The text is processed with the module-level ``nlp`` pipeline. A token is
    kept only when all of the following hold:

    * it is not a stop word,
    * it consists of alphabetic characters only,
    * it is longer than two characters,
    * its coarse part-of-speech tag is not in the removal list below.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list
        ``token.lemma_`` of every kept token, in document order
        (empty when no token passes the filters).
    """
    # Coarse POS tags that are discarded.
    removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE']
    doc = nlp(text)
    # The original compared against `removal1`, an undefined name, which raised
    # NameError on the first token that passed the other three checks.
    return [
        token.lemma_
        for token in doc
        if not token.is_stop
        and token.is_alpha
        and len(token) > 2
        and token.pos_ not in removal
    ]