In [1]:
import spacy
import neuralcoref
# Load the English pipeline before anything touches `nlp`; the NeuralCoref
# component below needs the pipeline's vocabulary. Without this line the cell
# raises NameError on a fresh kernel because `nlp` is only assigned much later
# in the notebook.
nlp = spacy.load("en_core_web_sm")

# Coreference-resolution component bound to the pipeline's vocabulary.
coref = neuralcoref.NeuralCoref(nlp.vocab)

In [5]:
# Register the coreference component under the name 'neuralcoref'.
# Guarded so that re-running this cell does not try to add a second component
# with a name that is already present in the pipeline.
if "neuralcoref" not in nlp.pipe_names:
    nlp.add_pipe(coref, name='neuralcoref')

In [16]:
# Smoke test for the coreference component: run a two-sentence example through
# the pipeline and print the text with pronouns replaced by the mentions they
# refer to (exposed by neuralcoref as the `doc._.coref_resolved` extension).
doc = nlp("My sister has a dog. She loves him.")
print(doc._.coref_resolved)

My sister has a dog. My sister loves a dog.


In [18]:
import os
import numpy as np
import pandas as pd

# Root folder holding the review dumps. The AIRBNB_REVIEWS_DIR environment
# variable overrides the machine-specific default so the notebook can run on
# another machine without editing this cell.
data_dir = os.environ.get("AIRBNB_REVIEWS_DIR", "/home/stavros/DATA/AirbnbReviews")

# Each area is stored in its own sub-folder of `data_dir`.
area = "nyc"
area_dir = os.path.join(data_dir, area)

# pandas infers the gzip compression from the ".gz" suffix.
data = pd.read_csv(os.path.join(area_dir, "reviews.csv.gz"))
print(data.shape)

# Keep only the rows that actually have comment text.
clean_data = data[data["comments"].notna()]
print(clean_data.shape)

(1166689, 6)
(1166096, 6)


In [19]:
import langdetect
from utils import dependencies
from utils import preprocessing


# Text normaliser applied to every accepted review. Going by the keyword
# names, only contraction expansion, accent removal and lower-casing are
# switched on; confirm against utils.preprocessing.CorpusNormalizer.
normalizer = preprocessing.CorpusNormalizer(
                 html_stripping=False, contraction_expansion=True,
                 accented_char_removal=True, text_lower_case=True,
                 text_lemmatization=False, special_char_removal=False,
                 stopword_removal=False, remove_digits=False)

n_samples = 500    # number of English reviews to collect
n_message = 100    # print a progress line every `n_message` accepted reviews
random_seed = 0    # fixes the visiting order so re-runs draw the same sample

# Visit the rows of `clean_data` in a reproducible random order.
ids = np.random.RandomState(random_seed).permutation(len(clean_data))

sampled_columns = list(clean_data.columns) + ["processed_comments", "coref_resolved"]

# Accepted rows are collected as plain dicts and turned into a DataFrame once
# at the end. This avoids chained assignment (`frame.iloc[k][col] = value`),
# which may write to a temporary copy and silently lose the value.
records = []
i, ic = 0, 0  # i: next position in `ids`; ic: number of accepted reviews
while ic < n_samples and i < len(ids):
    data_row = clean_data.iloc[ids[i]]
    review = data_row["comments"]
    i += 1
    if (not isinstance(review, str)) or len(review) < 5:
        # Skip invalid reviews
        continue
    if "canceled" in review:
        # If it is an automated cancellation review then skip
        continue
    try:
        review_lang = langdetect.detect(review)
    except langdetect.lang_detect_exception.LangDetectException:
        # Raised when langdetect cannot decide on a language for the text;
        # anything else (including KeyboardInterrupt) is allowed to propagate.
        continue
    if review_lang != "en":
        continue

    processed_review = normalizer([review])[0]
    record = data_row.to_dict()
    record["processed_comments"] = processed_review
    # NOTE(review): this stores the whole spaCy Doc, not the resolved string
    # (`doc._.coref_resolved`). Kept as-is in case later cells rely on the Doc;
    # confirm which one is intended.
    record["coref_resolved"] = nlp(processed_review)
    records.append(record)

    ic += 1

    if ic % n_message == 0:
        # Report the number actually found (the counter was already advanced).
        print("{} / {} found.".format(ic, n_samples))

sampled_data = pd.DataFrame(records, columns=sampled_columns)

if ic < n_samples:
    # Every row was visited before the quota was reached.
    print("Only {} / {} valid reviews were available.".format(ic, n_samples))

101 / 1000 found.
201 / 1000 found.
301 / 1000 found.
401 / 1000 found.
501 / 1000 found.
601 / 1000 found.
701 / 1000 found.
801 / 1000 found.
901 / 1000 found.
1001 / 1000 found.


In [41]:
# Load a fresh copy of the stock English pipeline for entity extraction.
# The former `parse=True, tag=True, entity=True` keywords are spaCy-v1 options:
# spaCy 2 ignores them and spaCy 3 rejects them, while the default pipeline
# already includes the tagger, parser and entity recogniser.
# Note: this rebinds `nlp`, so any component added to the previous pipeline
# object (e.g. 'neuralcoref') is not part of this one.
nlp = spacy.load("en_core_web_sm")

# `nlp.pipe` processes the texts in batches, which is faster than calling
# `nlp(text)` once per row; results come back in input order, so the list
# lines up with the rows of `sampled_data`.
sampled_data["entities"] = [
    review_doc.ents for review_doc in nlp.pipe(sampled_data["comments"])
]

sampled_data.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments',
       'processed_comments', 'coref_resolved', 'entities'],
      dtype='object')

In [51]:
# Pick one sampled review at random and show its raw text followed by the
# entities spaCy extracted from it.
ind = np.random.randint(len(sampled_data))

print(sampled_data["comments"].iloc[ind])
print("")
print(sampled_data["entities"].iloc[ind])

We had a wonderful weekend at Marty’s place ! It was even better than we expected . Will definitely be coming back again someday :)

(Marty,)


In [52]:
# Hand-picked review used to inspect the entity recogniser on a known text.
# Adjacent string literals are concatenated by Python, so the value is the
# same single-line text (including the double space after "nighttime.").
sentence = (
    "We had a fantastic stay at Alvaro's apartment. "
    "It was clean and quiet during the nighttime.  "
    "The neighborhood is great and there are 2 subways within 10 min walking distance "
    "(Website hidden by Airbnb) would book the room anytime again!"
)
print(sentence)

# Run the pipeline and list every entity span it found.
doc = nlp(sentence)
print(doc.ents)

We had a fantastic stay at Alvaro's apartment. It was clean and quiet during the nighttime.  The neighborhood is great and there are 2 subways within 10 min walking distance (Website hidden by Airbnb) would book the room anytime again!
(Alvaro, 2, 10, Airbnb)


In [57]:
# Show the entity label assigned to the last entity of `doc`. In the recorded
# run that entity is "Airbnb" and the label shown is 'GPE'.
doc.ents[-1].label_

'GPE'