In [1]:
import numpy as np
import pandas as pd
import sys

In [2]:
# Display the Python interpreter version this notebook was run with.
sys.version

'3.6.5 (default, Apr 25 2018, 14:23:58) \n[GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)]'

In [3]:
# Location of the reviews CSV, kept in one named constant so it is easy to
# change. NOTE(review): this is an absolute path on the original author's
# machine -- point it at your local copy of reviews.csv before running.
REVIEWS_CSV_PATH = "/Users/samaraarkotti/Downloads/reviews/reviews.csv"

reviews = pd.read_csv(REVIEWS_CSV_PATH)

In [4]:
# Preview the first rows of the raw reviews table.
# To inspect a single listing instead: reviews.loc[reviews['listing_id'] == 10730]
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [5]:
from nltk.corpus import stopwords   # stopwords to detect language
from nltk import wordpunct_tokenize # function to split up our words

def get_language_likelihood(input_text):
    """Return a dictionary of languages and their likelihood of being the
    natural language of the input text.

    The "likelihood" is a raw count: the number of distinct tokens of
    ``input_text`` (lower-cased, split with ``wordpunct_tokenize``) that
    also appear in the NLTK stopword list for that language.

    Parameters
    ----------
    input_text : str
        Text whose language should be scored.

    Returns
    -------
    dict
        Maps every file id of the NLTK ``stopwords`` corpus (one per
        language) to its number of matching distinct tokens.
    """
    # Build the token set once instead of once per language.
    input_words = set(wordpunct_tokenize(input_text.lower()))

    # fileids() is the public accessor for the corpus reader's file ids.
    return {
        language: len(input_words.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }
 
def get_language(input_text):
    """Pick the language whose stopwords overlap the given text the most.

    Ties are resolved in favour of the language that appears first in the
    dictionary returned by ``get_language_likelihood``.
    """
    scores = get_language_likelihood(input_text)
    # Rank (language, score) pairs from highest to lowest score; the sort is
    # stable, so equally scored languages keep their original order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    best_language, _ = ranked[0]
    return best_language

In [6]:
# Keep only the reviews that actually have comment text. A vectorised null
# mask on the single column replaces the row-by-row apply(axis='columns'),
# which invoked a Python lambda once per row to test the same column.
reviews_df = reviews[reviews['comments'].notnull()]

In [7]:
# Reviews of a single listing (id 10730), used as the worked example below.
example_listing_reviews = reviews_df.loc[reviews_df['listing_id'] == 10730]

In [8]:
from nltk import word_tokenize

In [9]:
# Tokenise every review of the example listing and flatten the tokens into a
# single 1-D string array. The tokens are flattened directly rather than via
# np.array([...lists of different lengths...]): building an array from ragged
# lists is deprecated and raises ValueError on recent NumPy releases.
words = np.array(
    [
        token
        for review in example_listing_reviews['comments'].values
        for token in word_tokenize(review)
    ],
    dtype=str,
)

In [10]:
# Show the flattened token array for the example listing.
words

array(['I', 'went', 'to', ..., 'all', 'the', 'sights'], dtype='<U17')

In [11]:
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, BigramCollocationFinder

# Association measures object; its `pmi` scorer is used below to rank bigrams.
bigram_measures = BigramAssocMeasures()
# Build a bigram finder from the example listing's flattened token array.
finder = BigramCollocationFinder.from_words(words)

# Apply a frequency filter of 3, then list the 10 best bigrams ranked by PMI.
finder.apply_freq_filter(3) 
finder.nbest(bigram_measures.pmi, 10)  

[('centrally', 'located'),
 ('an', 'amazing'),
 ('If', 'you'),
 ('next', 'time'),
 ('view', 'of'),
 ('very', 'nice'),
 ('The', 'kitchen'),
 ('very', 'helpful'),
 ('a', 'hotel'),
 ('than', 'a')]

In [12]:
# One flat token array per listing: every review of the listing is tokenised
# and the tokens are concatenated in review order. The tokens are flattened
# directly instead of wrapping the per-review lists in np.array(...), because
# creating an array from lists of different lengths is deprecated and raises
# ValueError on recent NumPy releases.
review_words = reviews_df.groupby('listing_id').apply(
    lambda df: np.array(
        [
            token
            for review in df['comments'].values
            for token in word_tokenize(review)
        ],
        dtype=str,
    )
)

In [13]:
import string

# Quick check of the punctuation filter on a toy token list.
ex = ['Hi', 'there', '.', '?', '!', ',']
# Note: `w not in string.punctuation` is a substring test against the
# punctuation string, which is sufficient for these single-character tokens.
[w for w in ex if w not in string.punctuation]

['Hi', 'there']

In [15]:
# Drop punctuation tokens from every listing's token array.
# A token is treated as punctuation when *every* character is a punctuation
# mark. The previous `w not in string.punctuation` was a substring test, so
# multi-character tokens such as "...", "--" or the tokenizer's quote marks
# ("``" and "''") were kept. Everything the substring test removed
# (including empty tokens) is still removed.
review_words_f = review_words.map(
    lambda arr: np.array(
        [w for w in arr if not all(ch in string.punctuation for ch in w)]
    )
)

In [16]:
# Preview the punctuation-free token arrays for the first few listings.
review_words_f.head()

listing_id
3353    [Very, friendly, and, helpful, Convenient, loc...
5506    [Terry, 's, Hotel, Alterntv, in, Boston, was, ...
6695    [Terry, 's, apartment, is, beautifully, decora...
6976    [A, Wonderful, pleasant, and, charming, host, ...
8792    [The, unit, was, quiet, convenient, excellentl...
dtype: object

In [17]:
def reattach_contractions(wordlist):
    """Glue tokenizer-split contraction suffixes back onto the previous word.

    A token is treated as a suffix when it starts with an apostrophe
    (e.g. ``'s``, ``'re``) or is exactly ``n't``; it is appended to the
    token before it, so ``["do", "n't"]`` becomes ``["don't"]``.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens in their original order.

    Returns
    -------
    list of str
        Tokens with suffixes merged into their preceding word. A suffix
        with no preceding word (first token of the input) is kept as-is.
    """
    words = []
    for word in wordlist:
        # startswith() is safe for empty tokens, unlike word[0].
        is_suffix = word.startswith("'") or word == "n't"
        # Only merge when there is a previous word to attach to; otherwise
        # words[-1] would raise IndexError on an empty list.
        if is_suffix and words:
            words[-1] = words[-1] + word
        else:
            words.append(word)
    return words

In [18]:
# Merge contraction suffixes back onto their words for every listing.
# Note: this rebinds review_words_f, replacing the arrays from the
# punctuation-filter cell with the lists returned by reattach_contractions.
review_words_f = review_words_f.map(reattach_contractions)

In [19]:
def bigramify(words, min_freq=3, top_n=3):
    """Return the top bigrams of a token sequence, ranked by PMI.

    Parameters
    ----------
    words : sequence of str
        Tokens in order; bigrams are formed from adjacent tokens.
    min_freq : int, optional
        Value passed to the finder's frequency filter (default 3, the
        previously hard-coded value).
    top_n : int, optional
        Maximum number of bigrams to return (default 3, the previously
        hard-coded value).

    Returns
    -------
    list of tuple
        Up to ``top_n`` ``(word, word)`` pairs; may be shorter or empty when
        few bigrams pass the frequency filter.
    """
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(min_freq)
    # bigram_measures is the module-level BigramAssocMeasures instance.
    return finder.nbest(bigram_measures.pmi, top_n)

# Compute the top bigrams for every listing from its cleaned token list.
review_bigrams = review_words_f.map(bigramify)

In [20]:
# Preview the top bigrams for the first 20 listings.
review_bigrams.head(20)

listing_id
3353             [(green, line), (T, station), (at, home)]
5506     [(Hotel, Alternative), (better, than), (3, nig...
6695     [(Back, Bay), (public, transportation), (Fort,...
6976      [(Forest, Hills), (Roslindale, Village), (’, s)]
8792            [(Beacon, Hill), (an, excellent), (I, am)]
9273                   [(when, we), (I, would), (If, you)]
9765                                [(of, the), (to, the)]
9824            [(Back, Bay), (small, but), (on, Newbury)]
9855                [(of, the), (the, reception), (is, a)]
9857       [(2nd, bedroom), (enjoyed, our), (here, again)]
9858                                                    []
9860         [(Great, location), (distance, to), (a, bit)]
9870                                                    []
10730     [(centrally, located), (an, amazing), (If, you)]
10758                                                   []
10809                [(close, to), (it, was), (The, room)]
10811                                        

In [21]:
def sample_reviews(listing_id, context_chars=47):
    """Return one highlighted review excerpt per top bigram of a listing.

    For each bigram in ``review_bigrams[listing_id]`` the first review of
    that listing containing the two words separated by a single space is
    located, every occurrence of the phrase is wrapped in ``****``, and a
    window of text around the first occurrence is returned.

    Parameters
    ----------
    listing_id : int
        Listing whose bigrams and reviews are looked up in the module-level
        ``review_bigrams`` and ``reviews`` objects.
    context_chars : int, optional
        Number of characters kept before and after the start of the
        highlighted phrase (default 47, the previously hard-coded value).

    Returns
    -------
    list of str
        Excerpts of the form ``"...<text>..."``. Bigrams that do not occur
        verbatim in any review are skipped, so the list can be shorter than
        the listing's bigram list.
    """
    bigrams = review_bigrams[listing_id]
    listing_comments = reviews.loc[reviews['listing_id'] == listing_id, 'comments']
    # `reviews` is the unfiltered table: drop missing comments so the
    # substring test below only ever sees strings.
    review_texts = listing_comments.dropna().values

    excerpts = []
    for bigram in bigrams:
        phrase = " ".join(bigram)
        matching_review = next((txt for txt in review_texts if phrase in txt), None)
        if matching_review is None:
            # The bigram comes from cleaned tokens (punctuation removed,
            # contractions re-joined), so the space-joined phrase is not
            # guaranteed to appear literally in the raw text.
            continue
        highlighted = matching_review.replace(phrase, "****" + phrase + "****")
        start_index = highlighted.index("****")
        # Clamp at 0: a negative slice start would count from the end of the
        # string and garble the excerpt when the phrase is near the start.
        window_start = max(0, start_index - context_chars)
        window_end = start_index + context_chars
        excerpts.append("..." + highlighted[window_start:window_end] + "...")
    return excerpts

In [24]:
# Print the highlighted excerpts for listing 3353, one per line.
for review in sample_reviews(3353):
    print(review)

...e is very nicely located - a few blocks to the ****green line**** T station and bus stops, wit...
...icely located - a few blocks to the green line ****T station**** and bus stops, with several r...
...hen I first arrived there. Usually ,he was not ****at home**** and works a lot. There are thre...


In [25]:
# Print the highlighted excerpts for the example listing 10730, one per line.
for review in sample_reviews(10730):
    print(review)

...appartment was real value for (URL HIDDEN) was ****centrally located**** and provided easy acc...
...h. The apartment had everything we needed plus ****an amazing**** view of Boston from the roof...
...need.
The bed sofa can accomodate two people. ****If you**** plan on having visitors note the...
