In [1]:
from data_processing_features import get_data, text_data_cleaning
from model_building import tfidf_features_fit
import pandas as pd
from sklearn.cluster import KMeans
from rake_nltk import Rake

# Load the raw New Delhi review dataset through the project helper.
df = get_data('New_Delhi_Reviews')

# Run the project's text-cleaning helper over the reviews.
df = text_data_cleaning(df)

# Presumably returns the fitted TF-IDF vectoriser and its feature matrix — TODO confirm.
tfidf, tfidf_matrix = tfidf_features_fit(df)

# Index-aligned inner join of the review rating with the TF-IDF features.
# NOTE(review): the rating column is clustered together with the TF-IDF weights — confirm this is intended.
df_clustering = pd.merge(
    left=df['rating_review'],
    right=tfidf_matrix,
    how='inner',
    left_index=True,
    right_index=True,
)

# Fit a 20-cluster k-means model with a fixed random_state.
kmeans = KMeans(n_clusters=20, n_init="auto", random_state=0)
kmeans.fit(df_clustering)

# Store each row's cluster id next to its features.
df_clustering['cluster_labels'] = kmeans.labels_



data imported
starting data cleaning
converted to lowercase
removed punctuations
removed stopwords
applied stemming
applied lemmatization


In [2]:
# Preview the first rows of the clustering frame (rating, TF-IDF columns, cluster label).
df_clustering.head()

Unnamed: 0,rating_review,10,45,absolut,accompani,across,actual,ad,add,afternoon,...,word,work,world,worth,ye,year,yet,your,yummi,cluster_labels
0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.210969,0.229975,0.0,8
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
4,5,0.0,0.0,0.15213,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.172863,0.0,0.0,0.0,0.0,3


In [3]:
# List the column labels of the clustering frame.
df_clustering.columns

Index(['rating_review', '10', '45', 'absolut', 'accompani', 'across', 'actual',
       'ad', 'add', 'afternoon',
       ...
       'word', 'work', 'world', 'worth', 'ye', 'year', 'yet', 'your', 'yummi',
       'cluster_labels'],
      dtype='object', length=561)

In [4]:
# Attach the cluster label of each row to the cleaned review frame (index-aligned inner join).
merged_df = df.merge(
    right=df_clustering['cluster_labels'],
    how='inner',
    left_index=True,
    right_index=True,
)

In [5]:
# Preview the reviews together with their assigned cluster labels.
merged_df.head()

Unnamed: 0,rating_review,review_full,review_original,cluster_labels
0,5,total love auro place realli beauti quit fanci...,"Totally in love with the Auro of the place, re...",8
1,5,go bar 8 day regularli husband fulli satisfi s...,I went this bar 8 days regularly with my husba...,8
2,5,friend birthday celebr food good tast realli f...,We were few friends and was a birthday celebra...,8
3,5,fatjar cafe market perfect place casual lunch ...,Fatjar Cafe and Market is the perfect place fo...,8
4,5,hey guy crave pizza search visit cafe ye highl...,"Hey Guys, if you are craving for pizza and sea...",3


# Implement RAKE for keyword extraction

In [67]:
from rake_nltk import Rake
from nltk.corpus import stopwords
import string

# English stop words from NLTK and ASCII punctuation; both are passed to RAKE below.
stopwords_list = stopwords.words('english')
exclude = string.punctuation

# Restrict candidate phrases to 10-15 words and do not keep repeated phrases.
# `language` uses NLTK's corpus name ('english'); the previous value 'en' is not an
# NLTK stopword corpus name and was only harmless because an explicit stop-word list is supplied.
r = Rake(
    stopwords=stopwords_list,
    punctuations=exclude,
    language='english',
    max_length=15,
    min_length=10,
    include_repeated_phrases=False,
)

In [71]:
# Feed the original reviews of cluster 11 to RAKE; each review is passed as one "sentence".
cluster_reviews = merged_df.loc[merged_df['cluster_labels'] == 11, 'review_original']
r.extract_keywords_from_sentences(cluster_reviews)

In [72]:
# Tabulate RAKE's ranked (score, phrase) pairs; column 0 holds the score, column 1 the phrase.
ranked_phrases = r.get_ranked_phrases_with_scores()
t = pd.DataFrame(ranked_phrases)
t.head()

Unnamed: 0,0,1
0,188.285714,895 /- plus taxes contained .... ... two small...
1,186.962963,good service 진짜 인도에서 온곳중에 최악이었어요 가지마세요 .. 맛도 최...
2,184.733333,order main yha se cancel kr deti hu aur tum de...
3,180.175926,regular thick cold coffee .. garlic bread .. u...
4,169.0,rotolo di mozzarella con carciofi e rucola con...


# Use spaCy to extract summarised reviews

In [6]:
# Work on an independent (deep) copy so merged_df itself is left untouched.
df1 = merged_df.copy(deep=True)

In [55]:
# Preview the working copy of the labelled reviews.
df1.head()

Unnamed: 0,rating_review,review_full,review_original,cluster_labels
0,5,total love auro place realli beauti quit fanci...,"Totally in love with the Auro of the place, re...",8
1,5,go bar 8 day regularli husband fulli satisfi s...,I went this bar 8 days regularly with my husba...,8
2,5,friend birthday celebr food good tast realli f...,We were few friends and was a birthday celebra...,8
3,5,fatjar cafe market perfect place casual lunch ...,Fatjar Cafe and Market is the perfect place fo...,8
4,5,hey guy crave pizza search visit cafe ye highl...,"Hey Guys, if you are craving for pizza and sea...",3


In [10]:
# Concatenate every original review of a cluster into one space-separated string.
# transform() repeats that string on each row of the cluster, so drop_duplicates()
# keeps it on the first occurrence only; index alignment leaves NaN on the other rows.
joined_reviews = df1.groupby(['cluster_labels'])['review_original'].transform(
    lambda reviews: ' '.join(reviews)
)
df1['new'] = joined_reviews.drop_duplicates()

In [11]:
# Keep only the rows that carry a joined cluster text in 'new'.
# Restricting dropna() to that column prevents a missing value in an unrelated
# column from silently discarding a cluster's only text row. reset_index() keeps
# the previous row labels in a new 'index' column.
df1 = df1.dropna(subset=['new']).reset_index()

In [12]:
# Show up to 25 rows of the per-cluster text frame.
df1.head(25)

Unnamed: 0,index,rating_review,review_full,review_original,cluster_labels,new
0,0,5,total love auro place realli beauti quit fanci...,"Totally in love with the Auro of the place, re...",8,"Totally in love with the Auro of the place, re..."
1,4,5,hey guy crave pizza search visit cafe ye highl...,"Hey Guys, if you are craving for pizza and sea...",3,"Hey Guys, if you are craving for pizza and sea..."
2,9,5,real nice comfort bar whiskey man get huge var...,Real nice and comfortable bar. I am a whiskey ...,5,Real nice and comfortable bar. I am a whiskey ...
3,10,4,drink happi hour good select excel servic bar ...,"We had drinks during happy hour, good selectio...",2,"We had drinks during happy hour, good selectio..."
4,13,5,fli visit amaz emmanuel attent help appreci he...,A flying visit but has been amazing. Emmanuel ...,16,A flying visit but has been amazing. Emmanuel ...
5,15,5,amaz food amaz hospit friendli staffhad fun mr...,Amazing food. Amazing hospitality.. friendly s...,4,Amazing food. Amazing hospitality.. friendly s...
6,16,5,realli amaz food mouthwat dessert chef ranvir ...,I had really amazing food with mouthwatering d...,13,I had really amazing food with mouthwatering d...
7,24,4,stay near visit place huge place load option e...,was staying near by so visited this place. hug...,12,was staying near by so visited this place. hug...
8,50,5,servic provid shubham naveen praiseworthi food...,Services provided by shubham and naveen are p...,0,Services provided by shubham and naveen are p...
9,73,5,food servic good staff hospit polit binay serv...,Food and services were very good The staff wa...,9,Food and services were very good The staff wa...


In [60]:
# Character count of the joined text stored on the second row of df1.
len(df1.loc[df1.index[1], 'new'])

9350332

In [16]:

import spacy

In [20]:
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')


In [21]:
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation


In [27]:
# Length of the first row's joined text after capping it at 100,000 characters.
len(df1.loc[df1.index[0], 'new'][:100000])

100000

In [41]:
# Parse the first 100,000 characters of the first row's joined cluster text.
first_cluster_text = df1['new'].loc[df1.index[0]]
doc = nlp(first_cluster_text[:100000])


In [29]:
# Collect the text of tokens tagged PROPN/ADJ/NOUN/VERB that are neither spaCy
# stop words nor found in string.punctuation.
# A set keeps each stop-word test O(1); the previous list copy was scanned linearly per token.
# NOTE: this rebinds the name `stopwords`, which an earlier cell imports from nltk.corpus.
stopwords = set(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
keyword = [
    token.text
    for token in doc
    if token.pos_ in pos_tag
    and token.text not in stopwords
    and token.text not in punctuation
]

In [30]:
from collections import Counter

In [31]:

# Count how often each keyword occurs and print the five most frequent ones.
freq_word = Counter()
freq_word.update(keyword)
top_five = freq_word.most_common(5)
print(top_five)

[('food', 455), ('servic', 315), ('great', 291), ('good', 281), ('place', 253)]


In [32]:
# Score each sentence by summing the frequency of every keyword token it contains.
# Sentences without any keyword never get an entry.
sent_strength = {}
for sentence in doc.sents:
    for token in sentence:
        weight = freq_word.get(token.text)
        if weight is None:
            continue
        sent_strength[sentence] = sent_strength.get(sentence, 0) + weight
print(sent_strength)

{total love auro place realli beauti quit fanci time ambienc pure give sens posit throughout outdoor indoor interior quit quaint cute love open kitchen idea whole marketplac ideolog due coronoviru specif use dispos cutleri keep pandem mind take precautionari measur begin place mask staff use good sanitis food realli amaz special pizza straight oven hummu pita bread quit delici your look classi yet sooth italian place delhifatjar go go bar: 2887, 8 day regularli husband fulli satisfi servic staff good vitoni amen serv us daili sure visit highli recommend ❣️ friend birthday celebr food good tast realli fresh love highli recom fatjar cafe market perfect place casual lunch love one ambienc delight food total best thing place purchas choos oliv lot directli place take prevent measur spread covid19 say without hesit far safest place felt look action ✨ look special meal find fatjar well present tasti food whilst expens usual pay well worth experi chicken roast veget chicken tender tasti veget

In [34]:
from heapq import nlargest

In [35]:
# Take the three sentences with the highest accumulated keyword score.
summarized_sentences = nlargest(
    3,
    sent_strength,
    key=lambda sentence: sent_strength[sentence],
)
print(summarized_sentences)

[im express thing caus im also bartenderand last vika keep great work confid spend hour bargood ambiencefriendli staffgreat chat sandeep bartend bar sign great servic hospit industri impress drink quoin bar enjoy lot bar tender say rockstar champ come mumbai last night dri day ganga bar tender offer nice drink refresh virgin mojito mani time like appreci ganga waiter aquib provid best servic time india buffet good select excel servic make five star experi two margarita lobbi bar luci bartend great engag make perfect margarita chan serv bar snack ladi great make feel welcom visit novotel pullman visit lobbi bar quick drink light bite thank much luci chan highli recommend visit place friend love ambienc love drink especi want mention avinash make amaz cocktail friend quoin bar sport bar novotel aeroc go busi meet avinash kumar suggest best offer gud choic starter realli appreci servic loyalti toward guest compani best futur gener like go new hotel tri someth new come enjoy match enjoy dr

In [39]:
from gensim.summarization import summarize


In [61]:
# Ask gensim for a summary of the first row's joined cluster text (word_count=50).
# NOTE(review): gensim.summarization was removed in gensim 4.0, so this cell presumably
# needs gensim < 4 — confirm the pinned version.
first_cluster_text = str(df1['new'].loc[df1.index[0]])
summarize(first_cluster_text, word_count=50)


# Use spaCy for summarization (normalized word frequencies)

In [7]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

In [9]:
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

In [47]:
# Parse the joined text stored at row position 13 of df1, capped at 1,000,000 characters.
# NOTE(review): the cap presumably mirrors spaCy's default `nlp.max_length` — confirm.
cluster_text = df1['new'].loc[df1.index[13]]
doc = nlp(cluster_text[:1000000])

In [48]:
# Number of sentences spaCy detected in the parsed text.
sum(1 for _ in doc.sents)

12043

In [49]:
# Collect the text of tokens tagged PROPN/ADJ/NOUN/VERB that are neither spaCy
# stop words nor found in string.punctuation.
# A set keeps each stop-word test O(1); the previous list copy was scanned linearly per token.
# NOTE: this rebinds the name `stopwords`, which an earlier cell imports from nltk.corpus.
stopwords = set(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
keyword = [
    token.text
    for token in doc
    if token.pos_ in pos_tag
    and token.text not in stopwords
    and token.text not in punctuation
]

In [50]:
# Count how often each keyword occurs and print the five most frequent ones.
freq_word = Counter()
freq_word.update(keyword)
top_five = freq_word.most_common(5)
print(top_five)

[('food', 1699), ('restaurant', 952), ('place', 924), ('service', 777), ('good', 540)]


In [51]:
# Debug check of the container type holding the keyword counts.
type(freq_word)

collections.Counter

In [52]:
# Scale every keyword count by the highest count so the most frequent keyword scores 1.0.
# The weights are rebuilt from `keyword` on every run, so re-executing this cell cannot
# divide already-normalised values a second time.
raw_counts = Counter(keyword)
# default=1 avoids an error (and a division by zero) when no keywords were collected.
max_freq = max(raw_counts.values(), default=1)
freq_word = Counter({word: count / max_freq for word, count in raw_counts.items()})
freq_word.most_common(3)

[('food', 1.0),
 ('restaurant', 0.5603296056503826),
 ('place', 0.5438493231312537)]

In [53]:
# Score each sentence by summing the normalised weight of every keyword token it contains.
# Sentences without any keyword get no entry.
sent_strength = {}
for sent in doc.sents:
    weights = [freq_word[word.text] for word in sent if word.text in freq_word]
    if weights:
        sent_strength[sent] = sum(weights)
# Show only how many sentences were scored: printing the whole mapping floods the
# output with thousands of sentence spans.
len(sent_strength)



In [54]:
# Pick the single sentence with the highest accumulated keyword weight.
summarized_sentences = nlargest(
    1,
    sent_strength,
    key=lambda sentence: sent_strength[sentence],
)
print(summarized_sentences)

[I went here with great expectations after reading so many good reviews on this site but to my surprise the place was a very smallish cafe and the food that they had on offer was Pathetic  i took a group of 50 people in here after reading the reviews here and we tried 5 different bread and 5 pastries and they were all stale and were smelling bad as well -i was astonished to see so many foreign tourist in this place, probably thinking that this is the best Delhi has to offer the staff was polite and the service was good the prices too were reasonable but when i go to a bakery i expect good food - but they didn't have anything good with them it seems this place is only good for people who are running short on cash and would eat crap if it is served cheap i have seen the previous reviews here and they all seem fake as i can see people decided only to review this restaurant/cafe in Delhi(only 1-2 reviews ever given by these people) whereas there is a lot more to have in Delhi Don't bother 

In [55]:
# Convert the selected spaCy spans to plain strings and join them into the summary text.
final_sentences = []
for sentence in summarized_sentences:
    final_sentences.append(sentence.text)
summary = ' '.join(final_sentences)
print(summary)

I went here with great expectations after reading so many good reviews on this site but to my surprise the place was a very smallish cafe and the food that they had on offer was Pathetic  i took a group of 50 people in here after reading the reviews here and we tried 5 different bread and 5 pastries and they were all stale and were smelling bad as well -i was astonished to see so many foreign tourist in this place, probably thinking that this is the best Delhi has to offer the staff was polite and the service was good the prices too were reasonable but when i go to a bakery i expect good food - but they didn't have anything good with them it seems this place is only good for people who are running short on cash and would eat crap if it is served cheap i have seen the previous reviews here and they all seem fake as i can see people decided only to review this restaurant/cafe in Delhi(only 1-2 reviews ever given by these people) whereas there is a lot more to have in Delhi Don't bother c