In [1]:
import random
# Seed the standard-library `random` generator.
# NOTE(review): with a=None the seed is taken from OS entropy / the current time,
# so this call does not make runs reproducible; pass a fixed integer if that is wanted.
random.seed(a=None, version=2)

In [2]:
import dill
import gensim
import os
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.probability import FreqDist
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import FastText
from gensim.models import Word2Vec
import spacy
from gensim.models import Phrases
import collections
from networkx.exception import NetworkXError
from networkx.generators.ego import ego_graph
from chinese_whispers import chinese_whispers, aggregate_clusters
import matplotlib.pyplot as plt
import networkx as nx
import markov_clustering as mc
import datetime

### Loading Reviews

In [30]:
# Log the wall-clock time at which the review-loading stage starts
print(datetime.datetime.now())

2019-08-06 11:48:53.824801


In [31]:
#read data
# NOTE(review): absolute, machine-specific path — consider reading it from a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv (r'C:\Users\siddhartha\Desktop\Tag Extraction Automation\V1 - Work\langid_en_300k.csv')

#filter only english data
is_df_en = df['review_langid']== 'en'

# Keep only the review text of the English rows.
# - A single `.loc` selection followed by `.copy()` gives an independent frame, so
#   the columns assigned to `df` later are not written to a slice of the raw data
#   (this is what triggers pandas' SettingWithCopyWarning).
# - Rows with a missing review are dropped because the text-processing step hands
#   every value to spaCy, which expects a string.
df = df.loc[is_df_en, ['review']].dropna(subset=['review']).copy()
# df = df.head(100)

### Pre-processing --> Bag of N-Grams-uni,bi,tri

In [33]:
# Log the wall-clock time at which the pre-processing (n-gram) stage starts
print(datetime.datetime.now())

2019-08-06 11:48:57.250931


In [35]:
# Load the spaCy pipeline named "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")
# Extend the default stop-word set in place with two extra entries.
# NOTE(review): "my_new_stopword2" looks like a placeholder, and "-PRON-" looks like
# a lemma value rather than a surface form — confirm both have the intended effect
# on `is_stop`, which process_text relies on.
nlp.Defaults.stop_words |= {"-PRON-","my_new_stopword2"}
def process_text(text):
    """Tokenise `text` with the module-level `nlp` pipeline and return clean lemmas.

    A token is kept only when it is not a stop word, not punctuation, not a
    digit, and is alphabetic; each kept token contributes its lower-cased lemma.

    Returns:
        list[str]: the lower-cased lemmas, in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Same checks, in the same order, as a single short-circuiting condition.
        if token.is_stop or token.is_punct or token.is_digit:
            continue
        if token.is_alpha:
            lemmas.append(token.lemma_.lower())
    return lemmas

# Unigrams: one list of cleaned lemmas per review (see process_text).
df['ntext'] = df['review'].apply(process_text)
# Bigrams: fit a gensim Phrases model on the unigram lists, then apply it to every
# review. Detected phrases appear as single tokens joined by "_" (e.g. "favorite_place").
# NOTE(review): min_count=5 / threshold=2 are untuned magic numbers — see the gensim
# Phrases documentation for their exact meaning.
bigram = Phrases(df['ntext'], min_count=5, threshold=2)
df['twograms'] = df['ntext'].apply(lambda x : bigram[x])
# Trigrams: fit a second Phrases model on the bigram-transformed corpus and apply it
# to the bigram lists, so that bigram tokens can be merged with a neighbouring token.
trigram = Phrases(bigram[df['ntext']], min_count=5, threshold=2)
df['threegrams'] = df['twograms'].apply(lambda x : trigram[x])

# df.head(10)

### Custom stopwords removal using "Most frequent and least frequent words as stop words"

In [36]:
# Log the wall-clock time at which the custom stop-word stage starts
print(datetime.datetime.now())

2019-08-06 13:53:52.652407


In [37]:
# Custom stop words removal using "Most frequent and least frequent terms as stop words"
# Frequency distribution over every token of every review's n-gram list.
fdist = FreqDist([w for row in df['threegrams'].tolist() for w in row])

# Total number of tokens. Hoisted out of the loop: the value is loop-invariant, and
# calling fdist.N() once per distinct term repeats the same work for every term.
total_tokens = fdist.N()

# One record per distinct term: "distri" is the term's share of all tokens, in percent
# (the ratio is rounded to 5 decimals before scaling, as in the original computation).
custom_stops_freq = [
    {"word": word, "distri": 100*round(frequency/total_tokens, 5)}
    for word, frequency in fdist.most_common()
]

# Explicit columns keep the frame well-formed even when the vocabulary is empty.
custom_stops_df = pd.DataFrame(custom_stops_freq, columns=["word", "distri"])

# Terms at or above 0.5 % (very frequent) or at or below 0.004 % (very rare) of all
# tokens are treated as custom stop words.
high_f = custom_stops_df[custom_stops_df['distri'] >= 0.5]
low_f = custom_stops_df[custom_stops_df['distri'] <= 0.004]
custom_stops_df = pd.concat([high_f, low_f])

custom_stops_list = custom_stops_df['word'].tolist()

# Build the stop-word lookup once. Converting custom_stops_list to a set inside the
# function would repeat that conversion for every review.
# NOTE: rebuild this set if custom_stops_list is changed afterwards.
custom_stops_set = frozenset(custom_stops_list)

def remove_custom_stops(wordlist):
    """Return the distinct tokens of `wordlist` that are not custom stop words.

    Args:
        wordlist: iterable of hashable tokens (one review's n-gram list).

    Returns:
        list: the unique remaining tokens. Duplicates are collapsed and the
        order is arbitrary, because the result is derived from a set difference.
    """
    return list(set(wordlist) - custom_stops_set)

df['cusstops_filtered'] = df['threegrams'].apply(remove_custom_stops)

In [38]:
# Number of terms selected as custom stop words
len(custom_stops_list)

156546

In [39]:
# Preview the first rows, including the derived token columns
df.head()

Unnamed: 0,review,ntext,twograms,threegrams,cusstops_filtered
0,My favorite place to stay in mumbai. The staff...,"[favorite, place, stay, mumbai, staff, turnove...","[favorite_place, stay, mumbai, staff, turnover...","[favorite_place_stay, mumbai, staff_turnover, ...","{mumbai, low, comfortable, member_staff, want,..."
1,The hotel has a good connection to London cent...,"[hotel, good, connection, london, centre, trad...","[hotel, good, connection, london, centre, trad...","[hotel, good_connection, london, centre, tradi...","{feel_like, pub, centre, catch, comfortable, b..."
2,Went for a weekend getaway with the Mrs. Hotel...,"[go, weekend, getaway, hotel, clean, kind, sta...","[go, weekend_getaway, hotel, clean, kind, star...","[go, weekend_getaway, hotel, clean, kind, star...","{resort, kind, awesome, take, bar, spot, searc..."
3,The hotel is a perfect location to easily get ...,"[hotel, perfect, location, easily, theatre, ac...","[hotel, perfect_location, easily, theatre, acc...","[hotel_perfect_location, easily, theatre, acco...","{guest, help, appreciate, size, easily, theatr..."
4,The hotel was very nice and well located. We ...,"[hotel, nice, locate, enjoy, breakfast, buffet...","[hotel, nice, locate, enjoy, breakfast_buffet,...","[hotel, nice, locate, enjoy, breakfast_buffet_...","{meal, return, nice_helpful, try, person, go, ..."


In [3]:
# Optional checkpoint (disabled): save the 'cusstops_filtered' column to a pickle file,
# reload it, wrap it back into a DataFrame and derive a list-valued 'resultant' column.
# NOTE(review): only read pickle files that come from a trusted source.
# import pickle
# df['cusstops_filtered'].to_pickle('df_custom_stops_cleared_300k_hotel.pickle')
# df = pd.read_pickle('df_custom_stops_cleared_300k_hotel.pickle')
# df
# df = pd.DataFrame(df)
# df.head()
# df['resultant'] = df['cusstops_filtered'].apply(lambda x : list(x))
# df

### Retrain on Existing w2v and Gensim models

In [40]:
# Log the wall-clock time at which the word-embedding stage starts
print(datetime.datetime.now())

2019-08-06 15:47:40.784649


In [12]:
# Flatten the per-review token collections into one flat list of tokens.
# The 'resultant' column exists only when the optional (commented-out) checkpoint
# code has been run; on a fresh kernel it is missing, so fall back to
# 'cusstops_filtered', which holds the same tokens per review.
corpus_column = 'resultant' if 'resultant' in df.columns else 'cusstops_filtered'
corpus = [w for row in df[corpus_column].tolist() for w in row]

len(corpus)

4085232

In [13]:
# Frequency of every token in the flattened corpus
fdist = FreqDist(corpus)
# fdist.most_common(10)

# Load the pretrained Model
# NOTE(review): only load model files that come from a trusted source.
model1 = Word2Vec.load("word2vec_model2")

# One {"word", "count"} record per distinct token whose lower-cased form is not an
# NLTK English stop word.
stops = stopwords.words("english")
wlist = [
    {"word": item, "count": f}
    for item, f in fdist.items()
    if item.lower() not in stops
]

# Most frequent words first
words_df = pd.DataFrame(wlist).sort_values(by="count", ascending=False)

# words_df.head(n=10)

def get_similarity(w):
    """Look up the nearest neighbours of `w` in the module-level `model1`.

    Returns:
        Whatever `model1.wv.most_similar(w)` returns, or an empty list when the
        lookup raises KeyError; in that case `w` is printed.
    """
    try:
        return model1.wv.most_similar(w)
    except KeyError:
        # The model has no entry for this word: report it and carry on.
        print(w)
        return []

# Attach to every word the neighbour list returned by get_similarity
words_df['similar_words'] = words_df['word'].apply(get_similarity)
# words_df.head()

# (word, neighbours) pairs, in the row order of words_df
similarlist = list(
    zip(words_df['word'].tolist(), words_df['similar_words'].tolist())
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


room_clean
stay_night
room_small
good_location
definitely_stay
staff_friendly_helpful
bed_comfortable
suite
recommend_hotel
breakfast_good
place_stay
location_great
enjoy_stay
room_comfortable
room_floor
room_spacious
location_good
hotel_great_location
great_place
spa
hotel_locate
bedroom
housekeeping
room_big
highly_recommend_hotel
comfortable_bed
shopping
clean_comfortable
desk_staff
excellent_location
breakfast_include
bar_restaurant
room_clean_comfortable
restaurant_bar
staff_excellent
relaxing
tired
location_excellent
swimming_pool
room_ready
air_conditioning
choose_hotel
friendly_helpful_staff
staff_amazing
bed_comfy
housekeep
welcoming
good_experience
ground_floor
room_size
check_check
shuttle
toiletry
staff_helpful_friendly
executive_lounge
standard_room
food_drink
definitely_recommend_hotel
stay_week
good_price
room_good_size
boutique_hotel
comfy_bed
good_night_sleep
nice_view
staff_lovely
club_lounge
pleasant_stay
short_stay
mini_bar
free_breakfast
change_room
checkout
bathro

In [14]:
len(corpus)
# Corpus sizes noted on a 1000-review sample, for comparison:
# 33277 for 1000 reviews before custom stopwords
# 26966 for 1000 reviews after custom stopwords high freq
# 24090 for 1000 reviews after custom stopwords high and low freq

4085232

### Graph Generation from Word Embeddings

In [15]:
# Log the wall-clock time at which the graph-generation stage starts
print(datetime.datetime.now())

2019-08-06 16:52:26.738399


In [16]:
# Start from an empty undirected graph
G = nx.Graph()

# Add one weighted edge per (word, neighbour) pair; the edge weight is the second
# element of each neighbour entry.
G.add_weighted_edges_from(
    (word, hit[0], hit[1])
    for word, hits in similarlist
    for hit in hits
)
        
# G.edges()

# degree_sequence = sorted([d for n, d in G.degree()], reverse=True)  # degree sequence
# # print "Degree sequence", degree_sequence
# degreeCount = collections.Counter(degree_sequence)
# deg, cnt = zip(*degreeCount.items())

# fig, ax = plt.subplots()
# plt.bar(deg, cnt, width=0.80, color='b')

# plt.title("Degree Histogram")
# plt.ylabel("Count")
# plt.xlabel("Degree")
# ax.set_xticks([d + 0.4 for d in deg])
# ax.set_xticklabels(deg)


# plt.show()

# Remove from the graph every word that occurs fewer than 5 times in the corpus.
rare_words = words_df.loc[words_df['count'] < 5, 'word'].tolist()
for w in rare_words:
    try:
        G.remove_node(w)
    except NetworkXError:
        # The word is not a node of G: report it and continue.
        print (w)
        
# G.size()
# len(G)

### Creating Ego Graph for 'G'

In [17]:
# The ego graph of G is G itself:
# if the ego graph is built with the maximum path length (radius) needed to reach every node of "G", it is the same graph as G

### Markovian Clustering using hyperparameter for 'G'

In [18]:
# Log the wall-clock time at which the Markov clustering stage starts
print(datetime.datetime.now())

2019-08-06 16:52:27.304754


In [19]:
# # with hyperparameter tuning
# nodelist = list(G)

# # then get the adjacency matrix (in sparse form)
# matrix = nx.to_scipy_sparse_matrix(G)

# # Auto hyperparameter Tuning (inflation rate)
# # perform clustering using different inflation values
# # for each clustering run, calculate the modularity
# inflation_auto = []
# for inflation in [i / 10 for i in range(12, 30)]:
#     result = mc.run_mcl(matrix, expansion=2, inflation=inflation)
#     clusters = mc.get_clusters(result)
#     Q = mc.modularity(matrix=result, clusters=clusters)
# #     print("inflation:", inflation, "modularity:", Q)
#     inflation_auto.append((inflation, Q))
    
# cols =['inflation', 'modularity']
# result_inflation_auto = pd.DataFrame(inflation_auto, columns=cols)

# # finding inflation parameter value
# inflate_value = result_inflation_auto.loc[result_inflation_auto['modularity']==result_inflation_auto['modularity'].max(), 'inflation'].iloc[0]

# # cluster using the optimized cluster inflation value
# result = mc.run_mcl(matrix, expansion=2, inflation=inflate_value)
# clusters = mc.get_clusters(result)

# # fig = plt.figure(figsize=(20,20))
# # plt.subplot(111)
# # mc.draw_graph(matrix, clusters, node_size=50, with_labels=True, edge_color="silver")
# # plt.show()

# # finding inflation parameter value
# inflate_value = result_inflation_auto.loc[result_inflation_auto['modularity']==result_inflation_auto['modularity'].max(), 'inflation'].iloc[0]
# inflate_value

In [20]:
# Clustering without hyperparameter tuning: the inflation is fixed at 1.6
# (noted by the author as the "average" value).

# Node order; it fixes the row/column order of the adjacency matrix below.
nodelist = list(G)

# Sparse adjacency matrix of G, rows/columns in `nodelist` order.
# NOTE(review): to_scipy_sparse_matrix is not available in NetworkX 3.x — confirm the
# pinned NetworkX version before upgrading.
matrix = nx.to_scipy_sparse_matrix(G, nodelist=nodelist)

# Markov clustering with the fixed inflation value, then extract the clusters.
result = mc.run_mcl(matrix, inflation=1.6, expansion=2)
clusters = mc.get_clusters(result)

In [21]:
# Log the wall-clock time at which the clustering stage ends
print(datetime.datetime.now())
# Author's timing note: 20 mins for 1000 (presumably a 1000-review sample — TODO confirm)

2019-08-06 16:52:33.293555


In [22]:
# Turn every cluster into a mutable list so its entries can be replaced in place
clusters_list_of_lists = list(map(list, clusters))
# Positions 0..len(nodelist)-1 and the position -> node label lookup
row_no = list(range(len(nodelist)))
dictionary = {position: nodelist[position] for position in row_no}
def replace_all(word_list, dictionary):
    """Replace, in place, every entry of every list by its mapped value.

    Each entry is looked up exactly once, so a value that has just been
    substituted is never substituted again (which could happen with sequential
    replacement when a mapped value equals another key). Entries that are not
    keys of `dictionary` are left unchanged.

    Args:
        word_list: list of lists whose entries are hashable (e.g. node indices).
        dictionary: mapping from entry to replacement (e.g. index -> node label).

    Returns:
        None. The inner lists of `word_list` are modified in place.
    """
    for lst in word_list:
        # Slice assignment keeps the same list object, so callers holding a
        # reference to an inner list see the update.
        lst[:] = [dictionary.get(item, item) for item in lst]

# Swap the row indices inside every cluster for the corresponding node labels (in place)
replace_all(clusters_list_of_lists, dictionary) 

In [23]:
# Print every cluster under a 1-based heading, members separated by ", "
for cluster_no, members in enumerate(clusters_list_of_lists, start=1):
    print('\ncluster->', cluster_no, '\n')
    print(*members, sep=", ")
    
# 4591


cluster-> 1 

restaurant, this_restaurant, resturant, place, italian_restaurant, this_place, place_eat, liman, restaurant_london, chinese_restaurant, cheap_eat, local, japanese_restaurant, asian_restaurant, chinese_food, indian_restaurant, italian_food, one_favorite, lebanese_restaurant, restaraunt, jazz_club

cluster-> 2 

venue, view_london, great_view, cocktail_bar, view, amaze_view, view_amaze, view_city, fantastic_view, beautiful_view, set, fabulous_view, view_incredible, stun_view, amazing_view, view_stun, wonderful_view, incredible_view, outstanding_view

cluster-> 3 

eatery, restuarant, office_build

cluster-> 4 

establishment, restaurant_paris, complain_lack

cluster-> 5 

location, great_location, central_london, chill_atmosphere, nice_ambiance, worth_visit

cluster-> 6 

dine_room, lovely_garden, downstairs, upstairs, bar, bar_area, sit_bar, hang_bar, seat_bar, door, floor, window, table_right, table_window, table_outside, outside, inside, inside_outside, table_inside, sp

option, menu_offer, plenty, nice_selection, good_variety, good_choice, good_selection, selection, great_variety, cold_warm, great_choice, lot_choice, wide_selection, great_selection, plenty_variety, lot_variety, lot_choose, hard_choose, include_bottle

cluster-> 121 

offering, price_possibly, offer_wide, delicious_unusual, classic_french

cluster-> 122 

choose, order_different, select_dish, course_taste, al_la_carte, opt, please_choice, order_appetizer, partner_order, selection_different, offer_pre, not_suitable

cluster-> 123 

set_menu, starter_main, choice, taste_menu, course, course_desert, match_wine, set_lunch

cluster-> 124 

walk, book_problem

cluster-> 125 

enter, immediately_get, upon_arrive, reach_restaurant, entire, whole, first_let, private_dine, arrive_saturday, ask_window

cluster-> 126 

wander, wait_outside, want_sit, entrance_restaurant, stumble

cluster-> 127 

seat_right, move, pass, away

cluster-> 128 

europe, uk, india, hong_kong, usa, european, france, spai

standard, maintain_high, reasonable_value

cluster-> 315 

quality, quantity

cluster-> 316 

basic, price_steep, self_service, term, michelin, non, food_wise

cluster-> 317 

high_standard, world_class, pleasurable_experience, class, attentive_obtrusive, attentive_kind, satisfactory, view_outstanding, enjoyable_even, excellent_quality, locate_basement, overall_average

cluster-> 318 

extremely_high, what_expectsmall_plate, item, half_portion, plate, prawn_linguine

cluster-> 336 

polite_helpful, speak_english, extremely_attentive, attentive_professional, kind_helpful, polite_professional, attentive_prompt, welcome_attentive

cluster-> 337 

pleasant, prompt_friendly

cluster-> 338 

attentive_intrusive, thank_make, extra_special, even_thank, absolutely_fabulous, start_finish, professional_attentive

cluster-> 339 

far

cluster-> 340 

head_include, price_point, wine_expensive, premium, cost_euro, expensive_average, two_course

cluster-> 341 

rainforest_cafe, adventure, -PRON-_conf

apart, too_bad, love_idea, spoil_experience, key, look_interest, critical, essential, fail_deliver, happy_explain, need_know, give_opportunity, obvious, apparent, evident, not_wrong, understandable, necessity, priority

cluster-> 635 

spend_day, stay_nearby, walk_hotel, kings_road, walk_brooklyn, barbican, ihop, pop_lunch, stroll

cluster-> 636 

cheap_price, price_bad, economical, reliable, dependable, atmosphere_unique, nice_environment, will_help

cluster-> 637 

feel_uncomfortable, before_leave, message, josephine, close_window, tour_kitchen, ask_happen

cluster-> 638 

complete, equally_impressive, truly_wonderful, as_pre, mother_law, sit_counter

cluster-> 639 

oven, microwave, water_serve, barely_warm

cluster-> 640 

small_slice, toast, roast_potato, for_dessert, cheese_mushroom

cluster-> 641 

free_wifi, draft_beer, just_right, sausage_roll, mum_happy, dish_prepare, can_wrong, only_downside, stop_rave, tea_scone

cluster-> 642 

comfort, classy_restaurant, step_time, typica

convention, gathering

cluster-> 1019 

complementary, eye_catch, gluten_free_vegan, wonderfully_prepare

cluster-> 1020 

seating, view_window

cluster-> 1021 

summary, description, brief

cluster-> 1022 

public, in_summary, paper_towel

cluster-> 1023 

rack, shelf, stack, glass_home

cluster-> 1024 

louvre, overlook_london

cluster-> 1025 

draw, right_opposite, drawing

cluster-> 1026 

crew, cast

cluster-> 1027 

superior, inferior, attentive_hover, unmatched

cluster-> 1028 

terminal, port

cluster-> 1029 

soap, vinegar, poppadom, syrup, ginger_beer, sponge, candy

cluster-> 1030 

meter, ft, m

cluster-> 1031 

memory, thank_hutong, have_previously, daniel_boulud, twice_year, past_month

cluster-> 1032 

print, exotic_meat, complain_waitress

cluster-> 1033 

cramp, niggle, bad_point, line_door

cluster-> 1034 

body, on_way

cluster-> 1035 

doorstep, after_have, visit_australia, big_chain

cluster-> 1036 

freeze, frozen, skinny_fry

cluster-> 1037 

damage, hurt, impact

In [24]:
# Framework as df
# One (cluster number, number of keys, keys) record per cluster, numbered from 1
df_clusters_framework = [
    (cluster_no, len(keys), keys)
    for cluster_no, keys in enumerate(clusters_list_of_lists, start=1)
]

cols = ['tag_no/cluster_no', 'keys_length', 'keys']
df_clusters_labelled_framework = pd.DataFrame(df_clusters_framework, columns = cols)
# df_clusters_labelled

# Drop the clusters that contain exactly one key
has_other_than_one_key = df_clusters_labelled_framework['keys_length'] != 1
df_clusters_labelled_framework = df_clusters_labelled_framework[has_other_than_one_key]

In [25]:
# Write the remaining clusters to a CSV file in the current working directory
df_clusters_labelled_framework.to_csv('Output_Clustered_Keys.csv') 

In [26]:
# Log the wall-clock time at which the export finishes
print(datetime.datetime.now())

2019-08-06 16:52:55.950005


In [None]:
# total clusters before custom stop words = 634