In [None]:
import numpy as np 
import pandas as pd
import os
import seaborn as sns
print(os.listdir("../input"))
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from wordcloud import WordCloud
from geopy.geocoders import Nominatim
from folium.plugins import HeatMap
import folium
from tqdm import tqdm
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.colors as mcolors
from sklearn.manifold import TSNE
from gensim.models import word2vec
import nltk
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
df = pd.read_csv('../input/zomato-bangalore-restaurants/zomato.csv')
df.head()

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
# Give the three columns whose names contain parentheses/spaces plain identifiers,
# so later cells can refer to them as e.g. df.listed_in_city.
df = df.rename({'approx_cost(for two people)':'two_ppl_avg', 'listed_in(type)':'listed_in_type', 'listed_in(city)':'listed_in_city'}, axis='columns')

In [None]:
df.head()

In [None]:
df.listed_in_city.unique()

In [None]:
df.location.unique()

In [None]:
len(df.name.unique())

<h2>EDA</h2>

Top restaurant chains in Bangalore

In [None]:
# Horizontal bar chart of the ten restaurant names that occur most often.
df1 = df['name'].value_counts().head(10)
plt.figure(figsize=(15, 6))
sns.barplot(x=df1.values, y=df1.index)
plt.xlabel('Number of restaurants')
plt.title('Most famous chains in Bangalore')
plt.show()

Online orders

In [None]:
plt.figure(figsize=(10,10))
# Name the column explicitly: recent seaborn releases interpret the first
# positional argument of countplot as `data`, not as the variable to count.
ax = sns.countplot(x='online_order', data=df)
plt.title('Restaurants taking online orders')

# Label every bar with its height.
# - The annotated point (p.get_x() + p.get_width() / 2., p.get_height()) is the
#   centre of the bar's top edge; p.get_x() is the bar's left edge.
# - xytext is the label offset RELATIVE TO that point, in points (textcoords).
# - ha/va choose which part of the text sits at that position; with "center"
#   the label is centred on it instead of starting there.
for p in ax.patches:
    ax.annotate("%.2f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha="center", va="center",
                fontsize=11, color='black', xytext=(0, 10),
                textcoords='offset points')
plt.show()

In [None]:
# Print the geometry of each bar drawn by the count plot above
# (left edge, width, height) to show what the annotation code works from.
for p in ax.patches:
    print(p.get_x())
    print(p.get_width())
    print(p.get_height())

In [None]:
# Pie chart of how many restaurants do / do not accept online orders.
colors = ['#FEBFB3', '#E1396C']
df2 = df['online_order'].value_counts()

trace = go.Pie(
    labels=df2.index,
    values=df2.values,
    textinfo="value",
    marker={'colors': colors, 'line': {'width': 2}},
)
layout = go.Layout(
    title='Accepting vs not accepting online orders',
    width=500,
    height=500,
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()

Table bookings

In [None]:
# Pie chart of how many restaurants do / do not offer table booking.
colors = ['#96D38C', '#D0F9B1']
df3 = df['book_table'].value_counts()

trace = go.Pie(
    labels=df3.index,
    values=df3.values,
    textinfo="value",
    marker={'colors': colors, 'line': {'width': 2}},
)
layout = go.Layout(
    title='Accepting vs not accepting table bookings',
    width=500,
    height=500,
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()

Rating distribution

In [None]:
# Turn the rating strings into floats by taking the part before the "/".
# Strings of three characters or fewer are mapped to NaN and then dropped.
df4 = (
    df['rate']
    .dropna()
    .apply(lambda x: float(x.split('/')[0]) if (len(x) > 3) else np.nan)
    .dropna()
)
df4

In [None]:
df4.unique()

In [None]:
# sns.displot is a figure-level function that creates its own figure, so the
# size is given through height/aspect (5 in tall, 7 in wide) rather than by
# opening a separate plt.figure, which would only leave an empty figure behind.
rating_grid = sns.displot(df4, kde=True, height=5, aspect=7 / 5)
rating_grid.set(title='Distribution of ratings')
plt.show()

Rating vs Cost

In [None]:
# Work on an explicit copy: later cells assign cleaned values into df_5, and
# doing that on a slice of df is what triggers pandas' chained-assignment warning.
df_5 = df[['two_ppl_avg','rate','online_order']].copy()
df_5.head()

In [None]:
df_5['two_ppl_avg'].unique()

In [None]:
df_5.loc[:,'two_ppl_avg']

In [None]:
# Convert cost and rating to numeric columns.
# - Cost: thousands separators are removed before parsing.
# - Rating: the part before "/" is parsed; anything that is not a number
#   becomes NaN via errors='coerce'.
# Going through astype(str) + pd.to_numeric keeps this cell safe to re-run
# (already-numeric values simply parse again) and yields real numeric dtypes.
df_5 = df_5.assign(
    two_ppl_avg=pd.to_numeric(
        df_5['two_ppl_avg'].astype(str).str.replace(',', '', regex=False).str.strip(),
        errors='coerce',
    ),
    rate=pd.to_numeric(
        df_5['rate'].astype(str).str.split('/').str[0].str.strip(),
        errors='coerce',
    ),
)

In [None]:
df_5

In [None]:
# Scatter of rating (x) against cost for two (y), coloured by whether the
# restaurant takes online orders.
plt.figure(figsize=(10,7))
sns.scatterplot(x='rate', y='two_ppl_avg', hue='online_order', data=df_5)
plt.show()

In [None]:
# Histogram (step outline) with a KDE overlay of the cost-for-two column.
plt.figure(figsize=(7, 7))
sns.histplot(df_5['two_ppl_avg'], kde=True, element='step')
plt.show()

In [None]:
df['votes']

In [None]:
# Vote counts split by the value of online_order.
df_yes = df.loc[df['online_order'] == 'Yes', 'votes']
df_no = df.loc[df['online_order'] == 'No', 'votes']

In [None]:
print(df_yes.shape)
print(df_no.shape)

In [None]:
df_yes

In [None]:
# Two box plots of vote counts, one per online-order group.
trace0 = go.Box(
    y=df_yes,
    name='accepting online orders',
    marker={'color': 'rgb(202, 131, 11)'},
)
trace1 = go.Box(
    y=df_no,
    name='not accepting online orders',
    marker={'color': 'rgb(20,111,211)'},
)

data = [trace0, trace1]
layout = go.Layout(title='Box plots of votes', width=800, height=500)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
df_6 = df['rest_type'].value_counts()[:10]
df_6

In [None]:
# Horizontal bars: counts on x, restaurant-type labels on y (from df_6).
sns.barplot(x=df_6.values, y=df_6.index)
plt.title('Most common types of restaurants')
plt.show()

In [None]:
df.head()

In [None]:
df_5.head()

In [None]:
sns.boxplot(y='two_ppl_avg', x='online_order', data=df_5)

In [None]:
# Budget table: keep rows with all five fields present, then convert
# rating (0 when the string has three characters or fewer) and cost to numbers.
df_budget = (
    df[['rate', 'two_ppl_avg', 'location', 'name', 'rest_type']]
    .dropna()
    .assign(
        rate=lambda frame: frame['rate'].apply(
            lambda x: float(x.split('/')[0]) if len(x) > 3 else 0
        ),
        two_ppl_avg=lambda frame: frame['two_ppl_avg'].apply(
            lambda x: int(x.replace(',', ''))
        ),
    )
)

In [None]:
df_budget['rate']

In [None]:
def restaurant_calculator(location, rest, max_cost=400, min_rating=4):
    """Return the names of well-rated budget restaurants of one type in one area.

    Parameters
    ----------
    location : str
        Value to match exactly against ``df_budget['location']``.
    rest : str
        Value to match exactly against ``df_budget['rest_type']``.
    max_cost : int, optional
        Inclusive upper bound on ``two_ppl_avg`` (default 400, the original
        hard-coded budget).
    min_rating : float, optional
        Exclusive lower bound on ``rate`` (default 4, the original threshold).

    Returns
    -------
    numpy.ndarray
        Unique restaurant names satisfying all four conditions.
    """
    mask = (
        (df_budget['two_ppl_avg'] <= max_cost)
        & (df_budget['location'] == location)
        & (df_budget['rest_type'] == rest)
        & (df_budget['rate'] > min_rating)
    )
    return df_budget.loc[mask, 'name'].unique()

In [None]:
restaurant_calculator('BTM',"Quick Bites")

Areas with highest frequency of restaurants

In [None]:
df.columns

In [None]:
# Ten locations with the most listings, drawn as horizontal bars.
df7 = df.location.value_counts()[:10]
sns.barplot(x=df7.values, y=df7.index, palette="dark:salmon_r")
plt.show()

Most popular type of cuisine in each area

In [None]:
# Count non-null entries of every remaining column for each (location, cuisines)
# pair, then turn the group keys back into ordinary columns.
df8 = df.groupby(['location','cuisines']).agg('count')
df8 = df8.reset_index()
df8

In [None]:
df8 = df8.sort_values(by="url", ascending=False)
df8

In [None]:
df8 = df8.drop_duplicates(subset=['location'],keep='first')

In [None]:
# Keep the first ten rows, only the three relevant columns, and expose the
# per-group count of `url` under the name "count".
df8 = (
    df8.head(10)[['location', 'cuisines', 'url']]
    .rename(columns={"url": "count"})
)

In [None]:
df8

Geographical map

In [None]:
# Geocode every distinct location once; later cells merge these coordinates by name.
from geopy.extra.rate_limiter import RateLimiter

# Drop missing locations first: str(NaN) would otherwise be sent as "Bangalore nan".
locs = pd.DataFrame({"Name": df['location'].dropna().unique()})
# Prefix the city so the geocoder resolves area names within Bangalore.
locs['Name'] = locs['Name'].apply(lambda x: 'Bangalore ' + str(x))

geolocator = Nominatim(user_agent="app")
# Space the requests out (one per second) and let RateLimiter handle geocoder
# errors, so a single failed lookup does not abort the whole loop.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

lat_lon = []
for place_name in tqdm(locs['Name'], desc='geocoding'):
    location = geocode(place_name)
    if location is None:
        # No result for this name: keep the row but mark the coordinates missing.
        lat_lon.append(np.nan)
    else:
        lat_lon.append((location.latitude, location.longitude))

locs['geo_loc'] = lat_lon
locs.to_csv('locs.csv', index=False)

In [None]:
# Undo the "Bangalore " prefix added before geocoding so `Name` matches
# df['location'] again. Only a leading prefix is removed: str.replace would also
# delete "Bangalore" inside a location name, and slicing off the first character
# unconditionally would eat one more character on every re-run of this cell.
locs['Name'] = locs['Name'].apply(
    lambda x: x[len('Bangalore '):] if x.startswith('Bangalore ') else x
)
locs.head()

Map

In [None]:
# Listings per location, joined with the geocoded coordinates; locations
# without coordinates are dropped.
df_locs = df['location'].value_counts().reset_index()
df_locs.columns = ['Name', 'count']
df_locs = df_locs.merge(locs, how='left', on='Name').dropna()
df_locs.head()

In [None]:
def generateBaseMap(default_location=(12.97, 77.59), default_zoom_start=12):
    """Create an empty folium map to draw layers on.

    Parameters
    ----------
    default_location : sequence of two floats, optional
        (latitude, longitude) the map is centred on. The default is an
        immutable tuple so the shared default can never be mutated between calls.
    default_zoom_start : int, optional
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        A map with the scale control enabled.
    """
    return folium.Map(
        location=list(default_location),
        control_scale=True,
        zoom_start=default_zoom_start,
    )

In [None]:
# Transpose the sequence of (lat, lon) pairs in geo_loc into one tuple of
# latitudes and one tuple of longitudes.
latitude, longitude = zip(*np.array(df_locs['geo_loc']))
print(latitude)
print(longitude)

In [None]:
df_locs['lat'] = latitude
df_locs['long'] = longitude

In [None]:
# Heat map of listings: each row is [lat, long, count], so count acts as the weight.
basemap = generateBaseMap()
HeatMap(df_locs[['lat','long','count']].values.tolist(), zoom=20, radius=15).add_to(basemap)

In [None]:
basemap

Most popular cuisines in Bangalore

In [None]:
# Ten most frequent values of the cuisines column as horizontal bars.
df9 = df['cuisines'].value_counts().head(10)
plt.figure(figsize=(6, 6))
sns.barplot(y=df9.index, x=df9.values)
plt.title('Most popular cuisines in Bangalore')
plt.show()

North Indian restaurants

In [None]:
def heatmapping(col, name):
    """Count listings per location where ``df[col] == name`` and attach coordinates.

    Parameters
    ----------
    col : str
        Column of ``df`` to filter on.
    name : object
        Value that ``df[col]`` must equal.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``count``, ``lat``, ``lon``; locations without
        geocoded coordinates in ``locs`` are dropped. The frame is empty (but
        still has these columns) when nothing matches.
    """
    matches = df[df[col] == name]
    data = matches.groupby(['location'], as_index=False)['url'].agg('count')
    data.columns = ['Name', 'count']
    print(data.head())
    data = data.merge(locs, on="Name", how='left').dropna()
    # Build lat/lon from the (lat, lon) tuples via a DataFrame rather than
    # zip(*...): unpacking an empty zip raises when no row has coordinates.
    coords = pd.DataFrame(data['geo_loc'].tolist(), index=data.index, columns=['lat', 'lon'])
    return data.drop(['geo_loc'], axis=1).join(coords)

In [None]:
north_food = heatmapping('cuisines', 'North Indian')

In [None]:
basemap = generateBaseMap()
HeatMap(north_food[['lat','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)
basemap

In [None]:
south_food = heatmapping('cuisines', 'South Indian')
basemap = generateBaseMap()
HeatMap(south_food[['lat','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)
basemap

Restaurant chains

In [None]:
df.head()

In [None]:
def produce_chains(name):
    """Count the outlets of one restaurant name per location, with coordinates.

    Parameters
    ----------
    name : str
        Value to match exactly against ``df['name']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` (location), ``count``, ``lat``, ``lon``; locations
        without geocoded coordinates in ``locs`` are dropped. The frame is empty
        (but still has these columns) when nothing matches.
    """
    # value_counts puts the counted values in the index, so reset it to get columns.
    data_chain = df.loc[df['name'] == name, 'location'].value_counts().reset_index()
    data_chain.columns = ['Name', 'count']
    data_chain = data_chain.merge(locs, on='Name', how='left').dropna()
    # Build lat/lon via a DataFrame rather than zip(*...): unpacking an empty
    # zip raises when no outlet has coordinates.
    coords = pd.DataFrame(
        data_chain['geo_loc'].tolist(), index=data_chain.index, columns=['lat', 'lon']
    )
    return data_chain[['Name', 'count']].join(coords)

In [None]:
testing = produce_chains('Jalsa')
testing

In [None]:
# Take the Mapbox token from the MAPBOX_ACCESS_TOKEN environment variable when
# it is set, so a credential does not have to be stored in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep working;
# it is committed in plain text and should be rotated and removed.
mapbox_access_token = os.environ.get(
    "MAPBOX_ACCESS_TOKEN",
    "pk.eyJ1Ijoic2hhaHVsZXMiLCJhIjoiY2p4ZTE5NGloMDc2YjNyczBhcDBnZnA5aCJ9.psBECQ2nub0o25PgHcU88w",
)

In [None]:
# Method 1: top three restaurant names per restaurant type, via apply.
# 1. Group by (rest_type, name) and count non-null entries per column; the
#    `url` count is used as the number of listings for that name.
# 2. Re-group that count Series by the rest_type index level.
# 3. Within each group, sort descending and keep the first three entries.
#    group_keys=False avoids adding the group key to the index a second time.
# The result is a Series indexed by (rest_type, name).
datas=df.groupby(['rest_type','name']).agg('count')
g = datas['url'].groupby('rest_type',group_keys=False)
g = g.apply(lambda x: x.sort_values(ascending=False).head(3))
g

In [None]:
# Method 2: same grouping as Method 1, but nlargest(3) on the grouped Series
# replaces the explicit sort-and-head step.
datas=df.groupby(['rest_type','name']).agg('count')
g = datas['url'].groupby('rest_type',group_keys=False).nlargest(3)
g

In [None]:
# Turn the (rest_type, name)-indexed Series into a flat DataFrame.
df_final = g.to_frame().reset_index()
df_final

In [None]:
casual=df_final[df_final['rest_type']=='Casual Dining']
casual

In [None]:
def produce_trace(data_chain, name):
    """Build a Scattermapbox marker trace for one chain.

    Adds a ``text`` column (location name, a line break, then the count) to
    ``data_chain`` in place and uses it as the hover text; marker size is four
    times the count. ``name`` becomes the trace's legend name.
    """
    data_chain['text'] = data_chain['Name'] + '<br>' + data_chain['count'].astype(str)
    marker = go.scattermapbox.Marker(size=data_chain['count'] * 4)
    return go.Scattermapbox(
        lat=data_chain['lat'],
        lon=data_chain['lon'],
        mode='markers',
        marker=marker,
        text=data_chain['text'],
        name=name,
    )

In [None]:
# One marker trace per chain listed in `casual`: look up the chain's outlets
# with coordinates, then turn them into a trace.
data = [produce_trace(produce_chains(chain), chain) for chain in casual['name']]

layout = go.Layout(
    title='Casual dining restaurant chain locations around Bangalore',
    autosize=True,
    hovermode='closest',
    mapbox={
        'accesstoken': mapbox_access_token,
        'bearing': 0,
        'style': "streets",
        'center': {'lat': 12.96, 'lon': 77.59},
        'pitch': 0,
        'zoom': 10,
    },
)
fig = dict(data=data, layout=layout)
py.iplot(fig)

In [None]:
quick = df_final[df_final['rest_type']=='Quick Bites']
quick

In [None]:
# One marker trace per chain listed in `quick`: look up the chain's outlets
# with coordinates, then turn them into a trace.
data = [produce_trace(produce_chains(chain), chain) for chain in quick['name']]

layout = go.Layout(
    title='Quick Bites restaurant chain locations around Bangalore',
    autosize=True,
    hovermode='closest',
    mapbox={
        'accesstoken': mapbox_access_token,
        'bearing': 0,
        'style': "streets",
        'center': {'lat': 12.96, 'lon': 77.59},
        'pitch': 0,
        'zoom': 10,
    },
)
fig = dict(data=data, layout=layout)
py.iplot(fig)

In [None]:
cafes = df_final[df_final['rest_type']=='Cafe']
cafes

In [None]:
# One marker trace per chain listed in `cafes`: look up the chain's outlets
# with coordinates, then turn them into a trace.
data = [produce_trace(produce_chains(chain), chain) for chain in cafes['name']]

layout = go.Layout(
    title='Cafe chain locations around Bangalore',
    autosize=True,
    hovermode='closest',
    mapbox={
        'accesstoken': mapbox_access_token,
        'bearing': 0,
        'style': "streets",
        'center': {'lat': 12.96, 'lon': 77.59},
        'pitch': 0,
        'zoom': 10,
    },
)
fig = dict(data=data, layout=layout)
py.iplot(fig)

In [None]:
df.head()

In [None]:
# Split the comma-separated dish string into a list; non-string entries become [''].
# Values that are already lists are passed through unchanged so the cell can be
# re-run safely (without this, a second run would replace every list with ['']).
df['dish_liked'] = df['dish_liked'].apply(
    lambda x: x if isinstance(x, list) else (x.split(',') if isinstance(x, str) else [''])
)
df['dish_liked']

In [None]:
type(df['dish_liked'][0])

In [None]:
df10 = df['rest_type'].value_counts()[:9]
df10

In [None]:
df10 = df10.index

In [None]:
def wordcloud(rest):
    """Draw a 3x3 grid of word clouds of liked dishes, one per restaurant type.

    Parameters
    ----------
    rest : iterable of str
        Restaurant types to plot, matched against ``df['rest_type']``. The grid
        has nine cells, so at most nine types can be drawn.
    """
    plt.figure(figsize=(30, 30))
    # Iterate over the argument; the previous body ignored it and read the global df10.
    for i, r in enumerate(rest):
        plt.subplot(3, 3, i + 1)
        # Each element is one restaurant's list of liked dishes (a list of lists).
        dish_lists = df[df['rest_type'] == r]['dish_liked'].values.tolist()
        # Flatten the nested lists into a single comma-separated string.
        corpus = ','.join(dish for dishes in dish_lists for dish in dishes)
        # collocations must be the boolean False: the string 'False' is truthy,
        # which left collocations switched on.
        cloud = WordCloud(max_font_size=None, background_color='white', collocations=False,
                          width=1500, height=1500).generate(corpus)
        plt.imshow(cloud)
        plt.title(r)
        plt.axis('off')

In [None]:
wordcloud(df10)

In [None]:
df.head()

In [None]:
df.reviews_list[0]

In [None]:
# Flatten reviews_list into [restaurant name, numeric score, review text] rows.
import ast

all_ratings = []
for name, ratings in tqdm(zip(df['name'], df['reviews_list']), total=len(df)):
    # Each cell holds the text of a Python list of (score, review) pairs.
    # literal_eval parses literals only, so unlike eval it cannot execute code
    # embedded in the data file.
    ratings = ast.literal_eval(ratings)
    for score, doc in ratings:
        if score:
            # Remove the leading labels as prefixes. str.strip('Rated') strips a
            # *set of characters* from both ends, which can also eat the end of a
            # review that finishes in one of those letters.
            score = score.strip()
            if score.startswith('Rated'):
                score = score[len('Rated'):].strip()
            doc = doc.strip()
            if doc.startswith('RATED'):
                doc = doc[len('RATED'):].strip()
            score = float(score)
            all_ratings.append([name, score, doc])

In [None]:
all_ratings[:5]

In [None]:
df_ratings = pd.DataFrame(all_ratings,columns=['name', 'rating', 'review'])

In [None]:
df_ratings

In [None]:
# Delete every character that is not a letter, digit or whitespace.
# The pattern is a raw string: "\s" inside a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
df_ratings['review'] = df_ratings['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', "", x))

In [None]:
df_ratings

In [None]:
review_rest = df_ratings['name'].value_counts()[:9]
review_rest

In [None]:
review_rest = review_rest.index

In [None]:
review_rest

In [None]:
rest = df_ratings['name'].value_counts().head(9).index


def produce_wordcloud(rest):
    """Draw a 3x3 grid of word clouds built from the review text of each name in `rest`."""
    plt.figure(figsize=(20, 30))
    for position, restaurant in enumerate(rest, start=1):
        plt.subplot(3, 3, position)
        reviews = df_ratings.loc[df_ratings['name'] == restaurant, 'review'].values.tolist()
        cloud = WordCloud(
            max_font_size=None,
            background_color='white',
            collocations=False,
            width=1500,
            height=1500,
        ).generate(' '.join(reviews))
        plt.imshow(cloud)
        plt.title(restaurant)
        plt.axis("off")


produce_wordcloud(rest)

In [None]:
# Number of reviews at each distinct rating value.
plt.figure(figsize=(7,6))
rating=df_ratings['rating'].value_counts()
sns.barplot(x=rating.index,y=rating)
plt.xlabel("Ratings")
plt.ylabel('count')

<h2>Topic modelling</h2>

In [None]:
# Binary label: 1 when the review's rating is above 2.5, otherwise 0.
df_ratings['review_cat'] = np.where(df_ratings['rating']>2.5,1,0)

In [None]:
df_ratings.head()

* Remove stop words
* Lemmatize each word
* Create corpus
* Tokenize them

In [None]:
# Build the corpus for reviews labelled 1 (first 3000), working word by word:
# tokenise each review, lower-case and lemmatise every token, and drop English
# stop words. Filtering and lemmatising whole review strings, as before, has no
# effect, because a full review is never a stop word nor a single lemma.
stops = set(stopwords.words('english'))
lem = WordNetLemmatizer()
tokens = [
    lemma
    for review in df_ratings[df_ratings['review_cat'] == 1]['review'][:3000]
    for lemma in (lem.lemmatize(word.lower()) for word in word_tokenize(review))
    if lemma not in stops
]
# Keep the space-joined text too: the word-count cell further down splits `corpus`.
corpus = ' '.join(tokens)

In [None]:
len(tokens)

In [None]:
len(set(tokens))

In [None]:
df_ratings.shape

In [None]:
tokens[:10]

In [None]:
# Fit a TF-IDF vocabulary on the token list (each token is passed as its own document).
vect=TfidfVectorizer()
vect_fit=vect.fit(tokens)    

In [None]:
# Invert the vocabulary (term -> column index) into column index -> term, the
# direction used as id2word for the LDA model, then vectorise the tokens.
id_map = {index: term for term, index in vect.vocabulary_.items()}
vectorized_data = vect_fit.transform(tokens)

In [None]:
vectorized_data.shape

In [None]:
# Wrap the sparse matrix as a gensim corpus (documents are rows, hence
# documents_columns=False) and fit a 5-topic LDA model with a fixed seed.
gensim_corpus=gensim.matutils.Sparse2Corpus(vectorized_data,documents_columns=False)
ldamodel = gensim.models.ldamodel.LdaModel(gensim_corpus,id2word=id_map,num_topics=5,random_state=34,passes=25)

In [None]:
counter=Counter(corpus.split(' '))

In [None]:
len(gensim_corpus)

In [None]:
# One row per (topic, keyword): the keyword's weight within the topic and how
# often it occurs in the corpus (looked up in `counter`).
topics = ldamodel.show_topics(formatted=False)
out = [
    [word, i, weight, counter[word]]
    for i, topic in topics
    for word, weight in topic
]

dataframe = pd.DataFrame(out, columns=['word','topic_id','importance','word_count'])

In [None]:
dataframe

In [None]:
# One panel per LDA topic showing how often each of its keywords occurs.
# The grid is sized from the topics present in `dataframe`; a fixed 2x2 grid
# would silently leave out the fifth topic.
topic_ids = sorted(dataframe['topic_id'].unique())
n_cols = 2
n_rows = max(1, -(-len(topic_ids) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 3 * n_rows), sharey=True, dpi=160, squeeze=False)
cols = list(mcolors.TABLEAU_COLORS.values())
for ax, i in zip(axes.flatten(), topic_ids):
    topic_words = dataframe.loc[dataframe.topic_id == i, :]
    color = cols[i % len(cols)]
    ax.bar(x='word', height="word_count", data=topic_words, color=color, width=0.3, alpha=0.3, label='Word Count')
    ax.set_ylabel('Word Count', color=color)
    ax.set_title('Topic: ' + str(i), color=color, fontsize=8)
    ax.tick_params(axis='y', left=False)
    # Restyle the labels the bar plot already created instead of calling
    # set_xticklabels without fixing the tick positions first.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    ax.legend(loc='upper left')

# Hide any panel left over when the topics do not fill the grid.
for unused_ax in axes.flatten()[len(topic_ids):]:
    unused_ax.set_visible(False)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=8, y=1.05)
plt.show()

In [None]:
# One panel per LDA topic showing the weight of each keyword within the topic.
# The grid is sized from the topics present in `dataframe`; a fixed 2x2 grid
# would silently leave out the fifth topic.
topic_ids = sorted(dataframe['topic_id'].unique())
n_cols = 2
n_rows = max(1, -(-len(topic_ids) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 3 * n_rows), sharey=True, dpi=160, squeeze=False)
cols = list(mcolors.TABLEAU_COLORS.values())
for ax, i in zip(axes.flatten(), topic_ids):
    topic_words = dataframe.loc[dataframe.topic_id == i, :]
    color = cols[i % len(cols)]
    ax.bar(x='word', height="importance", data=topic_words, color=color, width=0.2, label='Weights')
    ax.set_ylabel('Word Importance', color=color)
    ax.set_title('Topic: ' + str(i), color=color, fontsize=8)
    ax.tick_params(axis='y', left=False)
    # Restyle the labels the bar plot already created instead of calling
    # set_xticklabels without fixing the tick positions first.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    ax.legend(loc='upper left')

# Hide any panel left over when the topics do not fill the grid.
for unused_ax in axes.flatten()[len(topic_ids):]:
    unused_ax.set_visible(False)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=8, y=1.05)
plt.show()

<h2>Negative words</h2>

In [None]:
# Build the corpus for reviews labelled 0 (first 3000), working word by word:
# tokenise each review, lower-case and lemmatise every token, and drop English
# stop words. Filtering and lemmatising whole review strings, as before, has no
# effect, because a full review is never a stop word nor a single lemma.
stops = set(stopwords.words('english'))
lem = WordNetLemmatizer()
tokens = [
    lemma
    for review in df_ratings[df_ratings['review_cat'] == 0]['review'][:3000]
    for lemma in (lem.lemmatize(word.lower()) for word in word_tokenize(review))
    if lemma not in stops
]
# Keep the space-joined text too: the word-count cell further down splits `corpus`.
corpus = ' '.join(tokens)

In [None]:
# TF-IDF over the tokens, then a 5-topic LDA model with a fixed seed.
vect = TfidfVectorizer()
vect_fit = vect.fit(tokens)
# column index -> term, the direction LdaModel expects for id2word
id_map = {index: term for term, index in vect.vocabulary_.items()}
vectorized_data = vect_fit.transform(tokens)
gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_data, documents_columns=False)
ldamodel = gensim.models.ldamodel.LdaModel(
    gensim_corpus,
    id2word=id_map,
    num_topics=5,
    random_state=34,
    passes=25,
)

In [None]:
# Corpus-wide word frequencies, then one row per (topic, keyword) with the
# keyword's weight in the topic and its frequency.
counter = Counter(corpus.split(' '))
topics = ldamodel.show_topics(formatted=False)
out = [
    [word, i, weight, counter[word]]
    for i, topic in topics
    for word, weight in topic
]

dataframe = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

In [None]:
dataframe

In [None]:
# One panel per LDA topic showing how often each of its keywords occurs.
# The grid is sized from the topics present in `dataframe`; a fixed 2x2 grid
# would silently leave out the fifth topic.
topic_ids = sorted(dataframe['topic_id'].unique())
n_cols = 2
n_rows = max(1, -(-len(topic_ids) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 3 * n_rows), sharey=True, dpi=160, squeeze=False)
cols = list(mcolors.TABLEAU_COLORS.values())
for ax, i in zip(axes.flatten(), topic_ids):
    topic_words = dataframe.loc[dataframe.topic_id == i, :]
    color = cols[i % len(cols)]
    ax.bar(x='word', height="word_count", data=topic_words, color=color, width=0.3, alpha=0.3, label='Word Count')
    ax.set_ylabel('Word Count', color=color)
    ax.set_title('Topic: ' + str(i), color=color, fontsize=8)
    ax.tick_params(axis='y', left=False)
    # Restyle the labels the bar plot already created instead of calling
    # set_xticklabels without fixing the tick positions first.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    ax.legend(loc='upper left')

# Hide any panel left over when the topics do not fill the grid.
for unused_ax in axes.flatten()[len(topic_ids):]:
    unused_ax.set_visible(False)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=8, y=1.05)
plt.show()

In [None]:
# One panel per LDA topic showing the weight of each keyword within the topic.
# The grid is sized from the topics present in `dataframe`; a fixed 2x2 grid
# would silently leave out the fifth topic.
topic_ids = sorted(dataframe['topic_id'].unique())
n_cols = 2
n_rows = max(1, -(-len(topic_ids) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 3 * n_rows), sharey=True, dpi=160, squeeze=False)
cols = list(mcolors.TABLEAU_COLORS.values())
for ax, i in zip(axes.flatten(), topic_ids):
    topic_words = dataframe.loc[dataframe.topic_id == i, :]
    color = cols[i % len(cols)]
    ax.bar(x='word', height="importance", data=topic_words, color=color, width=0.2, label='Weights')
    ax.set_ylabel('Word Importance', color=color)
    ax.set_title('Topic: ' + str(i), color=color, fontsize=8)
    ax.tick_params(axis='y', left=False)
    # Restyle the labels the bar plot already created instead of calling
    # set_xticklabels without fixing the tick positions first.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    ax.legend(loc='upper left')

# Hide any panel left over when the topics do not fill the grid.
for unused_ax in axes.flatten()[len(topic_ids):]:
    unused_ax.set_visible(False)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=8, y=1.05)
plt.show()

T-SNE

In [None]:
# Tokenised training corpus for word2vec: for each of the first 10000 reviews,
# a list of lower-cased, lemmatised tokens with English stop words removed.
stops = set(stopwords.words('english'))
lem = WordNetLemmatizer()
corpus = []
for review in tqdm(df_ratings['review'][:10000]):
    lemmas = (lem.lemmatize(token.lower()) for token in word_tokenize(review))
    corpus.append([lemma for lemma in lemmas if lemma not in stops])

In [None]:
model = word2vec.Word2Vec(corpus, vector_size=100, window=20, min_count=200, workers=4)

In [None]:
def tsne_plot(model):
    """Project a word2vec model's vocabulary to 2-D with t-SNE and plot it.

    Parameters
    ----------
    model : gensim Word2Vec model
        Every word in ``model.wv.index_to_key`` is embedded and labelled.

    Raises
    ------
    ValueError
        If the vocabulary has fewer than two words, so there is nothing to embed.
    """
    labels = list(model.wv.index_to_key)
    if len(labels) < 2:
        raise ValueError(
            "tsne_plot needs a vocabulary of at least two words; "
            "lower min_count or train on more text"
        )
    tokens = np.array([model.wv[word] for word in labels])

    # t-SNE needs the perplexity to be smaller than the number of samples;
    # a small vocabulary would otherwise make the fixed value of 40 fail.
    perplexity = min(40, len(labels) - 1)
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=1)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(10,10))
    # One scatter call per word, so each point gets the next colour in the cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.show()

In [None]:
tsne_plot(model)

In [None]:
# First 2000 reviews rated above 3 and first 2000 rated below 2.5; ratings in
# between are used by neither set. (`postive` is spelled this way in later cells.)
postive=df_ratings[df_ratings['rating']>3]['review'][:2000]
negative=df_ratings[df_ratings['rating']<2.5]['review'][:2000]

def return_corpus(df):
    """Return, for each review in `df`, the list of its tokens tagged 'JJ'.

    `df` is iterated directly, so it should yield review strings. Each review is
    tokenised and part-of-speech tagged with nltk; only tokens whose tag is
    exactly 'JJ' are kept.
    """
    return [
        [word for word, tag in nltk.pos_tag(word_tokenize(review)) if tag == 'JJ']
        for review in df
    ]

In [None]:
# Train word2vec on the 'JJ'-tagged tokens of the positive reviews (words seen
# at least 10 times) and plot the resulting vocabulary with t-SNE.
corpus=return_corpus(postive)
model = word2vec.Word2Vec(corpus, vector_size=100, min_count=10,window=20, workers=4)
tsne_plot(model)

In [None]:
# Train word2vec on the 'JJ'-tagged tokens of the negative reviews (words seen
# at least 10 times) and plot the resulting vocabulary with t-SNE.
corpus=return_corpus(negative)
model = word2vec.Word2Vec(corpus, vector_size=100, min_count=10,window=20, workers=4)
tsne_plot(model)