In [None]:
# load packages

import os
import pickle
import time
from urllib.parse import quote_plus

import boto3
import botocore
import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy

# Gensim reports progress through Python's standard logging module at several
# priority levels. Enabling INFO-level output here is optional; each record is
# formatted as "<timestamp> : <level> : <message>".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
# Amazon S3 access: `client` is the low-level API handle, `s3` the
# higher-level resource interface.
bucket_name = "cse6242oan-xchen668"
client = boto3.client("s3")
s3 = boto3.resource("s3")

# List every bucket visible to the current AWS credentials, one name per line.
all_buckets = s3.buckets.all()
for bucket in all_buckets:
    print(bucket.name)

In [None]:
# Connect to the project's PostgreSQL (RDS) instance.
# Equivalent CLI:
# psql --host cse6242project.cnsmcycpnqu7.us-east-1.rds.amazonaws.com --port 5432 --username=<your_name> --dbname=cse6242project
#
# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook. The fallbacks reproduce the values that used to
# be hard-coded here ("password" is presumably a placeholder - set
# CSE6242_DB_PASSWORD to the real one).
db_user = os.environ.get("CSE6242_DB_USER", "xchen668")
db_password = os.environ.get("CSE6242_DB_PASSWORD", "password")
db_host = "cse6242project.cnsmcycpnqu7.us-east-1.rds.amazonaws.com"
db_name = "cse6242project"

# quote_plus keeps the URL valid when the user name or password contains
# characters such as '@', ':' or '/'.
engine = sqlalchemy.create_engine(
    "postgresql+psycopg2://{}:{}@{}/{}".format(
        quote_plus(db_user), quote_plus(db_password), db_host, db_name
    )
)

# business = pd.read_sql_query("SELECT * FROM {};".format("business"), engine)
businessDf = pd.read_sql_table("business", engine)

# Drop the PostGIS geometry column. DataFrame.drop returns a new frame, so the
# result has to be assigned back for the column to actually go away.
businessDf = businessDf.drop("geom", axis=1)

# Check the data schema. This must be the last expression of the cell for the
# notebook to render it.
businessDf.head()

In [None]:
# Fetch every review whose business is flagged as a US restaurant, together
# with the city of that business (inner join of review and business).
query = """
    select a.*, b.city
    from review a
    inner join 
    business b
    on a.business_id = b.business_id
    where b.is_us = 1
    and b.is_restaurant = 1;
"""
usResReviews = pd.read_sql_query(sql=query, con=engine)

# Sanity check: show one review, then the frame's dimensions and columns.
print('\nThe first review:\n')
first_review = usResReviews["text"][0]
print(first_review, '\n')
print(usResReviews.shape)
print(usResReviews.columns)


# In[18]:


# Keep only the reviews that actually have text.
data = usResReviews[pd.notnull(usResReviews['text'])]
print(data.shape)

#size = 100000 #100,000
size = 1000000

# Shuffle all rows, then split: the first `size` rows form the initial
# training subset and the remainder is kept for continued training.
# A fixed seed makes the split reproducible across kernel restarts.
SAMPLE_SEED = 42
data = data.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)
subdata, restdata = data.iloc[:size], data.iloc[size:]

# Optional export of the subset (disabled). Note that '#' is the Python
# comment marker; the previous '//' prefix was a syntax error.
# subdata.to_csv('review_sub_{}.csv'.format(size), index=False, quoting=3, sep=',', escapechar='\\', encoding='utf-8')

In [None]:
# Warning: long running time (150min)

import nltk
nltk.download('punkt')
# nltk.download('popular')
# test tokenizer
nltk.word_tokenize("Tokenize me")

from Word2VecUtility import Word2VecUtility

t0 = time.time()
review_sents = []
# Cleaning and parsing the reviews...
# Iterate over the text column directly: `subdata.iloc[i]["text"]` builds a
# whole row Series on every iteration, which is needlessly slow at this size.
for review_text in subdata["text"]:
    # sent_reviews += Word2VecUtility.review_to_sentences(data["text"][i], tokenizer)
    review_sents.extend(Word2VecUtility.review_to_sentences(review_text))

t1 = time.time()
print(t1-t0)

# Cache the tokenized sentences on disk; the file name carries the subset size.
with open('review_sents_{}.pkl'.format(size), 'wb') as out:
    pickle.dump(review_sents, out)

In [None]:
# NOTE(review): `model` is used below but is not created anywhere in the
# visible part of this notebook - presumably a Word2Vec training cell
# (trained on `review_sents`) is missing here. Confirm before Run All.

# In[28]:


# Odd-one-out check: which of these words fits the others least?
model.wv.doesnt_match("man woman child kitchen".split())


# In[29]:


# Same odd-one-out check on food and drink terms.
model.wv.doesnt_match("coffee tea juice restaurant".split())


# In[30]:


# Nearest neighbours of "friendly" in the embedding space.
model.wv.most_similar("friendly")


# In[31]:


# Nearest neighbours of "sushi" in the embedding space.
model.wv.most_similar("sushi")


# In[32]:


# Vocabulary as a set, and its size.
index2word_set = set(model.wv.index2word)
print(len(index2word_set))


# In[33]:


# Ten nearest neighbours of "coffee".
model.wv.most_similar(positive=['coffee'], topn=10)


# In[34]:


# Ten words most similar to "sushi" with "japan" as a negative example.
result = model.wv.most_similar(positive=['sushi'], negative=['japan'], topn=10)
print(result)


# In[35]:


from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# One embedding row per vocabulary word, in vocab iteration order.
# Index through `model.wv` like the rest of the notebook does; indexing the
# Word2Vec model object itself is deprecated in gensim.
X = model.wv[model.wv.vocab]

# Project the word vectors to 2-D. A fixed seed keeps the layout stable
# between runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.title("t-SNE projection of word vectors")
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.show()


# In[36]:


# Every word in the model vocabulary (the output can be very long).
list(model.wv.vocab)


# In[38]:


import seaborn as sns

# Put the 2-D t-SNE coordinates into a frame so seaborn can plot them.
tsne_df = pd.DataFrame(X_tsne, columns=['x','y'])

# Scatter only (fit_reg=False). Arguments are passed by keyword and the figure
# size uses `height`: current seaborn no longer accepts positional x/y/data,
# and the old `size` keyword was renamed to `height`.
# Note: lmplot returns a FacetGrid, not a matplotlib Axes, despite the name.
ax = sns.lmplot(x='x', y='y', data=tsne_df, fit_reg=False, height=8,
                scatter_kws={'alpha':0.7,'s':60})


# In[ ]:


# Drop the reference to the plot object (presumably to release the figure).
ax = None


# In[ ]:


# Words in vocab iteration order, which is the order used to build X / X_tsne.
labels = list(model.wv.vocab.keys())

# Annotate at most the first 1000 words. The bound is clamped so that a
# vocabulary smaller than 1000 does not raise an IndexError.
n_labeled = min(1000, len(labels), len(X_tsne))

plt.figure(figsize=(16, 16))
for i in range(n_labeled):
    plt.scatter(X_tsne[i, 0], X_tsne[i, 1])
    plt.annotate(labels[i],
                 xy=(X_tsne[i, 0], X_tsne[i, 1]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.show()


# In[39]:


# Attach each word to its t-SNE coordinates (vocab iteration order).
labels = list(model.wv.vocab)
tsne_df["label"] = labels


# In[40]:


# Look at up to 20 words that fall inside the window 0 < x < 3, 35 < y < 38.
x_in_range = tsne_df['x'].gt(0) & tsne_df['x'].lt(3)
y_in_range = tsne_df['y'].gt(35) & tsne_df['y'].lt(38)
tsne_df[x_in_range & y_in_range].head(20)


# In[41]:


# Write the t-SNE frame to a CSV whose name carries the subset size.
filename = 'tsne_wordvects_{}.csv'.format(size)
tsne_df.to_csv(path_or_buf=filename, sep=',')


# In[42]:


bucket_name = 'cse6242oan-xchen668'
s3_key = "project/" + filename

# upload_file goes through boto3's managed uploader, which splits large files
# automatically and sends the parts in parallel.
client.upload_file(Filename=filename, Bucket=bucket_name, Key=s3_key)


# In[43]:


# Report the size of the held-back reviews, then dump them to CSV
# (no index column, no quoting, backslash as the escape character).
print(restdata.shape)

rest_csv_options = dict(index=False, quoting=3, sep=',', escapechar='\\', encoding='utf-8')
restdata.to_csv('review_rest.csv', **rest_csv_options)


# In[ ]:


from Word2VecUtility import Word2VecUtility

t0 = time.time()
rest_review_sents = []
# Cleaning and parsing the reviews...
# The sentences must be collected in `rest_review_sents`. Appending to
# `review_sents` (as this loop used to) left `rest_review_sents` empty, so the
# pickle below and the later continued training received no data.
# Iterating over the column also avoids building a row Series per review.
for review_text in restdata["text"]:
    # sent_reviews += Word2VecUtility.review_to_sentences(data["text"][i], tokenizer)
    rest_review_sents.extend(Word2VecUtility.review_to_sentences(review_text))

t1 = time.time()
print(t1-t0)


# In[ ]:


# Cache the tokenized sentences of the held-back reviews on disk.
with open('rest_review_sents.pkl', 'wb') as out:
    pickle.dump(rest_review_sents, out)


# In[ ]:


from gensim.models import word2vec

t0 = time.time()

# Hyper-parameter values, in order: word vector dimensionality, minimum word
# count, number of parallel threads, context window size, and the
# downsampling setting for frequent words.
# NOTE(review): none of these is passed to the train() call below; they
# presumably document the settings of the initial training run - confirm.
num_features, min_word_count, num_workers = 300, 40, 4
context, downsampling = 10, 1e-3

# Continue training the existing model on the held-back sentences
# (this will take some time).
# NOTE(review): the vocabulary is not updated here, so words that only occur
# in the new sentences are presumably ignored - confirm that is intended.
print("Training model...")
model.train(
    rest_review_sents,
    total_examples=len(rest_review_sents),
    epochs=model.epochs,
)

t1 = time.time()
print(t1-t0)


# In[ ]:


# A descriptive file name makes the saved model easy to identify later;
# it can be reloaded with Word2Vec.load().
model_name = "300features_40minwords_10context_{}".format('all')

# No further training is planned, so let init_sims make the model much more
# memory-efficient before it is written to disk.
model.init_sims(replace=True)
model.save(model_name)