In [1]:
import html
import pickle
from operator import add

from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline, PipelineModel

from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
#sc = SparkContext(gateway = jg.launch_gateway())

import folium

import pandas as pd
import numpy as np

In [2]:
# Base locations for the input data, the persisted models and the generated output.
# NOTE: these are absolute paths on an external volume; adjust them for your machine.
data_path = '/Volumes/Transcend/dataset/'
model_path = '/Volumes/Transcend/MDS_Yelp/model/'
output_path = '/Volumes/Transcend/MDS_Yelp/output/'
# Backward-compatible alias for the original, misspelled variable name.
outout_path = output_path

If Spark jobs fail, stop the existing Spark context with the cell below, then re-run the cell that creates it.

In [None]:
# Stop a SparkContext left over from a previous run so it can be recreated.
# Guarded so that running the notebook top to bottom on a fresh kernel (where
# `sc` has not been created yet) does not fail with a NameError.
if 'sc' in globals():
    sc.stop()

In [3]:
# Reuse the active Spark session/context when there is one, so re-running this
# cell does not fail because a SparkContext already exists.
# For an explicit local master use SparkSession.builder.master('local').getOrCreate().
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [4]:
# Load the business table from parquet (under data_path) and print its schema.
business_df = spark.read.parquet(data_path + 'business-small.parquet')
business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [5]:
# Load the user table from parquet (under data_path) and print its schema.
user_df = spark.read.parquet(data_path + 'user-small.parquet')
user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [6]:
# Load the review table from parquet (under data_path) and print its schema.
review_df = spark.read.parquet(data_path + 'review-small.parquet')
review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [7]:
# Register the Spark review DataFrame as the temp view "reviews" so it can be queried with SQL
review_df.createOrReplaceTempView("reviews")

# Review text dataframe: keep only the review author and the review text
reviews_text = spark.sql("SELECT user_id, review_text FROM reviews")
reviews_text.show(3)

+--------------------+--------------------+
|             user_id|         review_text|
+--------------------+--------------------+
|u0LXt3Uea_GidxRW1...|Who would have gu...|
|u0LXt3Uea_GidxRW1...|Not bad!! Love th...|
|u0LXt3Uea_GidxRW1...|This is currently...|
+--------------------+--------------------+
only showing top 3 rows



In [8]:
# Concatenate all reviews per user into one document (columns: user_id, text).
#
# The reviews are joined with a single space. Plain string addition would glue
# the last word of one review to the first word of the next (producing bogus
# tokens for the tokenizer) and would fail on a null review_text; collect_list
# skips nulls and concat_ws inserts the separator.
reviews_by_user_df = reviews_text \
                            .groupBy('user_id') \
                            .agg(concat_ws(' ', collect_list('review_text')).alias('text'))
reviews_by_user_df.count()

73041

#### Example of how Word2Vec works

In [9]:
## Example of using Word2vec
from pyspark.ml.feature import Word2Vec

# Toy corpus: each row holds one sentence, already split into a list of words.
sentences = [
    "Hi I heard about Spark",
    "I wish Java could use case classes",
    "Logistic regression models are neat",
]
documentDF = spark.createDataFrame([(sentence.split(" "), ) for sentence in sentences], ["text"])

# Fit a 3-dimensional Word2Vec model on the toy corpus; minCount=0 keeps every word.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

# Print the vector the fitted model assigns to each sentence.
result = model.transform(documentDF)
for text, vector in result.collect():
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.02326762331649661,0.008931299671530724,-0.06394885405898094]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.009175857529044151,-0.024911361613443917,0.012272004171141555]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.005160079896450043,-0.0005152661353349686,0.014656295813620091]



In [10]:
## https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec

## minCount: the minimum number of times a token must appear to be included in the word2vec model's vocabulary

In [None]:
%%time
# usefule link: https://www.tutorialkart.com/apache-spark/spark-mllib-tf-idf/
# create text processing pipeline -- this a lengthy resource-intensive process
# Build the pipeline 
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
# TD-IDF Vec
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)
vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec, vectorAssembler])

# fit the model
pipeline_mdl = pipeline.fit(reviews_by_user_df)

#save the pipeline model
pipeline_mdl.write().overwrite().save(model_path + 'pipe_txt')

In [9]:
# load the text transformation pipeline trained model
# (read from model_path + 'pipe_txt', the location the training cell saves to)
pipeline_mdl = PipelineModel.load(model_path + 'pipe_txt')

In [10]:
# transform the review data: run the per-user review text through the fitted pipeline
reviews_by_user_trf_df = pipeline_mdl.transform(reviews_by_user_df)

In [11]:
# show the transformed review data: raw text, stop-word-filtered tokens and the two feature vectors
reviews_by_user_trf_df.select( 'text', 'nostopwrd', 'idf_vec', 'word_vec').show(10)

+--------------------+--------------------+--------------------+--------------------+
|                text|           nostopwrd|             idf_vec|            word_vec|
+--------------------+--------------------+--------------------+--------------------+
|A born and bred T...|[born, bred, toro...|(128365,[0,1,2,3,...|[-0.0298230579082...|
|I have never done...|[never, done, one...|(128365,[2,6,11,1...|[-0.0256846508636...|
|What a find!  I'm...|[find, m, almost,...|(128365,[2,3,6,11...|[-0.0783052189961...|
|Not impressed wit...|[impressed, place...|(128365,[0,1,2,3,...|[-0.0273270801487...|
|Very tight space....|[tight, space, fo...|(128365,[2,39,116...|[-0.0261619807634...|
|While in Toronto ...|[toronto, last, w...|(128365,[0,1,2,3,...|[-0.0836421314076...|
|I loved the decor...|[loved, decor, re...|(128365,[0,1,3,5,...|[-0.0428024260223...|
|Amazing vermacell...|[amazing, vermace...|(128365,[0,2,4,5,...|[-0.1015487008305...|
|Uncle Tetsu's is ...|[uncle, tetsu, ho...|(128365,[0,

In [11]:
def CosineSim(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    The dot product of the vectors is divided by the Euclidean norm of each
    vector in turn.
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.sqrt(np.dot(vec1, vec1))
    norm2 = np.sqrt(np.dot(vec2, vec2))
    return dot_product / norm1 / norm2

In [12]:
# Bring every (user_id, word_vec) pair onto the driver as a list of plain tuples.
user_vec_rows = reviews_by_user_trf_df.select('user_id', 'word_vec').collect()
all_user_vecs = [(row[0], row[1]) for row in user_vec_rows]

In [16]:
all_user_vecs[0]
# each entry is a (user_id, DenseVector) tuple, e.g. user_id = 'nOTl4aPC4tKHK35T3bNauQ'
# DenseVector() - vector representation of all the reviews of the user

('BytRWk8X1OelSgwwfXd8Aw',
 DenseVector([-0.0298, -0.0741, -0.0649, 0.0832, 0.0279, -0.0149, -0.0436, -0.0123, -0.0076, -0.0379, -0.0085, -0.0369, -0.0037, -0.0116, 0.0142, -0.0317, -0.0555, -0.0272, -0.0729, 0.0104, -0.0666, -0.0442, 0.0158, 0.0029, 0.0563, -0.0458, -0.0137, -0.0207, -0.0063, 0.0337, 0.0583, 0.0148, 0.0212, -0.0019, -0.031, -0.0436, 0.0617, 0.0352, -0.0151, -0.0433, -0.0319, -0.0212, -0.0246, 0.0944, 0.0214, -0.0364, -0.0363, 0.0447, 0.0029, 0.0008, 0.0107, -0.0229, -0.0219, -0.045, -0.0714, 0.0296, 0.0166, 0.0181, 0.0324, -0.0024, -0.0046, 0.0065, 0.0301, -0.049, -0.0385, 0.0413, 0.0566, 0.0437, -0.0322, 0.0902, -0.0098, -0.0056, 0.0242, 0.0346, 0.0133, -0.0412, -0.0373, 0.0083, -0.0207, 0.0461, -0.0175, -0.0269, 0.0018, -0.0071, -0.0001, 0.0023, 0.0449, 0.0108, 0.0087, 0.0074, 0.013, -0.0001, -0.0442, 0.0266, -0.0158, -0.0175, -0.0496, 0.0423, -0.0899, 0.034]))

In [13]:
def getSimilarUsers(u_ids, all_user_vecs, sim_user_limit=10):
    """Find the users whose review vectors are most similar to the given users.

    Parameters
    ----------
    u_ids : iterable of str
        User ids to find neighbours for. Ids that have no vector in
        `all_user_vecs` are reported and skipped; repeated ids are processed once.
    all_user_vecs : list of (user_id, vector) tuples
        Vector of every known user, as collected on the driver.
    sim_user_limit : int, default 10
        Maximum number of similar users returned per input user.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns `user_id` (the similar user), `score` (cosine similarity, double)
        and `input_user_id`. Rows are ordered by input user and, within one
        input user, by descending score. The frame is empty if no input id is known.
    """
    schema = StructType([
                            StructField("user_id", StringType(), True)
                            # cosine similarities are fractional, so the column must be a double
                            ,StructField("score", DoubleType(), True)
                            ,StructField("input_user_id", StringType(), True)
                        ])

    # Index the vectors once instead of scanning the whole list for every input
    # user; setdefault keeps the first vector should an id occur more than once.
    vec_by_user = {}
    for user_id, vec in all_user_vecs:
        vec_by_user.setdefault(user_id, vec)

    rows = []
    seen_inputs = set()
    for u_id in u_ids:
        if u_id in seen_inputs:
            continue
        seen_inputs.add(u_id)

        input_vec = vec_by_user.get(u_id)
        if input_vec is None:
            print("not in the user_df")
            # skip only this id; the remaining input users are still processed
            continue

        scored = []
        for other_id, other_vec in all_user_vecs:
            if other_id == u_id:
                continue
            score = float(CosineSim(input_vec, other_vec))
            # a zero vector yields NaN (0/0); such users cannot be ranked
            if np.isnan(score):
                continue
            scored.append((other_id, score))

        scored.sort(key=lambda pair: pair[1], reverse=True)
        rows.extend((other_id, score, u_id) for other_id, score in scored[:sim_user_limit])

    # The vectors already live on the driver, so the ranking is done locally and
    # a single DataFrame is built at the end rather than one job/union per user.
    return spark.createDataFrame(rows, schema)

In [14]:
def getUserDetails(user):
    """Inner-join `user` with `user_df` on user_id.

    The result keeps every column of `user` (under alias "a") followed by
    user_id, user_name and review_count taken from `user_df` (alias "b").
    """
    left = user.alias("a")
    right = user_df.alias("b")

    # all columns of the input frame first, then the columns from user_df
    selected = [col('a.' + name) for name in left.columns]
    selected += [col('b.user_id'), col('b.user_name'), col('b.review_count')]

    joined = left.join(right, col("a.user_id") == col("b.user_id"), 'inner')
    return joined.select(selected)

In [40]:
# test with two users

uids = ['nOTl4aPC4tKHK35T3bNauQ', 'QBac9-Ii6jR-yLsQ5MVTHg']

print('\ninput user details:')
user_df.select('user_id','user_name', 'review_count') \
    .filter(user_df.user_id.isin(uids)).show(truncate=False)

# get top 10 similar users
sim_users = getUserDetails(getSimilarUsers(uids, all_user_vecs, 10))

# the inputs are users, not restaurants
print('Top 10 similar users for each input user are:')
sim_users_df = sim_users.select('input_user_id', 'a.user_id', 'user_name', 'score','review_count').toPandas()
sim_users_df


input user details:
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|nOTl4aPC4tKHK35T3bNauQ|Katherine|148         |
|QBac9-Ii6jR-yLsQ5MVTHg|Alex     |13          |
+----------------------+---------+------------+



  


Top 10 similar Users for each input restaurant are:"


Unnamed: 0,input_user_id,user_id,user_name,score,review_count
0,QBac9-Ii6jR-yLsQ5MVTHg,J5Eb7LhJaOa20k0ppcOCOg,Alek,0.907664,34
1,nOTl4aPC4tKHK35T3bNauQ,ZBllYKrFzaI0I7v6Wl26Wg,Cecilia,0.963999,135
2,QBac9-Ii6jR-yLsQ5MVTHg,cNhHuEQMIpLH_qc9qGz67A,Jay,0.910774,57
3,QBac9-Ii6jR-yLsQ5MVTHg,_IR48ok0ZkPMWJ2PlRCk0A,Michael,0.907098,82
4,nOTl4aPC4tKHK35T3bNauQ,myrcQ3h2G04Gv-ANG_oqrg,Linda,0.971381,112
5,QBac9-Ii6jR-yLsQ5MVTHg,MpN81tQOL86GaFse-_tTRQ,Amy,0.913263,46
6,QBac9-Ii6jR-yLsQ5MVTHg,kw-YtOKPXrRB2a9wRZlmzQ,Jimmy,0.915453,101
7,nOTl4aPC4tKHK35T3bNauQ,uO1w3qNo21c1bVHHFTYW0w,Joanne,0.972255,221
8,QBac9-Ii6jR-yLsQ5MVTHg,bPUpO-bP6BmAGvSwPyDsng,Michael,0.905848,142
9,nOTl4aPC4tKHK35T3bNauQ,PGx4HvY5joEeqXzam6tO7A,Lisa,0.965808,349


In [15]:
# Load the Yelp review CSV into a pandas DataFrame named `review`.
# NOTE(review): hard-coded absolute path, unlike the parquet inputs that are located via data_path.
review = pd.read_csv("/Volumes/Transcend/MDS_Yelp/yelp_review.csv")

In [16]:
#uids = ['nOTl4aPC4tKHK35T3bNauQ', 'QBac9-Ii6jR-yLsQ5MVTHg']
# Businesses reviewed by both users, with each user's stars, text and date side by side.
# TODO: average the rating if more than 1 review
review_uid1 = review[review['user_id'] == 'nOTl4aPC4tKHK35T3bNauQ']
review_uid2 = review[review['user_id'] == 'PGx4HvY5joEeqXzam6tO7A']
merged_uid = review_uid1.merge(review_uid2, how='inner', on='business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']]

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,cefRDEK5O3t_iUuwnmL27Q,4,3,We booked a reservation at Smith for Summerlic...,"This place looks small from the outside, but i...",2012-07-12,2016-07-31
1,pOEL97ld-FJMKO8Ki8JmYg,3,5,"Okay, so my rating has gone down from the firs...",I am a big fan of anything Oliver & Bonacini. ...,2013-10-20,2013-01-13
2,EjZYT46Y2qHSoChynd0q-A,4,5,This small gallery is a bit hard to get to at ...,I love this place. I stumbled on it when my b...,2012-11-11,2014-10-03
3,_vZ7bHaGCjllogiZ7RH17w,3,4,My sister told me that Origin was well known f...,4.5 stars \n\nI came here with a girl friend ...,2014-04-02,2015-04-06
4,JPgBO-7imIPdc2XBkvynpQ,4,4,For some reason when I heard the name of this ...,"Huge, huge list of Martinis. Prices aren't ba...",2011-03-21,2015-01-03
5,S5bNE4Pmin8OQUMOFod8bQ,3,2,I actually think this restaurant deserves a 3-...,Came here with a girl friend for dinner on a F...,2011-07-17,2012-10-28
6,uF86ZhygpBEGr3CudNemYA,3,3,When deciding where to go to Valentine's Day d...,I'm terrified of the dark and it took me a lon...,2011-11-20,2012-04-10
7,RkekriZhaIZ7nSJBVept6Q,4,4,I really wanted to be blown away by this place...,I've wanted to try this place for awhile and f...,2013-02-16,2012-09-23


In [40]:
import fastparquet
# pandas copy of the small review set. It gets its own name so that it does not
# overwrite the Spark DataFrame `review_df`, which the Spark cells earlier in the
# notebook need when they are re-run.
review_small_pd = pd.read_parquet(data_path + 'review-small.parquet', engine='fastparquet')

In [41]:
# Businesses reviewed by both users, with each user's stars, text and date side by side.
# TODO: average the rating if more than 1 review
review_uid1 = review[review['user_id'] == 'nOTl4aPC4tKHK35T3bNauQ']
review_uid2 = review[review['user_id'] == 'vJGLEHyhCs9V-5fAe-xx3w']
merged_uid = review_uid1.merge(review_uid2, how='inner', on='business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']]

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,oQylTvXwGIkKFdCjmafKVg,4,4,"We went here after dinner, simply for drinks. ...",We came here for Mother's day as has become tr...,2013-02-10,2013-06-04
1,5N8R7ALESZ30EoAzVJtabw,5,5,Went in on a whim and was not disappointed. I ...,The Dirty Bird invited me back to give them an...,2016-10-17,2015-10-15
2,u2ETlHOcFdRz4BxcdfsK0Q,3,3,We went here for Summerlicious. For $25 we got...,I came here for a ladies lunch with the girls ...,2012-07-08,2013-06-04


In [45]:
# Businesses reviewed by both users, with each user's stars, text and date side by side.
# TODO: average the rating if more than 1 review
review_uid1 = review[review['user_id'] == 'nOTl4aPC4tKHK35T3bNauQ']
review_uid2 = review[review['user_id'] == 'ZBllYKrFzaI0I7v6Wl26Wg']
merged_uid = review_uid1.merge(review_uid2, how='inner', on='business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']]

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,cefRDEK5O3t_iUuwnmL27Q,4,4,We booked a reservation at Smith for Summerlic...,"I would've given it 5 stars for the food, but ...",2012-07-12,2014-05-05
1,c78Pat78fVUBFPXYeVvbaQ,5,3,I am surprised at the people complaining about...,Overrated. \n\nMistake: coming to Odd Seoul on...,2016-01-24,2016-03-01


In [42]:
# Businesses reviewed by both users, with each user's stars, text and date side by side.
# TODO: average the rating if more than 1 review
review_uid1 = review[review['user_id'] == 'nOTl4aPC4tKHK35T3bNauQ']
review_uid2 = review[review['user_id'] == 'VVm-TFCpi9M1-k8ED0l1eA']
merged_uid = review_uid1.merge(review_uid2, how='inner', on='business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']]

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,zgQHtqX0gqMw1nlBZl2VnQ,2,4,"While I really enjoyed the noodles, their sign...",Actual rating 3.5\n\nLet's get my main gripe o...,2013-10-19,2014-11-22
1,kGOr_D-LNpgZ2M9N8TT4QQ,5,4,I went here with my husband and another couple...,La Societe looks as good as you'd imagine a re...,2013-01-27,2012-01-05
2,W5d8iNog90R-qw43m5dGwg,5,5,"I have been craving schnitzel for a LONG time,...",Had lunch here today and I'm totally sold. Si...,2012-10-08,2012-08-12
3,OIdOJaNS8M624F58XGV3PQ,4,3,"Really this deserves a 3-1\/2 star rating, but...",Actually around 3.5\n\nHaven't been here in qu...,2012-11-11,2014-12-22


In [44]:
# Businesses reviewed by both users, with each user's stars, text and date side by side.
# TODO: average the rating if more than 1 review
review_uid1 = review[review['user_id'] == 'QBac9-Ii6jR-yLsQ5MVTHg']
review_uid2 = review[review['user_id'] == 'eV5usRjY2cDqNKVv8wXroA']
merged_uid = review_uid1.merge(review_uid2, how='inner', on='business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']]

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,r_BrIgzYcwo1NAuG9dLbpg,4,3,This review will be shorter than my first two....,Total cheap eat hipster restaurant. The food w...,2016-07-05,2016-05-02


##### Top users based on review count

In [23]:
# converting to pandas dataframe, users with the most reviews first
user_pd = user_df.toPandas().sort_values(by='review_count', ascending=False)
# ids of the 500 users with the highest review_count
uids_top_500 = user_pd['user_id'].iloc[:500].tolist()
user_pd.head()

Unnamed: 0,user_id,user_name,review_count,yelping_since,useful,funny,cool,fans,average_stars
54601,8RcEwGrFIgkt9WQ35E6SnQ,George,7764,2009-11-06,123,139,113,272,3.49
44835,Xwnf20FKuikiHcSpcEbpKQ,Kenneth,6653,2011-06-10,1444,1142,1167,237,3.32
22639,CxDOIDnH8gp9KXzpBHJYXw,Jennifer,5868,2009-11-09,1241,1968,959,610,3.29
48114,HFECrzYDpgbS5EmTBtj2zQ,Eric,5344,2007-03-28,3905,3876,3847,397,3.93
29361,Hi10sGSZNxQH3NLyWSZ1oA,Fox,4537,2009-05-26,40103,40030,39837,871,3.81


##### Find similarity for top 500 users (count of reviews)

In [17]:
# Empty accumulator for the similar-user results; the loop in the next cell appends to it.
sim_users_all = pd.DataFrame(columns = ['input_user_id', 'user_id', 'user_name', 'score', 'review_count'])

In [33]:
# Find the 10 most similar users for a slice of the top reviewers.
# Each user's result is collected in a list and concatenated once after the loop:
# concatenating, de-duplicating and sorting the growing frame on every iteration
# is quadratic and produces the same final frame.
iterator = 0
sim_user_frames = [sim_users_all]   # keep whatever was accumulated before this run
for iterator, uid in enumerate(uids_top_500[250:400], start=1):
    user_df.select('user_id','user_name', 'review_count') \
            .filter(user_df.user_id == uid).show(truncate=False)

    sim_users = getUserDetails(getSimilarUsers([uid], all_user_vecs, 10))
    sim_users_pd = sim_users.select('input_user_id', 'a.user_id', 'user_name', 'score','review_count').toPandas()
    sim_user_frames.append(sim_users_pd)
    print(iterator)   # progress: number of users processed so far

sim_users_all = pd.concat(sim_user_frames) \
                    .drop_duplicates() \
                    .sort_values(by = 'input_user_id')

with open("sim_users_250_400.txt", "wb") as fp:   #Pickling
    pickle.dump(sim_users_all, fp)

sim_users_all.to_csv("sim_users_250_400.csv")

+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|s19c8t_yAmFrby90LjDk8g|Jason    |940         |
+----------------------+---------+------------+



  


1
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|GGSfLyM88SbFGhnm46ykXQ|Coco     |938         |
+----------------------+---------+------------+

2
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|rqY0AQMUtoDQnfgxCNuKgw|Dan      |931         |
+----------------------+---------+------------+

3
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|j6wLUT0ZXi-x0otelYIFpA|Elaine   |930         |
+----------------------+---------+------------+

4
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|r-Ck0JPoMfHI7h2X0brm2w|Rashid   |926         |
+----------------------+---------+------------+

5
+----------------------+--

35
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|vKUlUXLVylwdttu7h1_Y5Q|Travis   |862         |
+----------------------+---------+------------+

36
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|q16FcvZyP5SURJEX8iGcTg|Angelica |862         |
+----------------------+---------+------------+

37
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|EiP1OFgs-XGcKZux0OKWIA|Allison  |861         |
+----------------------+---------+------------+

38
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|CoS9VbTzh74Hh7FSlAKKVA|Elsa     |859         |
+----------------------+---------+------------+

39
+--------------------

69
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|0FyfxrAMHm7AlrAkAXZQ9Q|Cyndi    |816         |
+----------------------+---------+------------+

70
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|y8jJV4AeH6ZEBagQ_ys8KQ|Jenny    |814         |
+----------------------+---------+------------+

71
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|C2uPDBzST_e1RBLDmgRtWg|Ross     |811         |
+----------------------+---------+------------+

72
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|CH9XhZWmcLYVAlzhwrhMvw|Jessica  |810         |
+----------------------+---------+------------+

73
+--------------------

103
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|_f9NyNygDasxm4x_8K0FMg|Jen      |770         |
+----------------------+---------+------------+

104
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|HzoQKKHDq9BI37dyJAAtGA|Jim      |769         |
+----------------------+---------+------------+

105
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|XVP0dQV-BsXKzp2KN214dg|Jess     |769         |
+----------------------+---------+------------+

106
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|FSDj4JOrZxd5Z-ax15Klgg|Reyz     |768         |
+----------------------+---------+------------+

107
+---------------

137
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|1JtpcHELRrSDoWS80WtzuQ|Jon      |729         |
+----------------------+---------+------------+

138
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|zzaq5Fn1U2Feut3dGxidNg|Marc     |727         |
+----------------------+---------+------------+

139
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|-s48ZKSA31DFNcgJLChhQQ|Cynthia  |726         |
+----------------------+---------+------------+

140
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|33h9Jv4VKbbxB42CUD9WQg|Wahed    |725         |
+----------------------+---------+------------+

141
+---------------

In [19]:
##### Load similar users file produced from above code cell
# NOTE(review): this reads "sim_users_200.txt", while the cell above writes
# "sim_users_250_400.txt" -- confirm which run's results are intended here.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
import pickle
with open("sim_users_200.txt", "rb") as fp:   # Unpickling
    sim_users_all = pickle.load(fp)

In [20]:
sim_users_all.head()

Unnamed: 0,input_user_id,user_id,user_name,score,review_count
9,-3s52C4zL_DHRK0ULG6qtg,ZlWvLV9xLBCMzow7otGeEA,Barry,0.82538,3
7,-3s52C4zL_DHRK0ULG6qtg,dsqeMUrkQZ80jfMgf0jszw,Lola,0.82247,30
6,-3s52C4zL_DHRK0ULG6qtg,FnzXRHWUl10fKq026BGGFQ,Constanza,0.823784,14
5,-3s52C4zL_DHRK0ULG6qtg,sbthWXD2VHV_jlPSRWn7RA,Laura,0.82296,20
4,-3s52C4zL_DHRK0ULG6qtg,ZDmizf31m6E5-rzE6OiGLg,Marianne,0.823582,18


In [29]:
sim_users_all.shape

(2050, 5)

In [30]:
# For every (input user, similar user) pair in sim_users_all, collect the
# businesses both users reviewed, with their stars, text and date side by side.
merged_100 = pd.DataFrame(columns = ['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y'])

# Cut the review table down once to the users that occur in sim_users_all and
# split it per user, instead of filtering the full table twice for every pair.
pair_user_ids = set(sim_users_all.input_user_id) | set(sim_users_all.user_id)
reviews_of_pair_users = review.loc[review['user_id'].isin(pair_user_ids)]
reviews_by_user = {uid: user_reviews for uid, user_reviews in reviews_of_pair_users.groupby('user_id')}
no_reviews = reviews_of_pair_users.iloc[0:0]   # empty frame, used for users without reviews

keep_cols = ['user_id_x', 'user_id_y', 'business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']
merged_frames = [merged_100]
for input_uid, similar_uid in zip(sim_users_all.input_user_id, sim_users_all.user_id):
    review_uid1 = reviews_by_user.get(input_uid, no_reviews)
    review_uid2 = reviews_by_user.get(similar_uid, no_reviews)
    merged = pd.merge(review_uid1, review_uid2, how = 'inner', on = 'business_id')
    merged_frames.append(merged[keep_cols])

# a single concat after the loop instead of growing the frame pair by pair
merged_100 = pd.concat(merged_frames)

with open("user_simil_business_ratings.txt", "wb") as fp:   #Pickling
    pickle.dump(merged_100, fp)

merged_100.to_csv("user_simil_business_ratings_250_400.csv")

In [None]:
# use this similarity matrix, recommend highly rated businesses from similar users
# high similarity can be seen, even if they have not reviewed a business in common.
# using similar model of users for businesses, can identify highly similar businesses
    # these businesses can be similar in ratings, category of business

####### To compare with Ahmed's similarity measurements

In [32]:
# test with user

uids = ['J5Eb7LhJaOa20k0ppcOCOg']
n_similar = 200   # number of similar users requested per input user

print('\ninput user details:')
user_df.select('user_id','user_name', 'review_count') \
    .filter(user_df.user_id.isin(uids)).show(truncate=False)

# get the top n_similar similar users
sim_users = getUserDetails(getSimilarUsers(uids, all_user_vecs, n_similar))

# the message reports the limit actually requested; the inputs are users, not restaurants
print('Top %d similar users for each input user are:' % n_similar)
sim_users.select('input_user_id', 'a.user_id', 'user_name', 'score','review_count').toPandas()


input user details:
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|J5Eb7LhJaOa20k0ppcOCOg|Alek     |34          |
+----------------------+---------+------------+



  


Top 10 similar Users for each input restaurant are:"


Unnamed: 0,input_user_id,user_id,user_name,score,review_count
0,J5Eb7LhJaOa20k0ppcOCOg,vtoiZ0SFtO51-xvEVTdIYQ,David,0.942662,44
1,J5Eb7LhJaOa20k0ppcOCOg,iwDP-y8NJG3Vem2z242LDg,CalamityJane,0.938837,89
2,J5Eb7LhJaOa20k0ppcOCOg,Np-Eumi0cPgX7S2LWOC57A,Rebecca,0.935694,24
3,J5Eb7LhJaOa20k0ppcOCOg,5InzuFd9Kq6uk22a3DIfgQ,Kelly,0.935056,40
4,J5Eb7LhJaOa20k0ppcOCOg,qZa2urX00YWHs-jRcWgUqw,Silas,0.932498,73
5,J5Eb7LhJaOa20k0ppcOCOg,Tbr5YHoLq1IlP1QkNZ2EeA,B.,0.931331,33
6,J5Eb7LhJaOa20k0ppcOCOg,nXehm6zt294ZswIgh5LpbA,Brandon,0.943381,148
7,J5Eb7LhJaOa20k0ppcOCOg,Eb6TPevViIzBy8ApjPtuqw,Inae,0.929516,24
8,J5Eb7LhJaOa20k0ppcOCOg,k9iqO8atK7BC4f8RcSXi-g,Esther,0.935531,42
9,J5Eb7LhJaOa20k0ppcOCOg,SkpCulULgiBfLc4lV8vx2g,Stephanie,0.936608,374


In [27]:
sim_users.select('input_user_id', 'a.user_id', 'user_name', 'score','review_count').toPandas().sort_values(by = 'input_user_id')

Unnamed: 0,input_user_id,user_id,user_name,score,review_count
19,CxDOIDnH8gp9KXzpBHJYXw,ma6206bmu-a_Ja7Iv-yRCw,Sharon,0.953301,232
2,CxDOIDnH8gp9KXzpBHJYXw,DNlzbI54zCOa4to-vTWBJQ,Karen,0.956598,395
16,CxDOIDnH8gp9KXzpBHJYXw,4D6LLuJfao_eHGA6XZR-bA,Jackie,0.951405,219
15,CxDOIDnH8gp9KXzpBHJYXw,QPJJohtGqkMkaN0Gt3TRIg,Cherrie,0.954686,235
5,CxDOIDnH8gp9KXzpBHJYXw,xC-q_yh0XwcjRLimkS3RNg,Jonny,0.954941,730
14,CxDOIDnH8gp9KXzpBHJYXw,LQG9wFTmscbAATp8QWj1hg,Kathy,0.956582,501
7,CxDOIDnH8gp9KXzpBHJYXw,IhjiCSCiaefHE-4FAvSOkA,Janice,0.957189,59
8,CxDOIDnH8gp9KXzpBHJYXw,Wu0yySWcHQ5tZ_59HNiamg,WaYnE,0.960825,1083
13,CxDOIDnH8gp9KXzpBHJYXw,KBh4r16e9Grc1HI9pG4wTg,Rainie,0.953038,450
12,CxDOIDnH8gp9KXzpBHJYXw,vZEY5WEJ9sRxTGhm97132A,Adrienne,0.951509,64
