In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [5]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas and scikit-learn
# deprecation / chained-assignment warnings from later cells — consider
# narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

## Using Latent Factor Collaborative Filtering

In [6]:
# Load the review table; the preview below shows the columns review_id,
# user_id, business_id, text, stars and date.
df_user = pd.read_csv('yelp_review_arizona.csv')

In [7]:
# Load business metadata; later cells read its business_id, name, categories,
# stars and review_count columns when printing recommendations.
df_business = pd.read_csv('yelp_business.csv')

In [8]:
df_user.head()

Unnamed: 0,review_id,user_id,business_id,text,stars,date
0,V93SYj2OLh5m9Cquzf-7kg,ZwVz20be-hOZnyAbevyMyQ,2c9Vptks_vowLgVUMnCgjw,Came here while in town for a country concert....,4.0,2013-09-04 01:29:46
1,vNTFadc6T9HeH3Qa78dc_Q,91TB-gzcNyxFh46TL0pmnQ,6nKR80xEGHYf2UxAe_Cu_g,Best barbecue this side of the Mississippi!!!!...,5.0,2015-12-05 02:50:10
2,SXRFBCt5eXCBF7TlI7UG6Q,Y_QBiZpATJoz8hKUfYF66A,fbQaKW0Lte0JQ_opbnjdKg,Absolutely amazing. Think Chipotle for enchila...,5.0,2014-04-01 01:56:00
3,CqMNjtG0hNZGhDw4RDE-zw,_Jg-IA0M-GSjBlGu-wmejg,r8764MtYyt8JhxMvrfM_xQ,I was really disappointed with my most recent ...,2.0,2014-10-11 03:53:53
4,5hZLouGEW4wm6BTJ5aNUNw,1CqkFliipv_X15WYn5aPfg,QS3QxI7u5PRdtbGgI0-UsA,I grade sushi restaurants on 3 factors:\n- Qua...,4.0,2015-03-04 19:36:21


In [9]:
# Select ratings and text from the review dataset.
# .copy() makes review_data an independent frame, so the later assignment to
# its 'text' column is an unambiguous write to this frame (no
# SettingWithCopyWarning / chained-assignment ambiguity with df_user).
review_data = df_user[['user_id', 'business_id', 'stars', 'text']].copy()

In [10]:
# Make sure the NLTK stop-word corpus is available locally before it is read.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mangesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
import string
from nltk.corpus import stopwords

# English stop words with every punctuation character removed, so that they
# compare equal to review tokens once the same punctuation stripping has been
# applied to the review text.
_strip_punct = str.maketrans('', '', string.punctuation)
stop = [entry.translate(_strip_punct) for entry in stopwords.words('english')]

In [12]:
def review_text_process(rawtext, stop_words=None):
    """
    Clean one review: strip punctuation, then drop stop words.

    Parameters
    ----------
    rawtext : str
        Raw review text. ``None`` and NaN (missing reviews) yield ``""``;
        any other non-string value is converted with ``str()`` first.
    stop_words : iterable of str, optional
        Lower-cased words to remove. When omitted, the module-level ``stop``
        list is used (the original behaviour).

    Returns
    -------
    str
        The surviving words, in their original casing, joined by single
        spaces (a string, not a list).
    """
    # Missing reviews arrive from pandas as None or float NaN (NaN != NaN).
    if rawtext is None or (isinstance(rawtext, float) and rawtext != rawtext):
        return ""
    if not isinstance(rawtext, str):
        rawtext = str(rawtext)

    # Hash-based membership: one O(1) lookup per word instead of scanning a list.
    blocked = frozenset(stop if stop_words is None else stop_words)

    # Delete every punctuation character in a single pass.
    nopunc = rawtext.translate(str.maketrans('', '', string.punctuation))

    # Stop words are matched case-insensitively; kept words retain their casing.
    return " ".join(word for word in nopunc.split() if word.lower() not in blocked)

In [13]:
# Replace each review's text with the cleaned string returned by
# review_text_process (punctuation and stop words removed).
review_data['text'] = review_data['text'].apply(review_text_process)

In [14]:
# Split train / validation data for testing the model later.
# Both inputs now come from review_data so text and label rows are taken from
# the same frame, and random_state pins the shuffle so Restart & Run All
# reproduces the same split.
# NOTE(review): none of the four resulting variables is read by the cells
# visible in this notebook.
valid_size = 0.15
a_train, a_valid, b_train, b_valid = train_test_split(
    review_data['text'],
    review_data['business_id'],
    test_size=valid_size,
    random_state=42,
)

In [15]:
# Two projections of the cleaned reviews: review text keyed by its author and
# review text keyed by the reviewed business.
df_userid = review_data[['user_id','text']]
df_businessid = review_data[['business_id', 'text']]

In [16]:
df_userid.head()

Unnamed: 0,user_id,text
0,ZwVz20be-hOZnyAbevyMyQ,Came town country concert better way start day...
1,91TB-gzcNyxFh46TL0pmnQ,Best barbecue side Mississippi Come car washed...
2,Y_QBiZpATJoz8hKUfYF66A,Absolutely amazing Think Chipotle enchiladas N...
3,_Jg-IA0M-GSjBlGu-wmejg,really disappointed recent visit Vintage 95 ma...
4,1CqkFliipv_X15WYn5aPfg,grade sushi restaurants 3 factors Quality Exce...


In [17]:
df_businessid.head()

Unnamed: 0,business_id,text
0,2c9Vptks_vowLgVUMnCgjw,Came town country concert better way start day...
1,6nKR80xEGHYf2UxAe_Cu_g,Best barbecue side Mississippi Come car washed...
2,fbQaKW0Lte0JQ_opbnjdKg,Absolutely amazing Think Chipotle enchiladas N...
3,r8764MtYyt8JhxMvrfM_xQ,really disappointed recent visit Vintage 95 ma...
4,QS3QxI7u5PRdtbGgI0-UsA,grade sushi restaurants 3 factors Quality Exce...


In [18]:
# Collapse to one document per user and one per business by space-joining all
# review texts that share the key; the grouping key becomes the index and the
# joined document stays in a single 'text' column.
df_userid = df_userid.groupby('user_id')['text'].agg(' '.join).to_frame()
df_businessid = df_businessid.groupby('business_id')['text'].agg(' '.join).to_frame()

In [19]:
df_userid.head()

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
--2HUmLkcNHZp0xw6AMBPg,place JAM Surfer vibe great eats love machaca ...
--4rAAfZnEIAKJE80aIiYg,pulled pork spicy bbq sauce impressed Probably...
--Nnm_506G_p8MxAOQna5w,Cant say burger anything special Taste ok Shak...
--ty7Z9fEt08E3dS3_qoSA,know think important trust Yelp kind reviews t...
-0IiMAZI2SsQ7VmyzJjokQ,Ever fan roadside attractions Americana Id rea...


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
#userid vectorizer
# TF-IDF over the per-user documents, tokenised with NLTK's WordPunctTokenizer
# and capped at 3000 features; u_vectors has one row per user document.
u_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=3000)
u_vectors = u_vectorizer.fit_transform(df_userid['text'])
print('The size of the user-vector matrix for is ',u_vectors.shape)

The size of the user-vector matrix for is  (10937, 3000)


In [21]:
u_vectors

<10937x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 1015365 stored elements in Compressed Sparse Row format>

In [23]:
#Business id vectorizer
# Reuse the vocabulary learned by the user vectorizer so that column k means
# the same token in both matrices. Later cells combine user/query vectors with
# business vectors positionally (np.dot), which is only meaningful when the
# feature columns line up. With an explicit vocabulary the feature count is
# fixed by that vocabulary, so max_features is no longer passed.
b_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, vocabulary=u_vectorizer.vocabulary_)
b_vectors = b_vectorizer.fit_transform(df_businessid['text'])
print('The size of the restaurant-vector matrix for is ',b_vectors.shape)

The size of the restaurant-vector matrix for is  (1411, 3000)


In [24]:
# Users x businesses table of star ratings. Pairs with no review are NaN, and
# repeated user/business pairs are combined by pivot_table's default
# aggregation (mean).
user_rating_matrix = review_data.pivot_table(
    values='stars',
    index=['user_id'],
    columns=['business_id'],
)
user_rating_matrix.shape

(10937, 1411)

In [25]:
user_rating_matrix.head()

business_id,-050d_XIor1NpCuWkbIVaQ,-1UMR00eXtwaeh59pEiDjA,-4TMQnQJW1yd6NqGRDvAeA,-6h3K1hj0d4DRcZNUtHDuw,-8QlV3b_9H4BAh6LgMIr1g,-9eNGMp8XiygI8t8QFuFWw,-9nai28tnoylwViuJVrYEQ,-Bdw-5H5C4AYSMGnAvmnzw,-BxWyEIQ6wypT-37MzZizQ,-CfFjcCcGGDM9MVH_d42RQ,...,z_lDO8d8nkSmcvTjB4N69A,za9qr9ZZWLTfEgTfogRbUw,zbrFk-4ejesAJD8EwcdHxg,zfiSQ1dl3vTJ-og96eqXGA,zidkKI_N1OPxsiddTOQH_Q,zqNgwQjj0_XAll-neGikIw,zr93wrNyXzc-HW4IcK4iRQ,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zuVvDYJkKAbXQTTBauAqJQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--2HUmLkcNHZp0xw6AMBPg,,,,,,,,,,,...,,,,,,,,,,
--4rAAfZnEIAKJE80aIiYg,,,,,,,,,,,...,,,,,,,,,,
--Nnm_506G_p8MxAOQna5w,,,,,,,,,,,...,,,,,,,,,,
--ty7Z9fEt08E3dS3_qoSA,,,,,,,,,,,...,,,,,,,,,,
-0IiMAZI2SsQ7VmyzJjokQ,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Dense, labelled versions of the TF-IDF matrices: rows are user / business
# ids, columns are the vectorizer's tokens.
def _vocab_tokens(vectorizer):
    """Return the vectorizer's feature names in column order.

    get_feature_names() was removed in scikit-learn 1.2 in favour of
    get_feature_names_out(); the old method is used only when the new one
    does not exist, so both old and new installs work.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return list(getter())

U = pd.DataFrame(u_vectors.toarray(), index=df_userid.index, columns=_vocab_tokens(u_vectorizer))
B = pd.DataFrame(b_vectors.toarray(), index=df_businessid.index, columns=_vocab_tokens(b_vectorizer))

In [27]:
B.head()

Unnamed: 0_level_0,1,10,100,1000,11,12,13,14,15,150,...,yet,yogurt,york,young,younger,yum,yummy,zero,zipps,zucchini
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-050d_XIor1NpCuWkbIVaQ,0.008448,0.007774,0.019228,0.0,0.0,0.0,0.0,0.0,0.00945,0.0,...,0.012832,0.0,0.007973,0.0,0.0,0.010097,0.024571,0.0,0.0,0.0
-1UMR00eXtwaeh59pEiDjA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-4TMQnQJW1yd6NqGRDvAeA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.029474,0.0,0.0,0.0
-6h3K1hj0d4DRcZNUtHDuw,0.029745,0.027372,0.0,0.0,0.0,0.0,0.0,0.0,0.016637,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017776,0.0,0.0,0.0,0.0
-8QlV3b_9H4BAh6LgMIr1g,0.0,0.017901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.019699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
U.head()

Unnamed: 0_level_0,1,10,100,1000,11,12,13,14,15,150,...,yet,yogurt,york,young,younger,yum,yummy,zero,zipps,zucchini
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--2HUmLkcNHZp0xw6AMBPg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4rAAfZnEIAKJE80aIiYg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--Nnm_506G_p8MxAOQna5w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ty7Z9fEt08E3dS3_qoSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210428,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0IiMAZI2SsQ7VmyzJjokQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Gradient descent optimization algorithm

def Vector_factors(R, U, B, steps=20, gamma=0.002,lamda=0.04):
    """
    Fit user and business factor vectors to the observed ratings with SGD.

    Only cells of ``R`` that are ``> 0`` count as observed (NaN cells are
    skipped). For each observed rating ``r``, visited in row-major order of
    ``R``, with user vector ``u`` and business vector ``b``::

        e = r - u . b
        u <- u + gamma * (e * b - lamda * u)
        b <- b + gamma * (e * u - lamda * b)    # uses the just-updated u

    After every epoch the regularised squared error over the observed
    ratings is computed and training stops early once it drops below 0.001.

    The update rule and visiting order match the previous implementation; the
    difference is that only observed ratings are visited and the arithmetic
    runs on NumPy arrays, instead of probing every user x business cell
    through ``DataFrame.loc`` on each epoch.

    Parameters
    ----------
    R : pandas.DataFrame
        Rating matrix, users on the index and businesses on the columns.
    U : pandas.DataFrame
        One factor row per user, looked up by the labels of ``R.index``.
    B : pandas.DataFrame
        One factor row per business, looked up by the labels of ``R.columns``.
        Must have as many columns as ``U``; vectors are combined positionally.
    steps : int
        Maximum number of passes over the observed ratings.
    gamma : float
        Learning rate.
    lamda : float
        L2 regularisation weight.

    Returns
    -------
    (U, B)
        The same two DataFrame objects that were passed in; their values are
        overwritten in place with the trained factors.

    Raises
    ------
    ValueError
        If ``U`` and ``B`` have a different number of columns.
    KeyError
        If a user or business with an observed rating has no row in ``U`` / ``B``.
    """
    if U.shape[1] != B.shape[1]:
        raise ValueError(
            "U and B must have the same number of factor columns, got %d and %d"
            % (U.shape[1], B.shape[1])
        )

    ratings = R.to_numpy(dtype=float)
    # NaN > 0 is False, so unrated cells drop out here; nonzero() walks the
    # matrix row by row, i.e. the same order as the nested index/columns loops.
    with np.errstate(invalid='ignore'):
        obs_rows, obs_cols = np.nonzero(ratings > 0)

    # Translate R's labels into row positions of U and B (-1 marks "missing").
    u_pos = U.index.get_indexer(R.index)[obs_rows]
    b_pos = B.index.get_indexer(R.columns)[obs_cols]
    if (u_pos < 0).any():
        raise KeyError("R contains rated users that have no row in U")
    if (b_pos < 0).any():
        raise KeyError("R contains rated businesses that have no row in B")

    observed = list(zip(u_pos.tolist(), b_pos.tolist(), ratings[obs_rows, obs_cols].tolist()))

    # Work on private float copies; results are written back once at the end.
    u_mat = U.to_numpy(dtype=float, copy=True)
    b_mat = B.to_numpy(dtype=float, copy=True)

    for _ in range(steps):
        for ui, bj, rating in observed:
            u_vec = u_mat[ui]   # views: the in-place += below updates u_mat / b_mat
            b_vec = b_mat[bj]
            err = rating - np.dot(u_vec, b_vec)
            u_vec += gamma * (err * b_vec - lamda * u_vec)
            b_vec += gamma * (err * u_vec - lamda * b_vec)

        # Regularised squared error over the observed ratings, accumulated per
        # rating to avoid materialising an (n_observed x n_factors) array.
        loss = 0.0
        for ui, bj, rating in observed:
            u_vec = u_mat[ui]
            b_vec = b_mat[bj]
            residual = rating - np.dot(u_vec, b_vec)
            loss += residual * residual + lamda * (np.dot(u_vec, u_vec) + np.dot(b_vec, b_vec))
        if loss < 0.001:
            break

    # Preserve the in-place contract: callers holding U / B see the new values.
    U.iloc[:, :] = u_mat
    B.iloc[:, :] = b_mat
    return U,B

In [None]:
%%time
# Run up to 20 passes of Vector_factors over the rating matrix and rebind U and
# B to the frames it returns.
U, B = Vector_factors(user_rating_matrix, U, B, steps=20, gamma=0.002,lamda=0.04)

In [27]:
B.head()


Unnamed: 0_level_0,0,1,10,100,1000,101,1015,1030,10pm,11,...,z,zero,zest,zinburger,zinc,zipps,ziti,zookz,ztejas,zucchini
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-050d_XIor1NpCuWkbIVaQ,0.0,0.008223,0.007567,0.018716,0.0,0.0,0.0,0.01094,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1UMR00eXtwaeh59pEiDjA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-4TMQnQJW1yd6NqGRDvAeA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-6h3K1hj0d4DRcZNUtHDuw,0.0,0.028893,0.026588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-8QlV3b_9H4BAh6LgMIr1g,0.0,0.0,0.017147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
U.head()

Unnamed: 0_level_0,0,1,10,100,1000,101,1015,1030,10pm,11,...,z,zero,zest,zinburger,zinc,zipps,ziti,zookz,ztejas,zucchini
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--2HUmLkcNHZp0xw6AMBPg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4rAAfZnEIAKJE80aIiYg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--Nnm_506G_p8MxAOQna5w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ty7Z9fEt08E3dS3_qoSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0IiMAZI2SsQ7VmyzJjokQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
B.iloc[0].sort_values(ascending=False).head(10)

matts        0.561022
breakfast    0.315551
wait         0.171394
eggs         0.165935
browns       0.135001
food         0.133411
bacon        0.129344
toast        0.128572
hash         0.116984
place        0.116764
Name: -050d_XIor1NpCuWkbIVaQ, dtype: float64

In [30]:
U.iloc[0].sort_values(ascending=False).head(10)

love        0.262294
vibe        0.240185
fresh       0.237162
place       0.224294
earthy      0.208654
machaca     0.192679
oj          0.192118
squeezed    0.174796
jam         0.166092
eats        0.165160
Name: --2HUmLkcNHZp0xw6AMBPg, dtype: float64

In [34]:
search = "Looking for american breakfast"

# Turn the query into a TF-IDF vector with the same cleaning function and the
# same fitted vectorizer that produced the user vectors.
prediction_df= pd.DataFrame([search], columns=['text'])
prediction_df['text'] = prediction_df['text'].apply(review_text_process)
prediction_vectors = u_vectorizer.transform(prediction_df['text'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only where the new one does not exist.
_feature_names = getattr(u_vectorizer, 'get_feature_names_out', None) or u_vectorizer.get_feature_names
prediction_v_df = pd.DataFrame(prediction_vectors.toarray(), index=prediction_df.index, columns=_feature_names())

# Score every business as the (positional) dot product of the query vector
# with that business's factor row, then keep the 7 highest scores.
predictRating_i=pd.DataFrame(np.dot(prediction_v_df.loc[0],B.T),index=B.index,columns=['Rating'])
topRecommendations = predictRating_i.sort_values('Rating', ascending=False).head(7)

for i in topRecommendations.index:
    # One filter of df_business per recommendation instead of four identical scans.
    info = df_business.loc[df_business['business_id'] == i, ['name', 'categories', 'stars', 'review_count']].iloc[0]
    print(info['name'])
    print(info['categories'])
    print(str(info['stars'])+ ' '+str(info['review_count']))
    print('')

Matt's Big Breakfast
Breakfast & Brunch, Restaurants
4.0 335

Hangar Cafe
Breakfast & Brunch, Restaurants
4.0 287

Breakfast Club- Scottsdale
Coffee & Tea, Turkish, American (Traditional), Restaurants, Food, Sandwiches, Breakfast & Brunch
4.0 1094

Breakfast Club- CityScape
Food, Coffee & Tea, Restaurants, American (Traditional), Breakfast & Brunch, Diners
4.0 559

Bisbee Breakfast Club
Restaurants, Bakeries, Breakfast & Brunch, Sandwiches, Food
3.5 208

Scramble A Breakfast Joint
Vegan, Restaurants, Sandwiches, Breakfast & Brunch, Gluten-Free, Pizza
4.0 840

Harlow's Cafe
Burgers, Restaurants, Breakfast & Brunch, American (Traditional)
4.0 470



## Using Neighbourhood Method

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#processing input text

# Clean and vectorise the query with the fitted user vectorizer so it shares
# the feature columns of the TF-IDF matrices.
words = "Looking for american breakfast"
test= pd.DataFrame([words], columns=['text'])
test['text'] = test['text'].apply(review_text_process)
test_vector = u_vectorizer.transform(test['text'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only where the new one does not exist.
_feature_names = getattr(u_vectorizer, 'get_feature_names_out', None) or u_vectorizer.get_feature_names
vector_df = pd.DataFrame(test_vector.toarray(), index=test.index, columns=_feature_names())
test_input = vector_df

In [None]:
#calculating similarity scores

# The previous per-row reshape(1, 5000) hard-coded a feature count that
# disagrees with the 3000-feature vectorizers fitted above and raised
# ValueError. Let NumPy infer the width instead, and compare the query with
# every business row in a single cosine_similarity call.
aa = np.asarray(test_input, dtype=float).reshape(1, -1)

# index[k] / similarity[k] refer to the k-th row of B; scores are plain floats
# so they can be sorted directly.
index = list(range(len(B)))
similarity = cosine_similarity(aa, B.to_numpy(dtype=float)).ravel().tolist()

In [None]:
#calculating average ratings for each restaurant
business_ratings = review_data[['business_id','stars']]
# The string 'mean' selects pandas' own group mean; passing the NumPy callable
# to agg() is deprecated in recent pandas releases.
testing = business_ratings.groupby('business_id').agg('mean').reset_index()

#combining ratings and similarities for each restaurant
# similarity[k] belongs to the k-th row of B, so attach the scores by business
# id rather than relying on both tables being in the same order. Flattening
# also turns per-row (1, 1) arrays into plain floats so the column sorts
# numerically.
similarity_by_business = pd.Series(np.asarray(similarity, dtype=float).reshape(-1), index=B.index)
testing['similarity'] = testing['business_id'].map(similarity_by_business)
testing.head()

In [None]:
#generating final recommendations

# Keep the 7 businesses most similar to the query, then list them from the
# highest to the lowest average star rating.
topRecommendations = testing.sort_values('similarity', ascending=False).head(7)
final = topRecommendations.sort_values('stars', ascending=False)
for i in final['business_id']:
    # One filter of df_business per recommendation instead of four identical scans.
    info = df_business.loc[df_business['business_id'] == i, ['name', 'categories', 'stars', 'review_count']].iloc[0]
    print(info['name'])
    print(info['categories'])
    print(str(info['stars'])+ ' '+str(info['review_count']))
    print('')