In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [None]:
import warnings
# Silence every warning for the rest of the session. This also hides any
# deprecation or chained-assignment warnings that later cells may raise, so
# comment this line out while debugging.
warnings.filterwarnings('ignore')

In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV files stored under
# '/content/drive/My Drive/...' can be read with pandas.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show the current working directory (IPython automagic for %pwd).
pwd

'/content'

In [None]:
# Keep the Drive folder in one place so the notebook can be pointed at another
# copy of the data by editing a single line.
DATA_DIR = '/content/drive/My Drive/ADM_Final'

# Review table and business table; both carry a 'business_id' column that is
# used as the join key further down.
review = pd.read_csv(DATA_DIR + '/yelp_review.csv')
business = pd.read_csv(DATA_DIR + '/yelp_business_new.csv')


In [None]:
# Inner join: keep only the reviews whose business_id also appears in `business`.
# Columns present in both tables receive the default _x / _y suffixes.
review_new = review.merge(business, how='inner', on='business_id')


In [None]:
# List the columns of the raw review table.
review.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'date', 'text',
       'useful', 'funny', 'cool'],
      dtype='object')

In [None]:
# Discard the business-side columns that came in through the merge; only the
# review fields are used from here on.
business_only_columns = [
    'name', 'neighborhood', 'address', 'city', 'state', 'postal_code',
    'latitude', 'longitude', 'stars_y', 'review_count', 'is_open', 'categories',
]
review_new = review_new.drop(columns=business_only_columns)

In [None]:
# Display the merged frame (the notebook renders a truncated view of it).
review_new

Unnamed: 0,review_id,user_id,business_id,stars_x,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,vm1b1keOzwHjtGZEPPuYXA,xYciRtVZ1PW4IxSX4oJ1aw,AEx2SYEUJmTxVVB18LlCwA,5,2016-02-22,*The shop was featured on Mind of a Chef with ...,4,3,4
2,SXwA9KZ-Nc_hMARk_3cJ7g,5Ymfsf9fAYz-Ds_p0xawVQ,AEx2SYEUJmTxVVB18LlCwA,5,2013-03-29,"As its name suggests, Wilensky's is a lunch co...",9,0,3
3,jUzausdZ_ujqe_n8BlBj-g,DVOOF0Z627DyrZ4XKQbTgA,AEx2SYEUJmTxVVB18LlCwA,5,2017-08-08,"Stopped by for the Wilensky's Special, loved i...",0,0,0
4,oCRDwF3tszAkeszSfxwthg,5JoKz3mU42Cp906KRXDwJw,AEx2SYEUJmTxVVB18LlCwA,4,2009-01-17,"I have to admit, I am a fan. Just entering the...",4,0,1
...,...,...,...,...,...,...,...,...,...
3175472,75rQjSash8QMsaBHmwFDaQ,6I1KT1SCGbAaTzTIMqF_eQ,pHq_hlWqlB0QKexF7kO2mg,1,2008-08-23,"Look, I don't have a problem with the concept....",6,7,5
3175473,99hWXPtbpzXXbsL43RYuKA,sTcYq6goD1Fa2WS9MSkSvQ,pHq_hlWqlB0QKexF7kO2mg,1,2008-06-20,"What genius came up with this idea? Let's see,...",4,5,1
3175474,jMTIHP0bjofYRdJeQi4UJA,9IRuYmy5YmhtNQ6ei1p-uQ,tZYFHe0R0Gux_2Qt7kfOvA,4,2011-02-22,If you're looking for homecooking in the Kanna...,9,0,2
3175475,v0oh69A-IpFvowp138SaCg,NInHA5Rlv11P1W04pt36bA,tZYFHe0R0Gux_2Qt7kfOvA,4,2013-02-15,"Moved to the area in Nov. 2012, and pass by th...",0,0,0


In [None]:
# Count the missing values in each column of the merged review table.
review_new.isna().sum()

review_id      0
user_id        0
business_id    0
stars_x        0
date           0
text           0
useful         0
funny          0
cool           0
dtype: int64

In [None]:
# Preview the first five rows of the merged review table.
review_new.head()

Unnamed: 0,review_id,user_id,business_id,stars_x,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,vm1b1keOzwHjtGZEPPuYXA,xYciRtVZ1PW4IxSX4oJ1aw,AEx2SYEUJmTxVVB18LlCwA,5,2016-02-22,*The shop was featured on Mind of a Chef with ...,4,3,4
2,SXwA9KZ-Nc_hMARk_3cJ7g,5Ymfsf9fAYz-Ds_p0xawVQ,AEx2SYEUJmTxVVB18LlCwA,5,2013-03-29,"As its name suggests, Wilensky's is a lunch co...",9,0,3
3,jUzausdZ_ujqe_n8BlBj-g,DVOOF0Z627DyrZ4XKQbTgA,AEx2SYEUJmTxVVB18LlCwA,5,2017-08-08,"Stopped by for the Wilensky's Special, loved i...",0,0,0
4,oCRDwF3tszAkeszSfxwthg,5JoKz3mU42Cp906KRXDwJw,AEx2SYEUJmTxVVB18LlCwA,4,2009-01-17,"I have to admit, I am a fan. Just entering the...",4,0,1


In [None]:
import string
import re
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words("english"))


def clean_text(text, stops=None):
    """Normalise one review so it can be fed to the TF-IDF vectorizers.

    Steps, in order: lower-case and split on whitespace; drop stop words and
    tokens shorter than three characters; replace characters outside a small
    whitelist with spaces; expand common English contractions; pad or strip
    the remaining punctuation; collapse runs of whitespace.

    Args:
        text: Raw review text (``str``).
        stops: Optional collection of lower-case stop words. When omitted,
            NLTK's English list is used (the ``stopwords`` corpus must have
            been downloaded before the first call).

    Returns:
        The cleaned text as a single string. It is not stripped, so a single
        leading or trailing space may remain.
    """
    if stops is None:
        stops = _english_stopwords()

    # The previous version started with text.translate(string.punctuation).
    # On Python 3 str.translate indexes its argument by code point, so that
    # call rewrote the control characters 0-31 (e.g. '\n' -> '+', '\t' -> '*')
    # and left punctuation untouched. Newlines then survived as spurious '+'
    # tokens. split() below already treats newlines and tabs as separators, so
    # the call is simply dropped.

    ## Convert words to lower case, split them and remove stop words
    tokens = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(tokens)

    # Clean the text
    # NOTE: inside the character class `+-=` is a range (0x2B-0x3D), so besides
    # '+', '-' and '=' it also keeps ':', ';' and '<'. Left as is because the
    # fitted vocabulary depends on it.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # "15k" -> "15000"
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): in a regex `\0` is the NUL character, so this matches
    # "\x00s", which cannot occur after the whitelist substitution above.
    # Possibly "0s" was intended - confirm before changing.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [None]:
# Working subset: the two identifiers, the star rating and the review text.
model_columns = ['business_id', 'user_id', 'stars_x', 'text']
yelp_data = review_new[model_columns]

In [None]:
import nltk
# Fetch NLTK's stop-word corpus, which clean_text reads through
# stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# NOTE(review): `iterrows` is referenced here without being called, so the cell
# only displays the bound-method repr; this looks like leftover debugging.
yelp_data.iterrows

<bound method DataFrame.iterrows of                     business_id  ...                                               text
0        AEx2SYEUJmTxVVB18LlCwA  ...  Super simple place but amazing nonetheless. It...
1        AEx2SYEUJmTxVVB18LlCwA  ...  *The shop was featured on Mind of a Chef with ...
2        AEx2SYEUJmTxVVB18LlCwA  ...  As its name suggests, Wilensky's is a lunch co...
3        AEx2SYEUJmTxVVB18LlCwA  ...  Stopped by for the Wilensky's Special, loved i...
4        AEx2SYEUJmTxVVB18LlCwA  ...  I have to admit, I am a fan. Just entering the...
...                         ...  ...                                                ...
3175472  pHq_hlWqlB0QKexF7kO2mg  ...  Look, I don't have a problem with the concept....
3175473  pHq_hlWqlB0QKexF7kO2mg  ...  What genius came up with this idea? Let's see,...
3175474  tZYFHe0R0Gux_2Qt7kfOvA  ...  If you're looking for homecooking in the Kanna...
3175475  tZYFHe0R0Gux_2Qt7kfOvA  ...  Moved to the area in Nov. 2012, and pass by th

In [None]:
# Restrict the working data to the first 100,000 reviews.
# `.copy()` makes this an independent frame, so the later assignment to its
# 'text' column does not write into a slice of another frame (the source of
# pandas' SettingWithCopyWarning).
yelp_data = yelp_data.iloc[:100000].copy()

In [None]:
%%time
yelp_data['text'] = yelp_data['text'].apply(clean_text)

CPU times: user 29.2 s, sys: 2.06 s, total: 31.3 s
Wall time: 31.2 s


In [None]:
# Truncate review_new to its first 100,000 rows as well, matching yelp_data.
review_new = review_new.iloc[:100000]

In [None]:
# Split train / validation.
# * The labels are read from yelp_data itself, so texts and targets always come
#   from the same frame instead of relying on review_new having been truncated
#   to the same rows separately.
# * random_state pins the shuffle so the reported accuracy is reproducible.
# NOTE(review): no later cell visible here uses X_train / y_train; the
# vectorizers and the factor matrices are fitted on all of yelp_data, so the
# validation rows are seen during fitting.
vld_size = 0.15
X_train, X_valid, y_train, y_valid = train_test_split(
    yelp_data['text'],
    yelp_data['business_id'],
    test_size=vld_size,
    random_state=42,
)

In [None]:
# Two (id, text) views of the same reviews: one keyed by reviewer, one by business.
userid_df = yelp_data.loc[:, ['user_id', 'text']]
business_df = yelp_data.loc[:, ['business_id', 'text']]

In [None]:
# Concatenate all review texts per user / per business into one space-separated
# document; the result is indexed by the grouping key.
concat_reviews = {'text': ' '.join}
userid_df = userid_df.groupby('user_id').agg(concat_reviews)
business_df = business_df.groupby('business_id').agg(concat_reviews)

In [None]:
# User-side TF-IDF: one document per user, vocabulary capped at 1000 terms,
# tokenised with NLTK's WordPunctTokenizer.
user_tokenize = WordPunctTokenizer().tokenize
userid_vectorizer = TfidfVectorizer(
    tokenizer=user_tokenize,
    max_features=1000,
)
userid_vectors = userid_vectorizer.fit_transform(userid_df['text'])
userid_vectors.shape

(74505, 1000)

In [None]:
# Business-side TF-IDF: one document per business, vocabulary capped at 1000
# terms, tokenised with NLTK's WordPunctTokenizer.
business_tokenize = WordPunctTokenizer().tokenize
businessid_vectorizer = TfidfVectorizer(
    tokenizer=business_tokenize,
    max_features=1000,
)
businessid_vectors = businessid_vectorizer.fit_transform(business_df['text'])
businessid_vectors.shape

(289, 1000)

# Matrix Factorization

In [None]:
# User x business ratings table: rows are users, columns are businesses, and
# each cell holds the mean star rating (pivot_table's default aggregation) or
# NaN where the user has no review of that business.
userid_rating_matrix = yelp_data.pivot_table(
    index='user_id',
    columns='business_id',
    values='stars_x',
)
userid_rating_matrix.shape

(74505, 289)

In [None]:
# Peek at the first rows of the user x business ratings table.
userid_rating_matrix.head()

business_id,-P8dGzSVhJi-5oZ-8U2y0w,0-yj2jtzLUHG2b7PpEHyog,0AQnRQw34IQW9-1gJkYnMA,0H8PL4trSvZFYgPpvSOCjQ,0NmTwqYEQiKErDv4a55obg,0QzCeORfF8EY34UODWRV9A,0Rni7ocMC_Lg2UH0lDeKMQ,0W4lkclzZThpx3V65bVgig,0d0i0FaJq1GIeW1rS2D-5w,1JF9TbJ2d5hH8xsQvvklHg,1jNteKQ2JuF6Sk1SI9X23Q,28adZ4lsuUeVB2aWzohK9g,2IvrdAb6zdxr3ZqplqJHbg,2LZGeJy8qByYKB71ML-jcw,2sx52lDoiEtef7xgPCaoBw,3QcTqsdxQBlSndBdMJBRhQ,3RlylOY452bA8rwliPUeUQ,3vs6rARRBm-O99y4bryilw,4KHo1LnkM69RtWTDBdG16g,4_GIJk0tX3k0x0FcUv4sNA,4mb32UmQULqg7IMck28vog,5Dccw539NNtnyCgTu5ed5A,5N8R7ALESZ30EoAzVJtabw,5jTmjxb1X34EfcY1gos4tw,5r6-G9C4YLbC7Ziz57l3rQ,60uVlCUiLJvq3xNo_3bhQQ,6Toez65egZSMV1iohQWGIQ,6Vyp-9i1jzwK_bvisuPJpQ,6eaQfA4WDBhZ-YNGeJfDeg,7Uti5EeAwm3drG14KVrE3Q,7_F6dA9xh2lydTtr1LCtIQ,7qFtiPg0gdbk911_mj6LoA,806kkDGaRCJ4lZLRcEf-iw,8b5ll2kjXfjgFIqWsjkr8Q,8nDEOGVVvReXFJ2zjPh4Pw,91JwvxaCYhLWnAOWp_AmeQ,94BEy7wGKmrOC7sjSHMisw,9IpKGHaPy3hhXo9g5Bt7lg,9Jc3W0aR9Xf2gcHI0rEXsw,9_CGhHMz8698M9-PkVf0CQ,...,s9TzmT2kZmOZkTa53D2e3g,sTV4qyjDkY5cLvx2omtLyw,sf3kp0H13jZYEmS1A8Etcw,swtA5YoSGm0V5esQuDm2NQ,tJzf6H1dkuUbL-t8bzL3dw,tcfL9_Qe-12eX4rdmK_6XA,uCi2ropZuhG4XVcMDaurAw,uF86ZhygpBEGr3CudNemYA,uN2oZDJGO078ExbbV_DGmA,ugDCPgJUCRuNpHSPsMZwkw,v5YvCuj_b26JKAqDcnReFQ,vGAJwqvfJLoeHdaM-8eRng,vUqIE5La92aMJrY8kGgwsw,vnIdWzIB2W4j9cDcd6tHAQ,vyDW7_CHhDUJshMqaSjj0g,vyeQzjZFx6KoL2pJBQ2QFA,vz8HFGsITt6aj-nyXkAEFg,w0XjeV4WIUskQy8wwQf40w,wPiRP76arN0NB3M1ZXXp7g,wjW6YN9s-iDq7l9PcUsPZw,xNNAfZJkLZlAeS-I7-QwgA,xVEtGucSRLk5pxxN0t4i6g,xoGPeHR2RPnJW470-aYBUQ,xpJEBXTCQh5Ib8BJrjt9Bg,xw6G0T2VQZAtta6RcZoAOQ,yLiaMaJFq03JxXPk4puloQ,yaViddk9vxi-7p8DnjoClw,yfxDa8RFOvJPQh0rNtakHA,ym5FOqL6tGbgynzwKMHZuw,yuFdJdrnfMp3cfXVwTXjjA,yvqcZrHixuf_oIUW83xUcQ,yz2EimDE7bBvq6b4mDe_zg,yzmvsG3Vo-2_F3_0wHzJdA,z8oIoCT1cXz7gZP5GeU5OA,zRqi6L1u-YmmVAHjeUbGMQ,zaJkOJU0EtsUyjjP-z59ow,zgQHtqX0gqMw1nlBZl2VnQ,zkU-WMio8g6dpRJ2Y2xqvQ,zw9_mqWBn1QCfZg88w0Exg,zxJlg4XCHNoFy78WZPv89w
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
---1lKK3aKOuomHnwAkAow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
---udAKDsn0yQXmzbWQNSw,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
--2vR0DIsmQ6WfcSzKWigw,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
--66hzx80CeVZcrm4AKJtQ,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
--AujbGl6SYRaY8SFVNHXA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary of `vectorizer` in column order.

    scikit-learn 1.0 introduced ``get_feature_names_out`` and 1.2 removed the
    older ``get_feature_names``; use whichever this installation provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Initial factor matrices: the dense TF-IDF vectors, one row per user (P) and
# one row per business (Q), with the vocabulary terms as columns.
P = pd.DataFrame(userid_vectors.toarray(), index=userid_df.index,
                 columns=_feature_names(userid_vectorizer))
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_df.index,
                 columns=_feature_names(businessid_vectorizer))


In [None]:
# Preview the business factor matrix (TF-IDF weight per vocabulary term).
Q.head()

Unnamed: 0_level_0,!,+,-,00,1,10,100,11,12,15,2,20,24,25,3,30,4,5,50,6,7,8,9,99,:,;,a,able,about,absolutely,across,actually,add,added,after,afternoon,again,ago,all,almost,...,weekend,weird,well,went,west,what,whatever,when,which,while,white,whole,wife,will,window,wine,wings,wish,with,within,without,wonderful,work,working,world,worst,worth,would,wow,write,wrong,yeah,year,years,yelp,yes,yet,you,yum,yummy
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
-P8dGzSVhJi-5oZ-8U2y0w,0.159539,0.581547,0.082627,0.027626,0.011862,0.008622,0.0,0.01858,0.016776,0.003103,0.025777,0.006339,0.0,0.0,0.017608,0.006384,0.023641,0.022519,0.0,0.006759,0.023071,0.009994,0.003404,0.034856,0.031524,0.01126,0.00612,0.0,0.0,0.002955,0.006185,0.005533,0.009181,0.0,0.0,0.0,0.008074,0.008927,0.008158,0.011338,...,0.006808,0.013763,0.031416,0.031633,0.0,0.003082,0.0,0.003319,0.003226,0.003343,0.0,0.008592,0.0,0.018645,0.0,0.0,0.0,0.003018,0.0,0.0,0.011417,0.0,0.002825,0.0,0.003416,0.021421,0.04381,0.095868,0.0,0.0,0.008773,0.007323,0.009343,0.011576,0.003082,0.006014,0.005849,0.01689,0.0,0.0
0-yj2jtzLUHG2b7PpEHyog,0.213796,0.672422,0.138409,0.0,0.0,0.007703,0.004961,0.019919,0.0,0.004159,0.0,0.012743,0.0,0.0,0.011798,0.0,0.0,0.018861,0.004278,0.0,0.004417,0.0,0.0,0.00519,0.024643,0.018861,0.0,0.004115,0.0,0.0,0.012433,0.0,0.0,0.004354,0.022162,0.0,0.021639,0.0,0.010932,0.007597,...,0.0,0.004611,0.045608,0.042391,0.005542,0.0,0.009777,0.004448,0.0,0.0,0.0,0.007676,0.008739,0.010708,0.0,0.033386,0.0,0.004044,0.004339,0.004644,0.015299,0.013156,0.0,0.008289,0.0,0.0,0.018346,0.069444,0.0,0.0,0.007838,0.0,0.004173,0.003878,0.00413,0.012089,0.0,0.018861,0.014137,0.016176
0AQnRQw34IQW9-1gJkYnMA,0.364752,0.53938,0.102121,0.006208,0.010929,0.0093,0.004326,0.00501,0.008143,0.005579,0.009525,0.013961,0.001289,0.005968,0.013453,0.006886,0.007172,0.013157,0.00373,0.003949,0.004444,0.001497,0.003671,0.000696,0.037073,0.008603,0.009077,0.008833,0.004253,0.006906,0.004448,0.016662,0.004126,0.004089,0.004162,0.008723,0.022255,0.004815,0.022242,0.010701,...,0.00979,0.002474,0.042828,0.046442,0.0,0.006648,0.00623,0.00358,0.00464,0.003306,0.001425,0.007981,0.013776,0.013407,0.002883,0.000373,0.02773,0.007595,0.004074,0.005607,0.009749,0.002941,0.006601,0.003336,0.003685,0.015404,0.019197,0.089665,0.00309,0.003725,0.007886,0.00724,0.005599,0.007284,0.009972,0.00919,0.00736,0.013663,0.001264,0.007052
0H8PL4trSvZFYgPpvSOCjQ,0.110372,0.496673,0.193818,0.0,0.0,0.030819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031688,0.030185,0.0,0.0,0.0,0.035722,0.0,0.08306,0.0,0.0,0.032815,0.06586,0.0,0.0,0.0,0.0,0.032815,0.0,0.0,0.0,0.0,0.03191,0.029159,0.030395,...,0.0,0.0,0.0,0.056534,0.0,0.0,0.0,0.0,0.034595,0.0,0.0,0.0,0.0,0.028561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033162,0.0,0.0,0.058722,0.027784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032246,0.0,0.0,0.0,0.0
0NmTwqYEQiKErDv4a55obg,0.245636,0.700594,0.123491,0.00183,0.006141,0.005399,0.004636,0.001611,0.001778,0.003737,0.006483,0.002595,0.007137,0.00302,0.006502,0.006765,0.008113,0.016676,0.001384,0.004557,0.003333,0.003048,0.00459,0.000373,0.081355,0.012744,0.004864,0.013311,0.005046,0.017222,0.002383,0.013192,0.002948,0.007042,0.004301,0.000346,0.017887,0.00344,0.01624,0.013788,...,0.007049,0.00348,0.041988,0.023996,0.001195,0.004007,0.001757,0.003517,0.005749,0.002576,0.007022,0.008001,0.012721,0.008338,0.041125,0.070794,0.0,0.007267,0.005614,0.002504,0.00976,0.016706,0.004081,0.001043,0.008555,0.0028,0.017539,0.076996,0.009935,0.002495,0.007888,0.001411,0.00705,0.005157,0.010093,0.009269,0.008873,0.007999,0.00525,0.007122


In [None]:
def matrix_factorization(R, P, Q, steps=1, gamma=0.001, lamda=0.02):
    """Refine the factor matrices P and Q by SGD on the observed ratings.

    Only the cells of ``R`` that are greater than zero are treated as observed
    (NaN and non-positive cells are skipped). They are visited in row-major
    order, and for each one the user row of ``P`` is updated first, then the
    business row of ``Q`` is updated using the freshly updated user row.

    The earlier implementation looked at every user x business cell with
    ``.loc`` (twice per step). This version works on NumPy arrays and touches
    only the observed cells; the arithmetic and the update order are unchanged.

    Args:
        R: DataFrame of ratings; its index labels must exist in ``P.index`` and
            its column labels in ``Q.index`` for every observed cell.
        P: DataFrame of user factors (one row per user).
        Q: DataFrame of business factors (one row per business), with the same
            columns as ``P``.
        steps: Maximum number of passes over the observed ratings.
        gamma: Learning rate.
        lamda: L2 regularisation weight.

    Returns:
        ``(P, Q)`` - the same DataFrame objects, updated in place.

    Raises:
        ValueError: if ``P`` and ``Q`` do not have identical columns. (Label-
            aligned arithmetic on differing columns used to fill the factors
            with NaN silently.)
        KeyError: if an observed rating refers to a user missing from ``P`` or
            a business missing from ``Q``.
    """
    if not P.columns.equals(Q.columns):
        raise ValueError("P and Q must have identical columns (same latent features, same order)")

    ratings = R.to_numpy(dtype=float)
    with np.errstate(invalid="ignore"):  # NaN > 0 is simply False
        obs_rows, obs_cols = np.nonzero(ratings > 0)  # row-major order

    # Translate the labels of R into row positions of P and Q.
    p_pos = P.index.get_indexer(R.index)[obs_rows]
    q_pos = Q.index.get_indexer(R.columns)[obs_cols]
    if (p_pos < 0).any():
        raise KeyError("R contains rated users that are missing from P.index")
    if (q_pos < 0).any():
        raise KeyError("R contains rated businesses that are missing from Q.index")

    observed = list(zip(p_pos.tolist(), q_pos.tolist(),
                        ratings[obs_rows, obs_cols].tolist()))

    p_factors = P.to_numpy(dtype=float, copy=True)
    q_factors = Q.to_numpy(dtype=float, copy=True)

    for _ in range(steps):
        for i, j, rating in observed:
            eij = rating - np.dot(p_factors[i], q_factors[j])
            p_factors[i] = p_factors[i] + gamma * (eij * q_factors[j] - lamda * p_factors[i])
            # Uses the already-updated user row, as before.
            q_factors[j] = q_factors[j] + gamma * (eij * p_factors[i] - lamda * q_factors[j])

        # Regularised squared error over the observed ratings; only used as an
        # early-stopping criterion.
        e = 0
        for i, j, rating in observed:
            residual = rating - np.dot(p_factors[i], q_factors[j])
            e = e + pow(residual, 2) + lamda * (
                pow(np.linalg.norm(p_factors[i]), 2) + pow(np.linalg.norm(q_factors[j]), 2)
            )
        if e < 0.001:
            break

    # Write the results back so the caller's frames are updated in place.
    P.iloc[:, :] = p_factors
    Q.iloc[:, :] = q_factors
    return P, Q


In [None]:
%%time
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=1, gamma=0.001,lamda=0.02)

CPU times: user 16min 54s, sys: 41.5 s, total: 17min 35s
Wall time: 16min 46s


In [None]:
# Store P, Q and the user vectorizer in a pickle file.
import pickle

# The context manager closes the file even if one of the dumps raises.
# The three objects are written back to back, so they must be read in the same
# order: P, Q, userid_vectorizer.
with open('recommendation.pkl', 'wb') as output:
    pickle.dump(P, output)
    pickle.dump(Q, output)
    pickle.dump(userid_vectorizer, output)

## Prediction for input text

In [None]:
words = 'I am intrested in vegeterian restaruents and I want eat falafal oh my god'
test_df = pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(clean_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
feature_names = (userid_vectorizer.get_feature_names_out()
                 if hasattr(userid_vectorizer, 'get_feature_names_out')
                 else userid_vectorizer.get_feature_names())
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=feature_names)

# Score the query against every business (dot product with each row of Q) and
# show the three highest-scoring businesses.
predictItemRating = pd.DataFrame(np.dot(test_v_df.loc[0], Q.T), index=Q.index, columns=['Rating'])
topRecommendations = predictItemRating.sort_values('Rating', ascending=False).head(3)
topRecommendations


Unnamed: 0_level_0,Rating
business_id,Unnamed: 1_level_1
f4x1YBxkLrZg652xt2KR5g,0.303234
g8OnV26ywJlZpezdBnOWUQ,0.242146
XXW_OFaYQkkGOGniujZFHg,0.221566


## Prediction and Accuracy test on Validation set

In [None]:
# Reload the artefacts in the order they were dumped: P, Q, userid_vectorizer.
# The context manager makes sure the file handle is closed afterwards.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this notebook.
with open('recommendation.pkl', 'rb') as f:
    P = pickle.load(f)
    Q = pickle.load(f)
    userid_vectorizer = pickle.load(f)

In [None]:
# NOTE(review): this repeats the single-sentence example using `words` from the
# prediction section, and the next cell reassigns test_df, test_vectors and
# test_v_df, so the results computed here are not used afterwards.
test_df = pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(clean_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
feature_names = (userid_vectorizer.get_feature_names_out()
                 if hasattr(userid_vectorizer, 'get_feature_names_out')
                 else userid_vectorizer.get_feature_names())
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=feature_names)

In [None]:
# X_valid was split off yelp_data['text'] after that column had already been
# passed through clean_text, so it must not be cleaned a second time:
# clean_text is not idempotent (a second pass drops short tokens such as "!"
# and stop words produced by the contraction expansion, e.g. "not"), which
# would make the validation features differ from the ones the model was
# fitted on.
test_df = X_valid.to_frame()
test_vectors = userid_vectorizer.transform(test_df['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
feature_names = (userid_vectorizer.get_feature_names_out()
                 if hasattr(userid_vectorizer, 'get_feature_names_out')
                 else userid_vectorizer.get_feature_names())
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=feature_names)


In [None]:
# Score every validation review against every business with a single matrix
# product (rows: reviews, columns: businesses) and keep the best-scoring
# business per review. This replaces building and fully sorting a DataFrame
# for each row.
# Ties (e.g. a review with no in-vocabulary term, where every score is 0) now
# resolve to the first business in Q.index.
validation_scores = np.dot(test_v_df.to_numpy(), Q.to_numpy().T)
y_pred = Q.index[validation_scores.argmax(axis=1)].tolist()

In [None]:
# Share of validation reviews whose top-scoring business equals the business
# that was actually reviewed.
from sklearn.metrics import accuracy_score

validation_accuracy = accuracy_score(y_valid, y_pred)
print('Accuracy for validation set is: ', validation_accuracy)