In [None]:
# Import libraries here
# from sklearn import linear_model
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import re
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.colors as mcolors
from sklearn.manifold import TSNE
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [None]:
# Load the raw restaurant listing data.
dataset = pd.read_csv("RestoInfo.csv")

# Rename the unnamed CSV index column (used later as the row identifier) and
# the non-descriptive columns in a single pass.
dataset = dataset.rename(columns={
    "Unnamed: 0": "user_id",
    'approx_cost(for two people)': 'average_cost',
    'listed_in(type)': 'meal_type',
})

# Drop fully duplicated rows (the original author noted this dataset has none).
dataset = dataset.drop_duplicates()

# Null check, per the original author's note: roughly 54% of rows have no
# "dish_liked" value, so rows with NaN are kept rather than dropped.
# To inspect the percentages, run in its own cell:
#   ((dataset.isnull().sum() / dataset.shape[0]) * 100).round(2)

# Normalise restaurant names to title case; the .str accessor leaves missing
# names as NaN instead of raising.
dataset['name'] = dataset['name'].str.title()

# average_cost arrives as text with thousands separators (e.g. "1,200"):
# strip the commas and convert to float. Missing values become the string
# "nan" after astype('str') and parse back to NaN in the float conversion.
dataset['average_cost'] = (
    dataset['average_cost']
    .astype('str')
    .str.replace(',', '', regex=False)
    .astype('float')
)

# Impute missing costs with the column mean. fillna is used instead of
# replace(np.NaN, ...) because the np.NaN alias no longer exists in NumPy 2.x.
dataset['average_cost'] = dataset['average_cost'].fillna(dataset['average_cost'].mean())

In [None]:
# Sanity check: number of missing average_cost values still present.
dataset['average_cost'].isnull().sum()

In [None]:
# Store the cost as integers; the cast discards any fractional part.
dataset['average_cost'] =dataset['average_cost'].astype('int')

In [None]:
# Preview the first rows of the cleaned frame.
dataset.head()

In [None]:
import string
from nltk.corpus import stopwords

# English stop words with every punctuation character stripped from each entry
# (an entry such as "don't" would become "dont"), so they can be compared
# against text that has had its punctuation removed.
stop = [
    ''.join(letter for letter in entry if letter not in string.punctuation)
    for entry in stopwords.words('english')
]

In [None]:
def text_process(mess):
    """
    Clean a piece of text.

    1. Drops every character that appears in ``string.punctuation``.
    2. Splits the remainder on whitespace and discards each word whose
       lowercase form is in the module-level ``stop`` list.
    3. Returns the surviving words (original casing kept) joined by single
       spaces, i.e. a string.
    """
    # Rebuild the text without punctuation characters.
    stripped = ''.join(symbol for symbol in mess if symbol not in string.punctuation)

    # Keep only the words that are not stop words (case-insensitive check).
    kept_words = []
    for token in stripped.split():
        if token.lower() in stop:
            continue
        kept_words.append(token)
    return " ".join(kept_words)
# Run the same punctuation / stop-word cleaning over both free-text columns.
for text_column in ('reviews_list', 'name'):
    dataset[text_column] = dataset[text_column].apply(text_process)

In [None]:
# Inspect the review text after cleaning.
dataset['reviews_list']

In [None]:
# Inspect the restaurant names after cleaning.
dataset['name']

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that maps labels to integer codes.
le = LabelEncoder()

In [None]:
# Encode each categorical column with its own LabelEncoder. A single shared
# encoder is refit on every call and only remembers the last column, which
# makes it impossible to turn a restaurant_id back into a restaurant name.
# Each fitted encoder is kept in `label_encoders`, keyed by the encoded column,
# e.g. label_encoders['restaurant_id'].inverse_transform([...]).
label_encoders = {}
for encoded_column, source_column in (
    ('restaurant_id', 'name'),
    ('location', 'location'),
    ('cuisines', 'cuisines'),
):
    column_encoder = LabelEncoder()
    dataset[encoded_column] = column_encoder.fit_transform(dataset[source_column])
    label_encoders[encoded_column] = column_encoder

# Keep `le` bound to the cuisines encoder: that is the state a single shared
# encoder would have been left in after the three fits.
le = label_encoders['cuisines']

In [None]:
# List the available columns before narrowing the frame.
dataset.columns

In [None]:
# Keep only the identifier, encoded-feature and review-text columns; all other columns are dropped.
dataset = dataset[['user_id','restaurant_id','location','average_cost','cuisines','reviews_list']]

In [None]:
# Preview the narrowed frame.
dataset.head()

In [None]:
# Two (identifier, review text) views of the data: one keyed by user_id, one by restaurant_id.
user_id_df = dataset[['user_id','reviews_list']]
restaurant_id_df = dataset[['restaurant_id','reviews_list']]

In [None]:
# Concatenate all review text per user and per restaurant, and assign the
# results back. Without the assignment the aggregation is discarded and the
# frames stay row-level; after it they are indexed by user_id / restaurant_id,
# which is what label-based lookups on the factor matrices need.
user_id_df = user_id_df.groupby('user_id').agg({'reviews_list': ' '.join})
restaurant_id_df = restaurant_id_df.groupby('restaurant_id').agg({'reviews_list': ' '.join})

# Show a sample of the per-restaurant text instead of the whole frame.
restaurant_id_df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Both TF-IDF models use the same configuration: NLTK's WordPunctTokenizer for
# tokenization and a vocabulary capped at 5000 features.
tfidf_settings = {
    'tokenizer': WordPunctTokenizer().tokenize,
    'max_features': 5000,
}

# TF-IDF model fitted on the user-side review text
userid_vectorizer = TfidfVectorizer(**tfidf_settings)
userid_vectors = userid_vectorizer.fit_transform(user_id_df['reviews_list'])

# TF-IDF model fitted on the restaurant-side review text
resturant_vectorizer = TfidfVectorizer(**tfidf_settings)
rest_vectors = resturant_vectorizer.fit_transform(restaurant_id_df['reviews_list'])

In [None]:
def _vocabulary_terms(vectorizer):
    """Return a fitted vectorizer's feature names on both old and new scikit-learn.

    `get_feature_names()` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out()`; prefer the new accessor and fall back to the old
    one only when the new one is missing.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# Dense TF-IDF matrices used as the initial factors: one row per entry of the
# corresponding *_df frame, one column per vocabulary term.
P = pd.DataFrame(userid_vectors.toarray(), index=user_id_df.index, columns=_vocabulary_terms(userid_vectorizer))
Q = pd.DataFrame(rest_vectors.toarray(), index=restaurant_id_df.index, columns=_vocabulary_terms(resturant_vectorizer))



In [None]:
# Preview the user-side TF-IDF factor matrix.
P.head()

In [None]:
# Preview the restaurant-side TF-IDF factor matrix.
Q.head()

In [None]:
# Pivot to a user_id x restaurant_id matrix (the comma between `values` and
# `index` was missing, which made this cell a SyntaxError).
# NOTE(review): with three value columns the resulting columns are a MultiIndex
# of (value column, restaurant_id). matrix_factorization looks each column label
# up in Q.index, so a single value column is probably what is needed here -
# confirm which field is meant to act as the "rating".
userid_rating_matrix = pd.pivot_table(
    dataset,
    values=['location', 'average_cost', 'cuisines'],
    index=['user_id'],
    columns=['restaurant_id'],
)
userid_rating_matrix.shape

In [None]:
# Preview the pivoted user x restaurant matrix.
userid_rating_matrix.head()

In [None]:
def matrix_factorization(R,P,Q,steps=100,gamma=0.001,lamda=0.02):
    """Refine the factor matrices P and Q by stochastic gradient descent on R.

    Parameters
    ----------
    R : DataFrame
        Rating matrix. Only entries strictly greater than 0 are treated as
        observed; NaN and non-positive entries are skipped.
    P : DataFrame
        Row factors. Its index must contain every row label of R that has an
        observed rating.
    Q : DataFrame
        Column factors. Its index must contain every column label of R that
        has an observed rating. P and Q must have the same number of columns.
    steps : int
        Maximum number of passes over the observed ratings.
    gamma : float
        Learning rate.
    lamda : float
        L2 regularisation weight.

    Returns
    -------
    (P, Q) : the same DataFrame objects that were passed in, updated in place.

    Raises
    ------
    KeyError
        If a label with an observed rating is missing from P.index / Q.index.

    Notes
    -----
    All arithmetic is positional (plain NumPy arrays). Doing the updates with
    pandas Series aligns them on column labels, which fills the factors with
    NaN whenever P and Q carry different column labels (e.g. two separately
    fitted vocabularies), while the dot product is positional anyway.
    """
    ratings = R.to_numpy(dtype=float)
    # np.nonzero walks the mask row by row, i.e. in the same order as nested
    # loops over R.index then R.columns. `NaN > 0` is False, so gaps are skipped.
    obs_rows, obs_cols = np.nonzero(ratings > 0)
    if obs_rows.size == 0 or steps <= 0:
        # Nothing to learn from: leave P and Q untouched.
        return P, Q

    # Translate R's labels into row positions inside P and Q.
    user_pos = P.index.get_indexer(R.index)[obs_rows]
    item_pos = Q.index.get_indexer(R.columns)[obs_cols]
    if (user_pos < 0).any() or (item_pos < 0).any():
        raise KeyError("every rated row/column label of R must be present in P.index / Q.index")

    obs_values = ratings[obs_rows, obs_cols]
    # Work on private copies so a failure mid-way cannot leave P/Q half-updated.
    user_factors = P.to_numpy(dtype=float, copy=True)
    item_factors = Q.to_numpy(dtype=float, copy=True)

    for _ in range(steps):
        for rating, u, v in zip(obs_values, user_pos, item_pos):
            err = rating - np.dot(user_factors[u], item_factors[v])
            user_factors[u] = user_factors[u] + gamma * (err * item_factors[v] - lamda * user_factors[u])
            # The item update deliberately uses the freshly updated user row.
            item_factors[v] = item_factors[v] + gamma * (err * user_factors[u] - lamda * item_factors[v])

        # Regularised squared error over the observed ratings.
        e = 0
        for rating, u, v in zip(obs_values, user_pos, item_pos):
            residual = rating - np.dot(user_factors[u], item_factors[v])
            e = e + residual ** 2 + lamda * (
                np.linalg.norm(user_factors[u]) ** 2 + np.linalg.norm(item_factors[v]) ** 2
            )
        if e<0.001:
            break

    # Write the learned factors back into the caller's frames.
    P.iloc[:, :] = user_factors
    Q.iloc[:, :] = item_factors
    return P,Q

In [None]:
%%time
# Run the factorization and rebind P and Q to the returned factor matrices.
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=100, gamma=0.001,lamda=0.02)

In [None]:
# Free-text query to recommend restaurants for.
sentence = 'good ambiance restaurants, serving fish'

# Clean the query with the same function used on the training text
# (text_process), and vectorize the column that actually exists.
test_df = pd.DataFrame([sentence], columns=['reviews_list'])
test_df['reviews_list'] = test_df['reviews_list'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['reviews_list'])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old accessor on older releases.
vocabulary_getter = getattr(userid_vectorizer, 'get_feature_names_out', None)
if vocabulary_getter is None:
    vocabulary_getter = userid_vectorizer.get_feature_names
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=list(vocabulary_getter()))

# Score every row of Q against the query vector. The product is positional and
# yields exactly one score per row of Q, so the result has a single column.
item_scores = np.dot(Q.to_numpy(), test_v_df.loc[0].to_numpy())
predict_item_rating = pd.DataFrame(item_scores, index=Q.index, columns=['predicted_score'])

# Keep the three highest-scoring entries and save them.
top_recommendations = predict_item_rating.sort_values('predicted_score', ascending=False).head(3)
top_recommendations.to_csv('/code/top3_recommendations.csv')