# Streamlit App Demo as a Python Notebook

In [3]:
# imports libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

# standard sklearn imports
from sklearn.model_selection import train_test_split, GridSearchCV

# tensorflow imports for Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Conv2D, MaxPooling2D, GRU, LSTM, Embedding, Bidirectional
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

# Import regularizers
from tensorflow.keras.regularizers import l2
# Import Dropout
from tensorflow.keras.layers import Dropout

from tensorflow.keras.utils import to_categorical, plot_model

# imports for reports on classification
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score


# NLP imports 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
# nltk.download() # --> Download all, and then restart jupyter lab
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and newer releases no longer
# accept the old name, so prefer the new name and fall back to 'seaborn' on older installs
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

## Part 1: Load the Model and Make a Prediction

### Loads the saved model

In [4]:
# Loads the saved Keras model from 'saved_model/my_model'
# (a relative path, so it is resolved against the notebook's working directory)
model = tf.keras.models.load_model('saved_model/my_model')

### Collects and cleans the user's text input

In [5]:
def clean_text_stem(text, custom_stopwords=None):
    """Cleans text by keeping words only, tokenizing, removing stopwords and stemming.

    Args:
        text: Raw text to clean; it is lower-cased before tokenizing.
        custom_stopwords: Optional iterable of extra (lower-case) words to drop
            in addition to NLTK's English stopword list.

    Returns:
        A single string of the stemmed, non-stopword tokens joined by spaces.
    """
    # Raw string so that "\w" reaches the regex engine instead of being an invalid string escape
    re_tokenizer = RegexpTokenizer(r"\w+")
    p_stemmer = PorterStemmer()

    # Tokenize the lower-cased text (so every token is already lower-case)
    words = re_tokenizer.tokenize(text.lower())

    # A set gives constant-time membership checks while filtering
    stopwords_set = set(stopwords.words('english'))

    # Adds custom stopwords, if any were supplied
    if custom_stopwords:
        stopwords_set.update(custom_stopwords)

    # Stopwords are filtered on the unstemmed token, then the survivors are stemmed
    no_stops_stemmed = [p_stemmer.stem(word) for word in words if word not in stopwords_set]

    return ' '.join(no_stops_stemmed)

In [6]:
# Free-text description from the user; it is cleaned with clean_text_stem and then fed to the model
user_text = input('Describe the restaurant experience you want: ')

Describe the restaurant experience you want:  I would like a place that serves crispy pork belly, wine, hot towels, a drink out of a pineapple and has outdoor seating with a sea breeze and a koi pond.


In [7]:
# Cleans the user's description (lower-casing, stopword removal, stemming) and displays the result
stemmed_text = clean_text_stem(user_text)
stemmed_text

'would like place serv crispi pork belli wine hot towel drink pineappl outdoor seat sea breez koi pond'

In [8]:
# Example inputs kept for reference:
#   already stemmed: 'want indoor dine full servic waiter staff 3 cours meal complementari coconut water'
#   raw user text:   I would like a place that serves crispy pork belly, wine, hot towels, a drink out of a pineapple and has outdoor seating with a sea breeze and a koi pond.

### Transforms text for model input

In [9]:
# Loads the training text that the tokenizer is fitted on.
# Only the column axis is squeezed: a bare .squeeze() would also collapse a one-row file to a scalar,
# which has no .values and cannot be iterated document by document.
X_train = pd.read_csv('../../Data/X_train_to_tokenize.csv', index_col=0).squeeze("columns")


# import the Counter function
from collections import Counter

# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

# Creates a function that counts unique words
def counter_word(text):
    """Tallies how often each whitespace-separated word occurs across all documents.

    Args:
        text: A pandas Series (anything exposing ``.values``) of strings.

    Returns:
        A ``Counter`` mapping each word to its total number of occurrences.
    """
    return Counter(token for document in text.values for token in document.split())

# Counts how many times each unique word appears in the training text
counter = counter_word(X_train)

# Vocabulary size handed to the tokenizer: the number of distinct words
# NOTE(review): Keras' Tokenizer only keeps word indices below num_words, so the rarest word may be
# left out — presumably this mirrors how the model was trained; confirm before changing it
num_words = len(counter)

# Max number of words in a sequence
max_length = 50

# Fits the tokenizer on the training sentences so the user's words map to the same indices
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Padding helper
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Converts the single cleaned user text to an index sequence, then pads/truncates it
# at the end ('post') to exactly max_length entries
user_input_sequences = tokenizer.texts_to_sequences([stemmed_text])
user_input_padded = pad_sequences(
    user_input_sequences, maxlen=max_length, padding='post', truncating='post'
)

### Generates a prediction

In [10]:
# Generates a prediction for the padded user sequence; the result has one row per input
# and one score per class
pred_probs = model.predict(user_input_padded)

In [11]:
pred_probs

array([[0.04607087, 0.37488398, 0.07214868, 0.50689644]], dtype=float32)

In [12]:
# Converts the probabilities to a 0/1 array that marks the most probable class.
# argmax returns a single index, so exactly one class is selected even when two probabilities tie
best_class = int(np.argmax(pred_probs[0]))
max_prob = pred_probs[0, best_class]
preds = np.zeros(pred_probs.shape[1], dtype=int)
preds[best_class] = 1
preds

array([0, 0, 0, 1])

In [13]:
# Maps each class index (position in the model output) to its class name
class_dict = dict(enumerate([
    "Neither casual nor classy",
    "Casual",
    "Classy",
    "Casual and classy",
]))

In [14]:
# Converts the 0/1 prediction array to class names
predicted_idxs = [idx for idx, num in enumerate(preds) if num == 1]

# Fail loudly instead of leaving Ambience undefined (or stale from an earlier run)
if not predicted_idxs:
    raise ValueError("No class is marked in preds, so no ambience can be derived.")

finalPrediction = [class_dict[idx] for idx in predicted_idxs]

# Ambience is the index of the first selected class, so it always agrees with finalPrediction[0]
Ambience = [predicted_idxs[0]]

In [15]:
# Shows the predicted class name on one line and the class-index list on the next
print(finalPrediction[0], Ambience, sep='\n')

Casual and classy
[3]


In [16]:
# Changes the probabilities to binary
# preds = (pred_probs > 0.5).astype(int) 

#### Gathers the rest of the user inputs

In [17]:
# Expects 'Yes' or 'No' — the answer is looked up in response_dict when the feature array is built
GoodForKids = input('Kid-friendly: ')

Kid-friendly:  Yes


In [18]:
# Expects 'Yes' or 'No' — the answer is looked up in response_dict when the feature array is built
GoodForGroups = input('Group-friendly: ')

Group-friendly:  Yes


In [19]:
# Expects 'Yes' or 'No' — the answer is looked up in response_dict when the feature array is built
OutdoorSeating = input('Outdoor Seating: ')

Outdoor Seating:  Yes


In [20]:
# Expects 'Yes' or 'No' — the answer is looked up in response_dict when the feature array is built
Reservations = input('Takes Reservations: ')

Takes Reservations:  No


In [21]:
# Expects 'Yes' or 'No' — the answer is looked up in response_dict when the feature array is built
HasAlcohol = input('Alcohol/Bar/Wine: ')

Alcohol/Bar/Wine:  Yes


In [22]:
# Expects 'Yes' or 'No' — the answer is looked up in response_dict when the feature array is built
TableService = input('Table Service: ')

Table Service:  No


In [23]:
# Expects 'Lunch', 'Dinner', 'Both' or 'Other' — each maps to a two-value encoding in response_dict
MealType = input('Lunch or Dinner or Both or Other: ')

Lunch or Dinner or Both or Other:  Dinner


In [24]:
Ambience

[3]

In [25]:
# Creates a list of all the answers, in the same order as the recommender table's feature columns
# (kids, groups, outdoor seating, reservations, alcohol, table service, then meal type),
# because the encoded values are later labelled with rec_df.columns purely by position
features = [GoodForKids, GoodForGroups, OutdoorSeating, Reservations, HasAlcohol, TableService, MealType]

In [26]:
# Maps each accepted answer to its encoded value(s).
# Meal answers encode two flags in the order [dinner, lunch], matching the
# meal_dinner / meal_lunch column order of the recommender table.
response_dict = {
    'Yes': [1],
    'No': [0],
    'Lunch': [0, 1],
    'Dinner': [1, 0],
    'Both': [1, 1],
    'Other': [0, 0]
}

# Creates the array of features
feature_array = []
for feature_input in features:
    # Normalise whitespace and casing so answers such as ' yes' or 'DINNER' are still recognised
    answer = feature_input.strip().capitalize()
    if answer not in response_dict:
        raise ValueError(
            f"Unrecognised answer {feature_input!r}; expected one of {list(response_dict)}"
        )
    feature_array += response_dict[answer]

# The predicted ambience class index goes last
feature_array = feature_array + Ambience
feature_array

[1, 1, 1, 0, 1, 0, 1, 0, 3]

In [27]:
pd.DataFrame([feature_array])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,1,1,0,1,0,1,0,3


## Part 2: Recommendation System

In [28]:
# imports 
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

### Loads in the Recommendation Table and Business Id to Names Table

In [29]:
# Loads the lookup table that matches business ids to business names, kept in case it is needed
# (none of the cells visible after this one reference it)
bid_name_df = pd.read_csv('../../Data/busid_to_name.csv')
bid_name_df.head()

Unnamed: 0,business_id,name
0,5eq56X-e9YbAKmHgUOyXSg,Hunan Lion
1,Fl_7YDOMt58rTNHa7p2G4w,Waffle House
2,vecuat0jOia-CJveW3ngDw,Schmidt's Sausage Haus
3,ZaMMUcOIngBCgxuKxFzfqg,McDonald's
4,FrFDIPBzRbFVfanxG4wcNQ,Exotic Latin Grill


In [30]:
# Loads the recommendation table: one row per restaurant, with an 'identifier' column of the form
# '<business_id>|<name>' followed by the attribute, meal and ambience feature columns
rec_df = pd.read_csv('../../Data/recommender_table.csv')
rec_df.head()

Unnamed: 0,identifier,att_GoodForKids,att_RestaurantsGoodForGroups,att_OutdoorSeating,att_RestaurantsReservations,att_HasAlcohol,att_RestaurantsTableService,meal_dinner,meal_lunch,amb_target
0,5eq56X-e9YbAKmHgUOyXSg|Hunan Lion,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0
1,Fl_7YDOMt58rTNHa7p2G4w|Waffle House,1.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0
2,vecuat0jOia-CJveW3ngDw|Schmidt's Sausage Haus,1.0,1.0,,0.0,1.0,,1.0,1.0,1.0
3,ZaMMUcOIngBCgxuKxFzfqg|McDonald's,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,FrFDIPBzRbFVfanxG4wcNQ|Exotic Latin Grill,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


In [31]:
# Replaces the combined 'identifier' column ('<business_id>|<name>') with a 'name' index.
# The guard makes the cell safe to re-run: once 'identifier' is gone there is nothing left to do.
if 'identifier' in rec_df.columns:
    # n=1 splits on the first "|" only, so the result always has exactly two parts
    # (0 = business id, 1 = name) and a name that itself contains "|" is kept whole
    id_parts = rec_df['identifier'].str.split("|", n=1, expand=True)

    # The business id is not kept; only the name survives, as the index
    rec_df = (
        rec_df
        .assign(name=id_parts[1])
        .drop(columns='identifier')
        .set_index('name')
    )

In [32]:
rec_df

Unnamed: 0_level_0,att_GoodForKids,att_RestaurantsGoodForGroups,att_OutdoorSeating,att_RestaurantsReservations,att_HasAlcohol,att_RestaurantsTableService,meal_dinner,meal_lunch,amb_target
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hunan Lion,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0
Waffle House,1.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0
Schmidt's Sausage Haus,1.0,1.0,,0.0,1.0,,1.0,1.0,1.0
McDonald's,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
Exotic Latin Grill,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
Panera Bread,1.0,1.0,1.0,0.0,0.0,,1.0,1.0,1.0
Harvest Kitchen and Pizzeria - Bexley,1.0,1.0,1.0,0.0,,1.0,1.0,1.0,
Das KaffeeHaus von Frau Burkhart,1.0,,1.0,,,,,,
Bistro 2110 At the Blackwell,1.0,1.0,0.0,1.0,1.0,,1.0,1.0,0.0


In [33]:
rec_df.shape

(3331, 9)

### Adds the user's feature array to the dataframe and calculates cosine distances

In [34]:
# Wraps the user's encoded answers in a one-row frame labelled 'User Input'.
# Values are matched to rec_df's columns purely by position, so feature_array must follow that column order.
feature_array_df = pd.DataFrame([feature_array], columns=rec_df.columns, index=['User Input'])

In [35]:
# Appends the user's row after all the restaurants so it can be compared against each of them
rec_df_sample = pd.concat([rec_df, feature_array_df])

In [36]:
rec_df_sample.tail()

Unnamed: 0,att_GoodForKids,att_RestaurantsGoodForGroups,att_OutdoorSeating,att_RestaurantsReservations,att_HasAlcohol,att_RestaurantsTableService,meal_dinner,meal_lunch,amb_target
Harvest Kitchen and Pizzeria - Bexley,1.0,1.0,1.0,0.0,,1.0,1.0,1.0,
Das KaffeeHaus von Frau Burkhart,1.0,,1.0,,,,,,
Bistro 2110 At the Blackwell,1.0,1.0,0.0,1.0,1.0,,1.0,1.0,0.0
Scramblers,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
User Input,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0


In [37]:
# Creates the sparse matrix; missing feature values are replaced with -1 first
# NOTE(review): -1 is an ordinary value to the cosine metric, so two rows that are both missing a
# feature look alike on it — confirm this is the intended treatment of unknowns
sparse_df = sparse.csr_matrix(rec_df_sample.fillna(-1))

In [38]:
# Returns the cosine distance between every pair of rows (all restaurants plus the user's row),
# so the result is square: # rows = # cols = number of rows in sparse_df
recommender = cosine_distances(sparse_df) 

# cosine distances (smaller = more similar)

In [39]:
# Checks the length
len(recommender)

3332

In [40]:
# Labels the rows and columns of the distance matrix with the restaurant names (plus 'User Input')
recommender_df = pd.DataFrame(recommender, index=rec_df_sample.index, columns=rec_df_sample.index)
recommender_df.head()

Unnamed: 0,Hunan Lion,Waffle House,Schmidt's Sausage Haus,McDonald's,Exotic Latin Grill,White Castle,Pablo's Havana Cafe,Spicy Cup Cafe & Bakery,Red Robin Gourmet Burgers and Brews,Fresh Start Cafe and Bakery,...,Starbucks,Moe's Southwest Grill,Charleys Philly Steaks & Wings,McDonald's.1,Panera Bread,Harvest Kitchen and Pizzeria - Bexley,Das KaffeeHaus von Frau Burkhart,Bistro 2110 At the Blackwell,Scramblers,User Input
Hunan Lion,0.0,0.698489,0.360398,0.477767,0.261451,0.698489,0.59548,0.477767,0.23723,0.396977,...,1.904534,1.0,1.150756,0.849244,0.430197,0.786799,1.703526,0.430197,0.19096,0.194177
Waffle House,0.698489,0.0,0.292893,0.42265,0.387628,0.666667,1.223607,0.711325,1.0,0.5,...,1.333333,0.387628,0.5,0.25,0.244071,0.646447,1.0,0.244071,0.552786,0.732739
Schmidt's Sausage Haus,0.360398,0.292893,0.0,0.387628,0.42265,0.646447,1.158114,0.795876,0.552786,0.646447,...,1.707107,1.0,0.646447,0.646447,0.331847,1.0,1.471405,0.198216,0.525658,0.433053
McDonald's,0.477767,0.42265,0.387628,0.0,0.292893,1.19245,0.741801,0.666667,0.817426,0.711325,...,1.57735,0.764298,1.0,0.711325,0.345346,0.387628,1.57735,0.345346,0.483602,0.691393
Exotic Latin Grill,0.261451,0.387628,0.42265,0.292893,0.0,0.727834,0.634852,0.528595,0.483602,0.183503,...,1.544331,0.666667,1.0,0.387628,0.07418,0.42265,1.272166,0.382787,0.269703,0.236237


#### Extra analysis with cosine similarities: 

In [41]:
# Returns the cosine similarities (larger = more similar) between the first row of the table
# and every row. Only that one row is requested, so the full n x n similarity matrix is never built.
cosine_similarity(sparse_df[:1], sparse_df)[0]

# cosine_similarity = 1.0 - cosine_distance

array([1.        , 0.30151134, 0.63960215, ..., 0.56980288, 0.80903983,
       0.80582296])

### Retrieves the top recommendations for the user input

In [42]:
# q = 'User Input'
# titles = rec_df_sample[rec_df_sample.index.str.contains(q)].index

# for title in titles:
#     print(title)
#     print(recommender_df[title].sort_values()[1:11])
#     print("*"*30)
#     print()

In [43]:
title = 'User Input'

# Distances from the user's row to every restaurant. The user's own entry is removed by label rather
# than by position: a restaurant with identical features is also at distance 0 and could sort ahead
# of it, in which case slicing off the first entry would discard a real match and keep 'User Input'.
user_distances = recommender_df[title].drop(labels=title)

# The 10 closest restaurants (smallest cosine distance first), shown as a one-column table
pd.DataFrame({'Restaurant Recommendations': user_distances.sort_values().head(10).index})

Unnamed: 0,Restaurant Recommendations
0,Caffé DaVinci
1,Brassica
2,Northstar Cafe
3,Panini Opa
4,Sweet Carrot
5,Mohio Pizza
6,La Tavola
7,Aab India Restaurant
8,Condado Tacos
9,Pat and Gracie's


In [44]:
feature_array_df

Unnamed: 0,att_GoodForKids,att_RestaurantsGoodForGroups,att_OutdoorSeating,att_RestaurantsReservations,att_HasAlcohol,att_RestaurantsTableService,meal_dinner,meal_lunch,amb_target
User Input,1,1,1,0,1,0,1,0,3
