### Created By : Ashwini Kumar
### Dated : 14th Oct 2020
#### Objective : The idea of this project is to check the feasibility of using recipes to identify similar food items. We will use sentence embeddings to get an embedding for each complete recipe and then calculate the cosine similarity between them.
#### Also, we will try to use community detection algorithms to identify communities of recipes, which may be useful for a lot of other purposes

#### Data Source : The data comes from food.com, uploaded on Kaggle, and consists of a recipes CSV and a recipe-ratings CSV


#### Import all the packages required for building these algorithms

In [None]:
pip install sentence-transformers

In [None]:
import pandas as pd
import os 
import ast
import sentence_transformers  #### Package used to encode recipes with a pretrained embedding model
import matplotlib.pyplot as plt 
import networkx as nx #### NetworkX is used for the graph-based algorithms
import pickle ### pickle is used to save intermediate results for later access
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity ### Cosine similarity
from scipy import sparse ### Sparse matrix
model = SentenceTransformer('bert-large-nli-stsb-mean-tokens') ### Pretrained sentence-embedding model used to encode the recipe texts

In [None]:
### Read the user/recipe interactions CSV
interactions = pd.read_csv('../input/food-com-recipes-and-user-interactions/RAW_interactions.csv')

# Quick overview of the frame: shape, column names and the first five rows.
print("Lets look at the basic stats about the data")
print("The shape of the data is")
print(interactions.shape)
print(" The columns in the data are as follows")
print(interactions.columns)
# head(5) shows rows, so the label says "rows" rather than "columns".
print(" The first few rows are ")
print(interactions.head(5))

In [None]:
# Count how many interactions carry each rating value.
interactions['rating'].value_counts()

In [None]:
### Interestingly, most of the recipes are rated at the highest rating.
# Average the ratings per recipe, then plot the distribution of those averages.
avg_rating_per_recipe = interactions.groupby('recipe_id')['rating'].mean().reset_index()
avg_rating_per_recipe['rating'].plot(
    kind='hist',
    title='Histograms of Avg Rating Recipe',
)
plt.xlabel("Average Ratings")
plt.ylabel("Number of recipes")
plt.show()



### We will restrict our analysis to those recipes which have been reviewed by more than 2 people
### Analysis has shown that most recipes are only added but never seen

In [None]:
### For the POC we restrict the data to recipes with enough reviewers
# Per-recipe summary: mean rating and number of distinct reviewing users.
# Named aggregation produces flat, explicitly named columns directly, so no
# MultiIndex flattening (or positional renaming) is needed afterwards.
int_summary = (
    interactions.groupby('recipe_id')
    .agg(
        rating_mean=('rating', 'mean'),
        user_id_nunique=('user_id', 'nunique'),
    )
    .reset_index()
)
### Keep only those recipes which have been reviewed by more than 2 distinct users
int_summary_94k = int_summary[int_summary['user_id_nunique'] > 2]

### Read the recipes data and keep only data for recipes which are reviewed by more than 2 people

In [None]:
# Load the raw recipes table and list its columns.
recipes = pd.read_csv('../input/food-com-recipes-and-user-interactions/RAW_recipes.csv')
print (recipes.columns)

### Do the inner join with subset data

In [None]:
# Inner join: keep only recipes whose id appears in the reviewed-recipe summary.
filter_recipe = recipes.merge(
    int_summary_94k,
    how='inner',
    left_on='id',
    right_on='recipe_id',
)

In [None]:
# Display the merged frame as the cell output.
filter_recipe

### The steps of each recipe are stored as a list; combine them into a single text

In [None]:
### Each 'steps' value is the string form of a list; parse it and join the steps into one text
def _join_steps(raw_steps):
    """Parse the stringified list of steps and join them with single spaces."""
    return " ".join(ast.literal_eval(raw_steps))


filter_recipe['dish_recipe'] = filter_recipe['steps'].apply(_join_steps)

In [None]:
# ## Encode the recipe texts and store the embeddings in a pickle file
# encode() is given a plain list of strings rather than a pandas Series, so the
# call does not depend on how the library indexes its input.
encodings_recipe = model.encode(filter_recipe['dish_recipe'].tolist())
# The context manager guarantees the file is flushed and closed after the dump.
with open("recipe_embedding.pickle", 'wb') as embedding_file:
    pickle.dump(encodings_recipe, embedding_file)

In [None]:
#### Load the pickled embeddings and create a dataframe out of them
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open("recipe_embedding.pickle", 'rb') as embedding_file:
    encodings_recipe_df = pickle.load(embedding_file)
print ("Encoding are loaded")
data_encoding = pd.DataFrame(encodings_recipe_df)

In [None]:
### As we don't have enough memory, build the similarity input from only the first 10000 recipe embeddings
encoding_sparse = sparse.csr_matrix(encodings_recipe_df[0:10000])

In [None]:
import datetime

# Time the pairwise cosine-similarity computation over the selected embeddings.
start_time = datetime.datetime.now()
print(start_time)
# NOTE(review): despite the name, the result is wrapped directly in a
# DataFrame later, so it is presumably a dense array - confirm.
similarities_sparse = cosine_similarity(encoding_sparse)
# Label and elapsed time go on one line (the old trailing comma was a
# Python 2 idiom that has no effect in Python 3).
print("Time taken is :", datetime.datetime.now() - start_time)

In [None]:
### Convert the similarity matrix to a dataframe and dump it to a pickle file
df1 = pd.DataFrame(similarities_sparse)
# The context manager guarantees the file is flushed and closed after the dump.
with open('similarities_sparse.pickle', 'wb') as similarity_file:
    pickle.dump(df1, similarity_file)

In [None]:
# Reshape the square similarity matrix to long format: one row per pair of
# matrix labels together with the similarity value.
data_similarity = df1.unstack().reset_index() 

In [None]:
# Name the long-format columns: the two recipe positions and their score.
data_similarity.columns = ['recipe1','recipe2','cosine_similarity']

## Filter out scores that are too high (a recipe's cosine similarity with itself) and scores that are too low

In [None]:
# Keep pairs whose score lies strictly between 0.6 and 0.9999: the upper bound
# drops self-matches, the lower bound drops weakly related pairs.
pair_score = data_similarity['cosine_similarity']
in_band = (pair_score < 0.9999) & (pair_score > 0.6)
data_similarity = data_similarity[in_band]
print(data_similarity.shape)

### Create a hash map (dictionary) from row position to recipe name

In [None]:
# Map each row position in filter_recipe to the recipe name found there.
recipe_dict = dict(enumerate(filter_recipe['name']))
print("Dictionary is created :")
    

In [None]:
# Translate both recipe positions of every pair into recipe names.
for side in ('recipe1', 'recipe2'):
    data_similarity[side + '_name'] = data_similarity[side].map(recipe_dict)

In [None]:
# Preview the first five recipe pairs.
data_similarity.head(5)

## Rank products based on similarity score

In [None]:
# Dense-rank each recipe's matches by similarity; rank 1 is the most similar.
scores_by_recipe = data_similarity.groupby(['recipe1'])['cosine_similarity']
data_similarity['similarity_rank'] = scores_by_recipe.rank(method="dense", ascending=False)

In [None]:
# Keep the five best ranks per recipe. reset_index() renumbers the rows from 0
# (the old index is kept as an 'index' column); the graph-building cell later
# in this notebook looks rows up by these integer labels.
data_similarity = data_similarity[data_similarity['similarity_rank'] <= 5].reset_index()

### Create an algorithm for finding similar dishes based on recipes

In [None]:
def find_similar_dishes(list_names, top_n=4, data=None):
    """Print the most similar dishes for every dish name in ``list_names``.

    Args:
        list_names: Iterable of dish names, matched against the
            ``recipe1_name`` column.
        top_n: Maximum number of distinct similar dishes printed per input
            dish. Defaults to 4.
        data: DataFrame with ``recipe1_name``, ``recipe2_name`` and
            ``similarity_rank`` columns. Defaults to the notebook-level
            ``data_similarity`` frame.

    Returns:
        None. The recommendations are written to stdout.
    """
    if data is None:
        data = data_similarity
    for liked_dish in list_names:
        # sort_values returns a new frame, so the filtered slice is never
        # sorted in place (which raised SettingWithCopyWarning before).
        matches = data[data['recipe1_name'] == liked_dish].sort_values(by=['similarity_rank'])
        print("As you liked dish :", liked_dish)
        print(f"You must try following {top_n} dishes with slight variations")
        # unique() keeps first-seen order, i.e. best rank first.
        for similar_dish in matches['recipe2_name'].unique()[:top_n]:
            print("             ", similar_dish)

### Let's look at the results

Algorithm 1 : Given a recipe, return the top 4 similar recipes
In this function we use the cosine similarity between the input recipe's vector and all other recipe vectors and return the top 4 recipes with the highest cosine similarity. It is interesting to note that this gives us similar recipes without any transaction data.

Also, the similarity shows up at various levels, as you can see in the examples below.

Example 1 : You get products that are similar based on ingredients but also based on type, i.e. desserts

Example 2 : It returns recipes which are similar because they follow the same steps

Example 3 : It returns burgers with different preparations and ingredients

Example 4 : It returns products which have a similar main ingredient, i.e. potato, but different prep strategies

Example 5 : For the taco salad, you get all Mexican recipes, maybe because they have a similar preparation strategy

In [None]:
import warnings

# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')

# Example dishes to look up recommendations for.
liked_dishes = [
    'banana walnut cake',
    'aaloo mattar   indian style peas and potatoes',
    'avocado ranch burgers with smoked cheddar',
    'bird s perfect baked potatoes',
    'bird s ultimate taco salad',
]
find_similar_dishes(liked_dishes)



### Create Community of recipes based on Similarity Score

In [None]:
# Build an unweighted graph with one edge per similar recipe pair.
# NOTE(review): the next cell rebinds G to a fresh nx.Graph(), so this graph
# is discarded before it is used.
G = nx.from_pandas_edgelist(data_similarity,'recipe1_name','recipe2_name')

In [None]:
data_sample = data_similarity
import matplotlib.pyplot as plt
import networkx as nx

plt.figure(figsize=(250,250))
plt.rcParams['axes.facecolor'] ='white'

# Build a weighted graph from at most the first 5000 recipe pairs. Taking
# head(5000) instead of looking up labels 0..4999 one by one avoids a KeyError
# when fewer than 5000 pairs survive the filtering, and does not depend on the
# frame having a 0..n-1 index.
G = nx.Graph()
edge_rows = data_sample.head(5000)
for dish, similar_dish, score in zip(edge_rows['recipe1_name'],
                                     edge_rows['recipe2_name'],
                                     edge_rows['cosine_similarity']):
    G.add_edge(dish, similar_dish, weight=score)

# Only edges with similarity above 0.8 are drawn.
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.8]

pos = nx.spring_layout(G)  # positions for all nodes

# nodes
nx.draw_networkx_nodes(G, pos, node_size=50)

# edges
nx.draw_networkx_edges(G, pos, edgelist=elarge,
                       width=5)

# labels
nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

# Hide the axes BEFORE saving so the exported PDF matches what is shown.
plt.axis('off')
plt.savefig("Recipe_Community.pdf", bbox_inches='tight')
plt.show()



### Detect Recipe communities using graph based algorithm

In [None]:
import networkx.algorithms.community as nxcom

# Detect communities with greedy modularity maximisation, largest first.
communities = list(nxcom.greedy_modularity_communities(G))
communities.sort(key=len, reverse=True)
# Count the communities
print(f"The Recipe data has {len(communities)} communities.")

In [None]:
# Print the members of every detected community, one community per line.
for community_members in communities:
    print(community_members)

### Trying out different visualisation for communities


In [None]:
# Recompute the layout with k=0.1 and draw only faint edges (no node markers,
# no labels).
pos = nx.spring_layout(G, k=0.1)
plt.rcParams.update({'figure.figsize': (15, 10)})
faint_edge_style = {
    'node_size': 0,
    'edge_color': "#444444",
    'alpha': 0.05,
    'with_labels': False,
}
nx.draw_networkx(G, pos=pos, **faint_edge_style)

In [None]:
# Recompute the communities (same call as in the detection cell above), sorted
# largest first, and show how many were found.
communities = sorted(nxcom.greedy_modularity_communities(G), key=len, reverse=True)
len(communities)

In [None]:
def set_node_community(G, communities):
    '''Store on every node the 1-based index of the community containing it.'''
    # Numbering starts at 1 so that 0 stays free for marking external edges.
    for label, members in enumerate(communities, start=1):
        for node in members:
            G.nodes[node]['community'] = label

def set_edge_community(G):
    '''Label each edge with its endpoints' shared community, or 0 if they differ.'''
    for edge in G.edges:
        src, dst = edge
        src_community = G.nodes[src]['community']
        dst_community = G.nodes[dst]['community']
        # Internal edges inherit the community id; external edges get 0.
        G.edges[src, dst]['community'] = (
            src_community if src_community == dst_community else 0
        )

def get_color(i, r_off=1, g_off=1, b_off=1):
    '''Return a deterministic (r, g, b) colour tuple for index ``i``.

    Each channel is ``low + span * (((i + offset) * multiplier) % n) / (n - 1)``
    with multipliers 3, 5 and 7 for red, green and blue, so every channel
    value lies between 0.1 and 0.9.

    Args:
        i: Integer index (e.g. a community id) to derive the colour from.
        r_off, g_off, b_off: Per-channel offsets added to ``i``.

    Returns:
        Tuple of three floats ``(r, g, b)``.
    '''
    n = 16
    low, high = 0.1, 0.9
    span = high - low

    def channel(offset, multiplier):
        # Same formula for every channel; only offset and multiplier differ.
        return low + span * (((i + offset) * multiplier) % n) / (n - 1)

    return (channel(r_off, 3), channel(g_off, 5), channel(b_off, 7))

In [None]:
# Start from matplotlib's default settings, then enlarge the figure.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})

# Annotate nodes and edges with their community ids.
set_node_community(G, communities)
set_edge_community(G)

# Split the edges: community 0 marks an external edge, anything above 0 an
# edge internal to that community.
external = []
internal = []
for v, w in G.edges:
    edge_community = G.edges[v, w]['community']
    if edge_community == 0:
        external.append((v, w))
    elif edge_community > 0:
        internal.append((v, w))
internal_color = ["black"] * len(internal)
node_color = [get_color(attrs['community']) for _, attrs in G.nodes(data=True)]

# Options shared by both drawing passes.
shared_style = dict(pos=pos, node_color=node_color, with_labels=False)

# external edges
nx.draw_networkx(
    G,
    node_size=0,
    edgelist=external,
    edge_color="silver",
    alpha=0.2,
    **shared_style)
# internal edges
nx.draw_networkx(
    G,
    edgelist=internal,
    edge_color=internal_color,
    alpha=0.5,
    **shared_style)

In [None]:
## For a detailed write-up on this, please refer to the README file at this GitHub link : https://github.com/Ashwinikumar1/NLP-DL/tree/master/Recipe_Recommendation_Using_Recipe%20Embedding