In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
df = pd.read_csv('../data/final_data.csv')
df.head()

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country,processed_ingredients,health_label
0,"Mutti,POLPA",pulpe de tomates,"Pflanzliche Lebensmittel und Getränke,Pflanzli...","Australien,Österreich,Belgien,Kanada,Frankreic...","['en:fruits-and-vegetables', 'en:vegetables']",en:vegetables,"tomatoes 99.8%, salt",Plant-based foods and beverages,canada,"tomatoes 99.8%, salt",healthy
1,"Maïzena, Unilever",Maizena Fleur de Maïs Sans Gluten 400g,"Plant-based foods and beverages, Plant-based f...","Belgique, Canada, France, Martinique, La Réuni...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,Amidon de maïs.,Plant-based foods and beverages,canada,amidon de maïs.,healthy
2,Barilla,Lasagne all'uovo,"Cibi e bevande a base vegetale, Cibi a base ve...","Belgium,Canada,Croatia,France,Germany,Greece,H...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,"Semola di grano duro, uova fresche di categori...",Plant-based foods and beverages,canada,"semola di grano duro, uova fresche di categori...",healthy
3,Tipiak,Fine chapelure de pain,"Aliments et boissons à base de végétaux,Alimen...","Canada,France","['en:cereals-and-potatoes', 'en:bread']",en:bread,"Farine de blé (gluten), sel, levure, Traces po...",Plant-based foods and beverages,canada,"farine de blé (gluten), sel, levure, traces po...",healthy
4,"Zespri,Sungold,Catania,ALDI Zespri",Kiwi Sungold,"Aliments et boissons à base de végétaux,Alimen...","Belgique,Canada,France,Allemagne,Pologne,Espag...","['en:fruits-and-vegetables', 'en:fruits']",en:fruits,Kiwifruit,Plant-based foods and beverages,canada,kiwifruit,healthy


In [31]:
df.columns

Index(['brands', 'product_name', 'categories', 'countries', 'food_groups_tags',
       'food_groups', 'ingredients_text', 'category_name', 'country',
       'processed_ingredients', 'health_label'],
      dtype='object')

In [32]:
data = df[['product_name','food_groups','processed_ingredients','health_label']]
data.head()


Unnamed: 0,product_name,food_groups,processed_ingredients,health_label
0,pulpe de tomates,en:vegetables,"tomatoes 99.8%, salt",healthy
1,Maizena Fleur de Maïs Sans Gluten 400g,en:cereals,amidon de maïs.,healthy
2,Lasagne all'uovo,en:cereals,"semola di grano duro, uova fresche di categori...",healthy
3,Fine chapelure de pain,en:bread,"farine de blé (gluten), sel, levure, traces po...",healthy
4,Kiwi Sungold,en:fruits,kiwifruit,healthy


In [33]:
#standarized the column name 
data['product_name']= data['product_name'].str.lower().str.strip()
data['food_groups']=data['food_groups'].str.replace('en:','').str.lower().str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['product_name']= data['product_name'].str.lower().str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['food_groups']=data['food_groups'].str.replace('en:','').str.lower().str.strip()


In [34]:
# Vectorize Ingredients
tf_idf_vec = TfidfVectorizer()
ingredients_vector =tf_idf_vec.fit_transform(df['processed_ingredients'])


In [35]:
#calculate similarity 
similarity_matrix = cosine_similarity(ingredients_vector)

In [49]:
#recommending healthier options for non healthy foods
def recommend_healthier_alternate(product_name,data,similarity_matrix,top_n=5):
    try:
    #index of input product 
        product_idx = data[data['product_name']==product_name.lower()].index[0]
        #check if the product is unhealthy
        health_label = data.iloc[product_idx]['health_label']
        if health_label == 'healthy':
            return f"Product '{product_name}' is already labeled as healthy. No recommendations needed."
        #get the category of product
        category = data.iloc[product_idx]['food_groups']
        #get the similarity for the input product 
        similarity_scores = list(enumerate(similarity_matrix[product_idx]))
        #filter for healthier products
        healthier_indices = data[(data['food_groups'] == category) & 
                                 (data['health_label'] == "healthy")].index
        filtered_scores = [score for score in similarity_scores if score[0] in healthier_indices]
        #sort
        sorted_scores = sorted(filtered_scores, key=lambda x: x[1], reverse=True)
        #recommended products
        recommendations = [
            (data.iloc[i]['product_name'], score) for i, score in sorted_scores[:top_n]
        ]
        return recommendations if recommendations else f"No healthier alternatives found for '{product_name}'."
    except IndexError:
        return f"Product '{product_name}' not found in the dataset."
        
        
        

In [55]:



unhealthy_product = data[data['health_label'] == 'not healthy'].iloc[5]['product_name']
unhealthy_product


'spoon size shredded wheat & bran - canada'

In [56]:
healthier_recommendations = recommend_healthier_alternate(unhealthy_product, data, similarity_matrix)

healthier_recommendations


[('raisin bran delicious raisins perfectly balanced with crisp',
  np.float64(0.5258721472749123)),
 ('raisin bran delicious raisins perfectly balanced with crisp',
  np.float64(0.5258721472749123)),
 ('raisin bran delicious raisins perfectly balanced with crisp',
  np.float64(0.5258721472749123)),
 ('raisin bran breakfast cereal', np.float64(0.5258721472749123)),
 ('almond raisin müslix', np.float64(0.48159775728148135))]

In [66]:
# Filter products in the 'snacks' category and labeled as 'not healthy'
unhealthy_snacks = data[(data['food_groups'].str.contains('bread', na=False)) & (data['health_label'] == 'not healthy')]

# Get recommendations for each unhealthy snack
snack_recommendations = {}
for bread in unhealthy_snacks['product_name']:
    recommendations = recommend_healthier_alternate(bread, data, similarity_matrix)
    snack_recommendations[snack] = recommendations

# Display recommendations for unhealthy snacks
snack_recommendations



{'original thick sliced white bread': [('the original artesano bakery bread',
   np.float64(0.729226975132324)),
  ('sourdough round', np.float64(0.643251553316014)),
  ('butter enriched bread, butter', np.float64(0.6371035597525322)),
  ('butter enriched bread, butter', np.float64(0.6371035597525322)),
  ('perfectly crafted thick sliced white', np.float64(0.6247025657651942))]}

In [61]:
data['food_groups']

0            vegetables
1               cereals
2               cereals
3                 bread
4                fruits
              ...      
25723    one-dish-meals
25724    one-dish-meals
25725           cereals
25726    one-dish-meals
25727    one-dish-meals
Name: food_groups, Length: 25728, dtype: object