In [None]:
# Imports and settings

import numpy as np
import pandas as pd
import string

import nltk
nltk.download('popular')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

from google.colab import drive

pd.set_option('display.max_columns', None)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [None]:
# Mount Google Drive so the '/content/drive/...' CSV paths used for reading and writing below resolve
drive.mount('/content/drive')

Mounted at /content/drive


## Data Import

In [None]:
# Load the raw recipe and interaction (review) tables from the mounted Drive folder

recipes = pd.read_csv(filepath_or_buffer = '/content/drive/MyDrive/Colab Notebooks/256_data/RAW_recipes.csv')
reviews = pd.read_csv(filepath_or_buffer = '/content/drive/MyDrive/Colab Notebooks/256_data/RAW_interactions.csv')

In [None]:
# Attach recipe details to every review (reviews.recipe_id -> recipes.id, default inner join)
food = pd.merge(reviews, recipes, left_on = 'recipe_id', right_on = 'id')

## Data Cleaning

In [None]:
# Convert the string column of nutrition values ("[v1, v2, ..., v7]") into separate float columns.
# Vectorized .str operations are used instead of per-row lambdas so that a missing
# nutrition value becomes NaN in every derived column rather than raising.

NUTRITION_COLS = ['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']

food[NUTRITION_COLS] = (
    food['nutrition']
    .str.strip('[]')                 # drop the enclosing list brackets
    .str.split(',', expand = True)   # one column per value, assigned positionally
    .astype(float)
)

In [None]:
# Drop records where the recipe has a single reviewer and that reviewer is the author of the recipe.
#
# The previous version compared the per-recipe *counts* of distinct user_id and
# contributor_id (1 == 1), which selects every single-reviewer recipe, and then used the
# row positions of the grouped frame as if they were index labels of `food`, so
# unrelated rows were removed. Both conditions are now evaluated row by row on `food`.

n_reviewers = food.groupby('recipe_id')['user_id'].transform('nunique')
is_author_review = food['user_id'] == food['contributor_id']
food = food[~((n_reviewers == 1) & is_author_review)]

In [None]:
# Convert text columns from list to string

def convert_to_list(data):
  """Flatten the text of a stringified list into plain words.

  Hyphens become spaces, then every character in ``string.punctuation``
  (which includes the square brackets, quotes and commas) is removed.
  Despite the name, the result is a string, not a list.
  """
  spaced = data.replace('-', ' ')
  strip_punctuation = str.maketrans('', '', string.punctuation)
  return spaced.translate(strip_punctuation)

# Flatten each stringified-list column into a plain punctuation-free string
for text_col in ('tags', 'ingredients', 'steps'):
  food[text_col] = food[text_col].apply(convert_to_list)

In [None]:
# Drop the duplicate recipe key, the raw nutrition string and both date columns
food = food.drop(columns = ['id', 'nutrition', 'date', 'submitted'])

In [None]:
# Rename columns
# NOTE(review): 'date' and 'submitted' are dropped in the preceding cell, so those two
# entries match nothing here (rename ignores missing keys) - confirm which cell reflects the intent.
column_names = {
    'user_id':'reviewer_id',
    'date':'review_date',
    'submitted':'recipe_upload_date',
    'total fat':'total_fat',
    'saturated fat':'saturated_fat',
}
food = food.rename(columns = column_names)

In [None]:
# Remove outliers using IQR method

def removeOutliers(column, food):
  """Remove rows whose `column` value lies outside the 1.5 * IQR fences.

  The previous version always filtered on `food.minutes` regardless of
  `column`, and only rebound a local name, so callers never saw any rows
  removed. Rows are now removed from `food` in place (so existing calls that
  ignore the return value take effect) and the same frame is returned.

  Parameters:
    column: name of the numeric column to test.
    food: DataFrame to filter; modified in place.

  Returns:
    The same DataFrame object, without the outlier rows.

  Notes:
    Rows where `column` is missing are kept: they are ignored when computing
    the quartiles and are never classed as outliers. Rows are removed by
    index label, so the index is assumed to be unique.
  """
  values = food[column].dropna()
  if values.empty:
    return food

  q25, q75 = np.percentile(values, 25), np.percentile(values, 75)
  cut_off = (q75 - q25) * 1.5
  lower, upper = q25 - cut_off, q75 + cut_off

  is_outlier = (food[column] < lower) | (food[column] > upper)
  food.drop(index = food.index[is_outlier], inplace = True)
  return food

# Apply the IQR filter to each numeric column in turn
for outlier_col in ('minutes', 'n_steps', 'calories'):
  removeOutliers(outlier_col, food)

In [None]:
# Row and column counts of `food` after the cleaning cells above
food.shape

(1040414, 20)

In [None]:
# Cast the three free-text columns from object to pandas' dedicated string dtype

food = food.astype({'ingredients': 'string', 'tags': 'string', 'steps': 'string'})

In [None]:
# Impute missing ratings (stored as 0) with the recipe's median rating.
#
# The median is taken over the observed (non-zero) ratings only. Previously it was
# computed on the column that still contained the 0 placeholders, which pulled the
# medians toward 0 and left recipes with only 0 ratings at 0. Recipes with no observed
# rating at all fall back to the overall median of observed ratings.

rating_observed = food['rating'].replace(0, np.nan)
recipe_median = rating_observed.groupby(food['recipe_id']).transform('median')
food['rating'] = rating_observed.fillna(recipe_median).fillna(rating_observed.median())

## Feature Engineering

In [None]:
# Keep the first row seen for each distinct recipe name and renumber the rows from 0

food_sample = (
    food
    .drop_duplicates(subset = 'name', keep = 'first')
    .reset_index(drop = True)
)

In [None]:
# Keyword tables and placeholder columns for the derived features filled in by the next cell.
#
# derive type of recipe from ingredients: a recipe is flagged non_veg when any of these
# keywords occurs in its ingredients text.
# NOTE(review): the next cell tests with `keyword in text`, i.e. plain substring matching,
# so short keywords also hit longer words (e.g. 'ham' inside 'graham', 'egg' inside 'eggplant').
# NOTE(review): 'pultry' looks like a typo of 'poultry', which is also listed - confirm.
non_veg = ['chicken', 'eggs', 'egg', 'beef', 'turkey', 'shrimp', 'scallop', 'breast', 'breasts', 'boneless', 'pork', 'ham', 'whites', 'fish', 'steaks', 'steak', 'yolks', 'tuna', 'meat', 'yolk' ,\
           'pultry', 'meatballs', 'catfish', 'eggnog', 'mincemeat', 'crawfish', 'goldfish', 'swordfish', 'wing', 'meatloaf', 'pigeon', 'octopus', 'quail', 'rabbit', 'liver', 'livers', \
           'liverwurst', 'whitefish', 'squid', 'lobster', 'oyster', 'oysters', 'deer', 'ribs', 'clam', 'clams', 'lamb', 'sheep', 'seafood', 'bacon', 'thighs', 'legs', 'poultry']

# Indicator columns, both initialised to 0; exactly one is set to 1 per recipe later
food_sample['veg'] = 0
food_sample['non_veg'] = 0

# derive cuisine and course type from tags
# Order matters: the first cuisine found in a recipe's tags wins.
# NOTE(review): convert_to_list strips all punctuation (including '_') from tags, so
# 'middle_eastern' can never match the cleaned tag text - confirm the intended spelling.
cuisines = ['american', 'european', 'asian', 'australian', 'indian', 'chinese', 'middle_eastern']
food_sample['cuisine'] = ''

# Tag keywords that mark a recipe as sweet; anything else is treated as savory.
# NOTE(review): ' puddings' carries a leading space, so it only matches when another
# word precedes it in the tags string - confirm whether that is intended.
dessert = ['desserts', 'dairy', 'cookies', 'brownies', 'cakes', 'chocolate', 'pies', 'muffins', 'pancakes', 'waffles',' puddings', 'mousses', 'smoothies', 'cream', 'ice cream', \
            'cheesecake', 'frostings', 'cake', 'cupcakes', 'shakes', 'jams', 'pastry', 'baking', 'biscotti', 'jellies', 'pie', 'sugar']
food_sample['sweet'] = 0
food_sample['savory'] = 0

In [None]:
# get above features from existing features
#
# Vectorized replacement for the former iterrows() loop, which wrote cell by cell through
# the private DataFrame._set_value API. The matching rule is unchanged: a keyword counts
# as present when it occurs anywhere in the text (case-sensitive substring match).
# Missing text is treated as "no keyword found".

ingredients_text = food_sample['ingredients']
tags_text = food_sample['tags']

# non_veg / veg: any non-veg keyword in the ingredients text
non_veg_pattern = '|'.join(map(re.escape, non_veg))
has_non_veg = ingredients_text.str.contains(non_veg_pattern, na = False).astype(bool)
food_sample['non_veg'] = has_non_veg.astype(int)
food_sample['veg'] = (~has_non_veg).astype(int)

# sweet / savory: any dessert keyword in the tags text
dessert_pattern = '|'.join(map(re.escape, dessert))
is_dessert = tags_text.str.contains(dessert_pattern, na = False).astype(bool)
food_sample['sweet'] = is_dessert.astype(int)
food_sample['savory'] = (~is_dessert).astype(int)

# cuisine: np.select returns the choice for the first true condition, which keeps the
# "first cuisine in list order wins" rule; recipes with no match keep ''
cuisine_hits = [tags_text.str.contains(cuisine_name, regex = False, na = False).to_numpy(dtype = bool)
                for cuisine_name in cuisines]
food_sample['cuisine'] = np.select(cuisine_hits, cuisines, default = '')

In [None]:
# Recipes whose cuisine is blank (no cuisine keyword matched) are labelled 'other'
cuisine_filled = food_sample['cuisine'].replace(r'^\s*$', 'other', regex=True)

# One hot encode the cuisine column into cuisine_<name> indicator columns
food_sample = pd.get_dummies(food_sample.assign(cuisine = cuisine_filled), columns = ['cuisine'])

In [None]:
# Decide whether the food is healthy from its calories and its nutrient values (per the original
# note, percentages of the daily recommended intake).
# - More than 2500 calories: unhealthy.
# - Otherwise each nutrient above its limit adds to a score: sugar above 30 adds 2 (so it is
#   unhealthy on its own); total fat, sodium, saturated fat and carbohydrates above 30, and
#   protein above 70, add 1 each.
# - A score of 2 or more is unhealthy, anything lower is healthy.
# NOTE(review): the original note quoted 60 for protein while the code used 70; the code's
# value is kept - confirm which is intended.

def healthyTag(row):
    """Return 'healthy' or 'unhealthy' for one recipe row.

    Fixes the calorie rule: recipes above 2500 calories were previously
    labelled 'healthy', contradicting the stated rule.

    Parameters:
        row: mapping-like (e.g. a DataFrame row) with 'calories', 'total_fat',
            'sugar', 'sodium', 'protein', 'saturated_fat' and 'carbohydrates'
            values convertible to float.

    Returns:
        'unhealthy' when calories exceed 2500 or the weighted count of
        exceeded nutrient limits is at least 2, otherwise 'healthy'.
    """
    if float(row['calories']) > 2500:
        return 'unhealthy'

    # (column, limit, weight added to the score when the limit is exceeded)
    weighted_limits = (
        ('total_fat', 30, 1),
        ('sugar', 30, 2),
        ('sodium', 30, 1),
        ('protein', 70, 1),
        ('saturated_fat', 30, 1),
        ('carbohydrates', 30, 1),
    )
    count = sum(weight for column, limit, weight in weighted_limits if float(row[column]) > limit)

    return 'unhealthy' if count >= 2 else 'healthy'
    
# Create the label column, Healthy, then one hot encode it into 'healthy' / 'unhealthy' indicators
food_sample['healthy'] = food_sample.apply(healthyTag, axis = 1)

food_sample = (
    pd.get_dummies(food_sample, columns = ['healthy'])
    .rename(columns={'healthy_healthy':'healthy', 'healthy_unhealthy':'unhealthy'})
)

In [None]:
# Bring the recipe-level features derived on food_sample back onto every review row of food.
# NOTE(review): food_sample holds one row per recipe *name* while this join is on recipe_id,
# so recipes sharing a name with an earlier recipe get NaN features - confirm that is acceptable.

feature_cols = ['recipe_id', 'veg', 'non_veg', 'sweet', 'savory',
                'cuisine_american', 'cuisine_asian', 'cuisine_australian',
                'cuisine_european', 'cuisine_indian', 'cuisine_other']

food_encoded = food.merge(food_sample[feature_cols], how = 'left', on = 'recipe_id')

In [None]:
# NLP processing

# English stopwords to discard, and the token length window kept by tokenizer():
# MIN_CHAR <= len(token) < MAX_CHAR
STOPWORDS = set(stopwords.words('english'))
MIN_CHAR, MAX_CHAR = 3, 150

PATTERN_S = re.compile("\'s")          # possessive 's
PATTERN_RN = re.compile("\\r\\n")      # carriage return + line feed pair
PATTERN_PUNC = re.compile(r"[^\w\s]")  # anything that is neither a word character nor whitespace

# Lowercase the text, then remove possessive 's, CR/LF pairs and punctuation
def clean_text(text):
  """Return `text` lowercased with every match of the three patterns removed.

  The patterns are applied in order (possessive, CR/LF, punctuation) and each
  match is replaced with the empty string.
  """
  cleaned = text.lower()
  for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
    cleaned = pattern.sub('', cleaned)

  return cleaned

# Split text into word tokens, optionally lemmatize them, and filter the result
def tokenizer(text, lemmatize):
  """Tokenize `text` and return the tokens worth keeping.

  Parameters:
    text: string to tokenize with nltk's word_tokenize.
    lemmatize: when truthy, each token is passed through WordNetLemmatizer first.

  Returns:
    List of tokens whose length is at least MIN_CHAR and below MAX_CHAR and
    which are not in STOPWORDS.
  """
  words = word_tokenize(text)

  if lemmatize:
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

  return [word for word in words
          if MIN_CHAR <= len(word) < MAX_CHAR and word not in STOPWORDS]

# Wrapper that cleans, tokenizes and re-joins one text column
def data_prep(col_str, df):
  """Add a 'rec_<col_str>' column holding the cleaned, lemmatized text of `col_str`.

  Parameters:
    col_str: name of the text column to process.
    df: DataFrame holding that column; it is modified in place (the new column
        is added, the temporary 'clean_'/'token_' columns are added and removed).

  Returns:
    A copy of `df` with its index reset to 0..n-1.
  """
  clean_col = 'clean_' + col_str
  token_col = 'token_' + col_str
  rec_col = 'rec_' + col_str

  print('Cleaning text')
  df[clean_col] = df[col_str].apply(clean_text)

  print('Tokenization and Lemmatization in progress')
  df[token_col] = df[clean_col].apply(lambda text: tokenizer(text, lemmatize = True))

  # Join the surviving tokens back into one space separated string
  df[rec_col] = df[token_col].apply(lambda tokens: ' '.join(str(token) for token in tokens))

  df.drop(columns = [clean_col, token_col], inplace = True)
  return df.reset_index(drop = True)

In [None]:
# Adds 'rec_ingredients': the ingredients text cleaned, tokenized, lemmatized and re-joined into one string
food_sample = data_prep('ingredients', food_sample)

Cleaning text
Tokenization and Lemmatization in progress


In [None]:
# Build a temporary ingredients + steps text, derive 'rec_ingredients_steps' from it,
# then discard the temporary column
food_sample['ingredients_steps'] = food_sample['ingredients'] + ' ' + food_sample['steps']
food_sample = (
    data_prep('ingredients_steps', food_sample)
    .drop(columns = ['ingredients_steps'])
)

Cleaning text
Tokenization and Lemmatization in progress


In [None]:
# Getting combined text - cuisine, healthiness, recipe type, meal type, ingredients and tags
#
# The text is built from Series read off food_sample and assigned straight back as
# 'rec_combined'. This replaces the earlier approach of writing into a column slice
# (which raised SettingWithCopyWarning) and merging the result back on 'name'
# (which added duplicate columns when the cell was re-run).

cuisine_cols = ['cuisine_american', 'cuisine_asian', 'cuisine_australian', 'cuisine_european',
                'cuisine_indian', 'cuisine_other']

# Reverse one hot encoding: idxmax(axis = 1) returns the name of the column holding the
# largest value in each row (the first such column on ties)
recipe_type = food_sample[['veg', 'non_veg']].idxmax(axis = 1)
meal = food_sample[['sweet', 'savory']].idxmax(axis = 1)
health = food_sample[['healthy', 'unhealthy']].idxmax(axis = 1)
cuisine = food_sample[cuisine_cols].idxmax(axis = 1)

# Replace the encoded column names with the plain cuisine names
cuisine = cuisine.replace({'cuisine_american':'american', 'cuisine_asian':'asian', 'cuisine_australian':'australian',
                           'cuisine_european':'european', 'cuisine_indian':'indian', 'cuisine_other':'other'})

# Create a combined string; rows with missing ingredients or tags end up with a missing value
food_sample['rec_combined'] = cuisine + ' ' + health + ' ' + recipe_type + ' ' + meal + ' made of ' + food_sample['ingredients'] + \
' tagged under ' + food_sample['tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['type'] = pd.get_dummies(temp[['veg', 'non_veg']]).idxmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['meal'] = pd.get_dummies(temp[['sweet', 'savory']]).idxmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['cuisine'] = pd.get_dummies(temp[['cuisine_american', 'cuisine_asia

In [None]:
# Write out dataframes: the recipe-level sample and the review-level encoded table

csv_outputs = (
    (food_sample, '/content/drive/MyDrive/Colab Notebooks/256_data/food_sample.csv'),
    (food_encoded, '/content/drive/MyDrive/Colab Notebooks/256_data/food.csv'),
)
for frame, csv_path in csv_outputs:
  frame.to_csv(csv_path)