In [1]:
import pandas as pd 
import numpy as np
import re
from datetime import datetime
import ast 
import pickle
import json
import warnings
warnings.filterwarnings('ignore')

In [2]:
recipes_df = pd.read_csv('data/recipes60k.csv')

In [3]:
recipes_df.head(2)

Unnamed: 0.1,Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."


### Columns to Drop 

In [4]:
recipes_df = recipes_df.drop(columns = ['AuthorId',
                                  'AuthorName',
                                  'DatePublished',
                                   'Description',
                                  'Images',
                                       'Unnamed: 0'])

In [5]:
recipes_df = recipes_df.drop(columns = ['RecipeYield'])

### Drop columns with missing values

In [6]:
recipe_clean_df = recipes_df.dropna(how='any',axis=0) 

In [7]:
recipe_clean_df = recipe_clean_df.reset_index()

In [8]:
#recipe_clean_df.to_csv('data/clean_recipes.csv')

## Prep Time and Cook Time into floats

In [9]:
# remove the string character 
recipe_clean_df['CookTime'] = recipe_clean_df['CookTime'].str.replace(r'PT', '')
recipe_clean_df['PrepTime'] = recipe_clean_df['PrepTime'].str.replace(r'PT', '')
recipe_clean_df['TotalTime'] = recipe_clean_df['TotalTime'].str.replace(r'PT', '')

In [10]:
# Convert to date time
recipe_clean_df['CookTime'] = recipe_clean_df['CookTime'].str.replace(r'M', '')
recipe_clean_df['PrepTime'] = recipe_clean_df['PrepTime'].str.replace(r'M', '')
recipe_clean_df['TotalTime'] = recipe_clean_df['TotalTime'].str.replace(r'M', '')

### Function to convert string time to minutes

In [11]:
def time_converter(column):
    for count, value in enumerate(recipe_clean_df[f'{column}']):
        if type(value) == str:
            for count2, item in enumerate(value):
                if item == 'H':
                    if value[-1:]  in 'H':
                        recipe_clean_df[f'{column}'][count] = float(recipe_clean_df[f'{column}'][count][:value.index(item)]) * 60
                    else:
                        recipe_clean_df[f'{column}'][count] = float(recipe_clean_df[f'{column}'][count][:value.index(item)]) * 60 + (float(recipe_clean_df[f'{column}'][count][value.index(item)+1:]))
                        
                elif item == 'S':
                    recipe_clean_df[f'{column}'][count] = 0

In [12]:
time_converter('TotalTime')
time_converter('PrepTime')
time_converter('CookTime')

In [13]:
recipe_clean_df['TotalTime'] = recipe_clean_df['TotalTime'].astype(str).astype(float)
recipe_clean_df['PrepTime'] = recipe_clean_df['PrepTime'].astype(str).astype(float)
recipe_clean_df['CookTime'] = recipe_clean_df['CookTime'].astype(str).astype(float)

In [14]:
recipe_clean_df[recipe_clean_df['TotalTime'].apply(lambda x: isinstance(x, float))]

Unnamed: 0,index,RecipeId,Name,CookTime,PrepTime,TotalTime,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,...,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions
0,0,38,Low-Fat Berry Blue Frozen Dessert,1440.0,45.0,1485.0,Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"",...","c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla ...",...,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,1,39,Biryani,25.0,240.0,265.0,Chicken Breast,"c(""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"",...","c(""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ...","c(""saffron"", ""milk"", ""hot green chili peppers""...",...,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,"c(""Soak saffron in warm milk for 5 minutes and..."
2,2,40,Best Lemonade,5.0,30.0,35.0,Beverages,"c(""Low Protein"", ""Low Cholesterol"", ""Healthy"",...","c(""1 1/2"", ""1"", NA, ""1 1/2"", NA, ""3/4"")","c(""sugar"", ""lemons, rind of"", ""lemon, zest of""...",...,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,3,41,Carina's Tofu-Vegetable Kebabs,20.0,1440.0,1460.0,Soy/Tofu,"c(""Beans"", ""Vegetable"", ""Low Cholesterol"", ""We...","c(""12"", ""1"", ""2"", ""1"", ""10"", ""1"", ""3"", ""2"", ""2...","c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""...",...,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,"c(""Drain the tofu, carefully squeezing out exc..."
4,4,42,Cabbage Soup,30.0,20.0,50.0,Vegetable,"c(""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""...","c(""46"", ""4"", ""1"", ""2"", ""1"")","c(""plain tomato juice"", ""cabbage"", ""onion"", ""c...",...,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,"c(""Mix everything together and bring to a boil..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22461,59988,64339,Rellenos Jaibas,3.0,15.0,18.0,Crab,"c(""Mexican"", ""Spicy"", ""< 30 Mins"", ""Stove Top"")","c(""1/2"", ""4"", ""1"", ""1/4"", ""4"", ""2"", ""1/4"", ""3""...","c(""asparagus spears"", ""fresh tomato"", ""ricotta...",...,29.7,7.0,227.2,115.8,25.4,3.1,7.1,14.4,2.0,"c(""Remove stems from chilies, make a long slit..."
22462,59989,64340,Great Canadian Butter Tarts,25.0,30.0,55.0,Dessert,"c(""Canadian"", ""< 60 Mins"", ""For Large Groups"",...","c(""16"", ""2"", ""1"", ""1"", ""1/4"", ""1"", ""1"", ""2"")","c(""eggs"", ""seedless raisin"", ""brown sugar"", ""m...",...,3.7,0.8,27.1,48.6,20.7,0.3,18.7,1.1,16.0,"c(""Preheat oven to 350 degrees F."", ""Place she..."
22463,59990,64341,Party Time Bruschetta,6.0,20.0,26.0,Vegetable,"c(""Canadian"", ""Broil/Grill"", ""< 30 Mins"", ""Oven"")","c(""4"", ""1"", ""2"", ""1/4"", ""1/4"", ""1"", NA, NA, NA)","c(""fresh tomato"", ""fresh basil"", ""salt"", ""fres...",...,0.3,0.1,0.0,106.0,4.9,0.4,0.8,0.8,12.0,"c(""Preheat BBQ to medium."", ""Grill 4 slices of..."
22464,59991,64342,Portabella Mushroom Burgers,10.0,10.0,20.0,Vegetable,"c(""Canadian"", ""Summer"", ""Broil/Grill"", ""< 30 M...","c(""4"", NA, NA, NA, ""1"", NA, NA, NA)","c(""portabella mushrooms"", ""asiago cheese"", ""pe...",...,0.2,0.0,0.0,5.0,4.3,1.3,1.5,2.1,4.0,"c(""Preheat BBQ to medium-high."", ""Brush mushro..."


### Write to CSV

In [15]:
recipes10_df = recipe_clean_df[:10_000]
recipes10_df = recipes10_df.loc[~recipes10_df.RecipeIngredientParts.str.contains("character",  na=False)].reset_index()

In [16]:
# recipe_df_1 = recipe_clean_df.copy()

In [17]:
# recipe_clean_df.to_csv('./data/recipes141.csv', sep='\t', encoding='utf-8')

In [18]:
# recipes10_df = pd.read_csv('data/recipes10.csv', error_bad_lines=False, encoding='utf-8')

In [19]:
# recipes10k.to_csv('./data/recipes10.csv', sep=',', encoding='utf-8')

### Change string to a list

In [20]:
def list_convert(dataframe, column):
    for count, value in enumerate(dataframe[f'{column}']):         
        dataframe[f'{column}'][count] = dataframe[f'{column}'][count].replace('c(', '')
        dataframe[f'{column}'][count] = dataframe[f'{column}'][count].replace(')', '')
        dataframe[f'{column}'][count] = dataframe[f'{column}'][count].replace('0', '')
        dataframe[f'{column}'][count] = dataframe[f'{column}'][count].replace('(', '')
        dataframe[f'{column}'][count] = dataframe[f'{column}'][count].replace('\n', '')
        dataframe[f'{column}'][count] = ast.literal_eval(str(dataframe[f'{column}'][count]))

In [21]:
list_convert(recipes10_df, 'Keywords')

In [22]:
# list_convert(recipes10_df, 'RecipeIngredientQuantities')

In [23]:
list_convert(recipes10_df, 'RecipeIngredientParts')

In [24]:
list_convert(recipes10_df, 'RecipeInstructions')

In [25]:
# recipes10_df = recipes10_df.drop(columns = ['Unnamed: 0', 'index', 'Unnamed: 0.1'])

In [26]:
# recipes10_df.to_csv('./data/recipes10k_df.csv', sep=',', encoding='utf-8')

#### No cook time over four hours

In [27]:
recipes10_df = recipes10_df[(recipes10_df['CookTime'] <= 240) & (recipes10_df['TotalTime'] <= 480)]

#### Check Data is in correct range 

In [28]:
recipes10_df = recipes10_df[(recipes10_df['RecipeServings'] > 0) & (recipes10_df['RecipeServings'] < 20)]

#### Return recipes where Number of reviews is greater than 2

In [29]:
recipes10_df = recipes10_df[recipes10_df['ReviewCount'] > 2]

#### Create calories per serving column 

In [30]:
recipes10_df['calories_per_serving'] = recipes10_df['Calories']/recipes10_df['RecipeServings']

In [31]:
recipes10_df = recipes10_df.drop(columns = ['level_0', 'RecipeId'])

In [32]:
recipes10_df = recipes10_df.reset_index()

#### Convert tuple to list

In [33]:
def tuple_to_list(dataframe, column):
    for count, value in enumerate(dataframe[f'{column}']):
        dataframe[f'{column}'][count] = list(dataframe[f'{column}'][count])

In [34]:
tuple_to_list(recipes10_df, 'RecipeIngredientParts')
tuple_to_list(recipes10_df, 'Keywords')
tuple_to_list(recipes10_df, 'RecipeInstructions')

#### Write as a pickle

In [35]:
# recipes10_df['RecipeIngredientParts'] = list(recipes10_df['RecipeIngredientParts'])
# recipes10_df['Keywords'] = list(recipes10_df['Keywords'])
# recipes10_df['RecipeInstructions'] = list(recipes10_df['RecipeInstructions'])

In [36]:
# recipes10_df.to_csv('./data/recipes_clean_10k_df', index = False)

In [37]:
# recipes10_df.to_pickle('./data/recipes_clean_10k.pkl')

In [38]:
# with open('./data/recipes_clean_10k.pickle', 'wb') as f:
#     pickle.dump(recipes10_df, f, 3)

In [39]:
# pickle.dump(recipes10_df, open('./data/recipes_clean_10k', 'wb'))

# Take different cuisines from original dataset

In [40]:
big_recipes_df = pd.read_csv('./data/cuisine.csv')

In [41]:
mylist = ['Indian', 'African', 'Chinese', 'Japanese', 'American',  'Thai', 'Italian']

In [42]:
cuisine_df = big_recipes_df[big_recipes_df['RecipeCategory'].isin(mylist)]

In [43]:
cuisine_df['RecipeCategory'].value_counts()

Chinese     527
Thai        438
Japanese    327
African     267
Indian      241
Name: RecipeCategory, dtype: int64

In [44]:
cuisine_df2 = cuisine_df.drop(cuisine_df.index[[55,56]]).reset_index()

ValueError: cannot insert level_0, already exists

In [None]:
# list_convert(cuisine_df2, 'RecipeIngredientParts')

In [None]:
hi = cuisine_df2[cuisine_df2['RecipeIngredientParts'].str.len() < 10]

In [None]:
cuisine_df2.to_csv('./data/cuisine.csv', index = False)