In [6]:
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
import altair as alt
import seaborn as sns
from vega_datasets import data
import nltk

In [2]:
# Mount Google Drive so files stored there are reachable from this Colab session
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Read the training and test recipe files from the mounted Drive folder
DATA_DIR = '/content/drive/MyDrive/Grad School-Files/Coding/Kaggle Cooking'
recipes = pd.read_json(DATA_DIR + '/train.json')
test = pd.read_json(DATA_DIR + '/test.json')

In [4]:
# Preview the first five rows of the training data
recipes.head(5)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [None]:
# Examine training data shape and datatypes
print(recipes.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the report.
recipes.info()

(39774, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB
None


In [None]:
# Number of recipes per cuisine as a two-column frame: 'cuisine', 'count'.
# Naming the index and the values explicitly gives the same columns on every
# pandas version; the previous rename map relied on the pre-2.0 column names
# ('index' / 'cuisine') and produced two 'count' columns on pandas >= 2.0.
cuisines = (
    recipes['cuisine']
    .value_counts()
    .rename_axis('cuisine')
    .reset_index(name='count')
)

In [None]:
# Horizontal bar chart of recipe counts per cuisine, with the count printed
# at the end of each bar.
bars = alt.Chart(cuisines).mark_bar().encode(
    x='count:Q',
    # Nominal axes are ordered alphabetically by default; sort='-x' ranks the
    # bars by descending count instead.
    y=alt.Y('cuisine:N', sort='-x'),
)
text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='count:Q'
)
(bars + text).configure_axis(
    grid=False
).properties(
    height=500,
    title='Number of recipes per cuisine',
)

## Common Ingredients Across Cuisines

In [None]:
# Lower-case every ingredient string and drop the ones that are English
# stopwords. nltk itself is imported in the notebook's first cell.
from nltk.corpus import stopwords
nltk.download('stopwords')

sw = set(stopwords.words('english'))
ing = recipes['ingredients']
ing_new = []
for recipe in ing:
    # Normalise case *before* the stopword test: the stopword list is
    # lower-case, so testing the raw string let capitalised stopwords through.
    lowered = [word.lower() for word in recipe]
    ing_new.append([word for word in lowered if word not in sw])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Flatten the per-recipe lists into one flat list of ingredient strings.
# A single pass replaces sum(ing_new, []), which re-copies the growing result
# on every addition. The isinstance guard keeps the cell safe to re-run once
# ing_new is already flat (the old form raised TypeError on a second run).
ing_new = [
    ingredient
    for entry in ing_new
    for ingredient in (entry if isinstance(entry, list) else [entry])
]

In [None]:
# Tally how many times each entry of ing_new occurs, using NLTK's FreqDist
nlp_words=nltk.FreqDist(ing_new)

In [None]:
# Turn the (ingredient, count) pairs into a two-column frame and preview it
ingredient_counts = list(nlp_words.items())
ing_freq = pd.DataFrame(ingredient_counts, columns=['ingredient', 'frequency'])
ing_freq.head()

Unnamed: 0,ingredient,frequency
0,romaine lettuce,270
1,black olives,229
2,grape tomatoes,228
3,garlic,7380
4,pepper,4438


In [None]:
# Keep the 25 most frequent ingredients (note: despite the name ing_50, only
# 25 rows are selected)
ing_50 = ing_freq.sort_values(by='frequency', ascending=False).head(25)

In [None]:
# Horizontal bar chart of the most frequent ingredients, with the frequency
# printed at the end of each bar.
bars = alt.Chart(ing_50).mark_bar().encode(
    x='frequency:Q',
    # Nominal axes are ordered alphabetically by default; sort='-x' ranks the
    # bars by descending frequency instead.
    y=alt.Y('ingredient:N', sort='-x'),
)
text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='frequency:Q'
)
(bars + text).configure_mark(
    color='#d7abf5'
).configure_axis(
    grid=False
).properties(
    height=500,
    title='Most common ingredients across all cuisines',
)

## Common Ingredients by Cuisine

In [13]:
# Map each cuisine to its ten most frequent (ingredient, count) pairs,
# in order of first appearance of the cuisine in `recipes`.
top_ing_cuisine = {}

for cuisine in recipes['cuisine'].unique():
    cuisine_lists = recipes.loc[recipes['cuisine'] == cuisine, 'ingredients']
    # Stream the ingredients straight into the counter; the previous
    # sum(lists, []) re-copied the accumulated list for every recipe.
    ings = nltk.FreqDist(
        ingredient
        for ingredient_list in cuisine_lists
        for ingredient in ingredient_list
    )
    # most_common(10) returns the same pairs, in the same order, as sorting
    # all items by count (descending) and slicing the first ten.
    top_ing_cuisine[cuisine] = ings.most_common(10)

In [44]:
# One row per cuisine, one column per rank ('top 1' .. 'top 10'), holding only
# the ingredient names (the counts in each pair are dropped)
rank_columns = ['top {}'.format(rank) for rank in range(1, 11)]
name_rows = [[pair[0] for pair in ranked] for ranked in top_ing_cuisine.values()]
top10 = pd.DataFrame(name_rows, index=list(top_ing_cuisine), columns=rank_columns)

In [46]:
# Display the per-cuisine table of top-ranked ingredient names
top10


Unnamed: 0,top 1,top 2,top 3,top 4,top 5,top 6,top 7,top 8,top 9,top 10
greek,salt,olive oil,dried oregano,garlic cloves,feta cheese crumbles,extra-virgin olive oil,fresh lemon juice,ground black pepper,garlic,pepper
southern_us,salt,butter,all-purpose flour,sugar,large eggs,baking powder,water,unsalted butter,milk,buttermilk
filipino,salt,garlic,water,onions,soy sauce,pepper,oil,sugar,carrots,ground black pepper
indian,salt,onions,garam masala,water,ground turmeric,garlic,cumin seed,ground cumin,vegetable oil,oil
jamaican,salt,onions,water,garlic,ground allspice,pepper,scallions,dried thyme,black pepper,garlic cloves
spanish,salt,olive oil,garlic cloves,extra-virgin olive oil,onions,water,tomatoes,ground black pepper,red bell pepper,pepper
italian,salt,olive oil,garlic cloves,grated parmesan cheese,garlic,ground black pepper,extra-virgin olive oil,onions,water,butter
mexican,salt,onions,ground cumin,garlic,olive oil,chili powder,jalapeno chilies,sour cream,avocado,corn tortillas
chinese,soy sauce,sesame oil,salt,corn starch,sugar,garlic,water,green onions,vegetable oil,scallions
british,salt,all-purpose flour,butter,milk,eggs,unsalted butter,sugar,onions,baking powder,large eggs
