MIS 285N Cognitive Computing<br>
Final Project<br>
Jerry Che - Jose Guerrero - Riley Moynihan - Noah Placke - Sarah Teng - Palmer Wenzel

# Instructions Generation Model

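Probabilistic technique: a token-level n-gram model that picks each next word from the distribution of words observed after the preceding n words in the recipe steps.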

#### Read data from CSV.

In [1]:
import pandas as pd
# pd.options.display.max_columns = 500


# Load the processed recipes table.
# (For quicker experiments, subsample afterwards with
#  df = df.sample(frac=0.1, random_state=42).)
recipes_path = '../data/kaggle/processed/recipes_processed.csv'
df = pd.read_csv(recipes_path)

# Preview the first three rows
df.head(3)

Unnamed: 0,name,steps,crabmeat,creamcheese,greenonions,garlicsalt,refrigeratedcrescentdinnerrolls,eggyolk,water,sesameseeds,...,tex-mexseasoning,lightnon-dairywhippedtopping,stelladoroanginetticookies,viennabread,beefroundrumproast,romaineleaf,nuocnam,thaiholybasil,driedblacktrumpetmushrooms,driedwoodearmushrooms
0,crab filled crescent snacks,"heat over to 375 degrees, spray large cookie s...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,curried bean salad,"drain & rinse beans, stir all ingredients toge...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,delicious steak with onion marinade,heat the oil in a heavy-based pan and cook the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Drop unnecessary columns.

In [2]:
# The recipe name is not used by the model, so discard that column
df = df.drop(columns=['name'])

# Preview the first three rows
df.head(3)

Unnamed: 0,steps,crabmeat,creamcheese,greenonions,garlicsalt,refrigeratedcrescentdinnerrolls,eggyolk,water,sesameseeds,sweetandsoursauce,...,tex-mexseasoning,lightnon-dairywhippedtopping,stelladoroanginetticookies,viennabread,beefroundrumproast,romaineleaf,nuocnam,thaiholybasil,driedblacktrumpetmushrooms,driedwoodearmushrooms
0,"heat over to 375 degrees, spray large cookie s...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"drain & rinse beans, stir all ingredients toge...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,heat the oil in a heavy-based pan and cook the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Separate steps from ingredients.

In [3]:
# Instruction text for each recipe
steps = df['steps']
# Everything that remains once the instruction text is taken out
ingredients = df.drop(columns=['steps'])

#### Create an ngram model class

In [4]:
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import random


class N_Gram:
    """
    Token-level n-gram model.

    Uses the "sliding window" technique: the next token is predicted from the
    ``n`` tokens that precede it, using the distribution of next tokens
    observed after that n-gram across the whole training corpus.

    Adapted from: https://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139

    Attributes:
        n: Number of preceding tokens used as context.
        tokens: Flat list of all training tokens kept after frequency
            filtering.
        ngram_probas: Maps each context (a tuple of ``n`` tokens) to a list
            of ``(next_token, probability)`` pairs, most probable first.
        token_probas: Maps each token to its relative frequency in
            ``tokens``; used as a fallback for contexts never seen in
            training.
        debug: When true, ``generate_text`` prints how many times it had to
            use the fallback distribution.
    """

    def __init__(self, n=2, debug=False):
        self.n = n
        self.tokens = []
        self.ngram_probas = {}
        self.token_probas = {}
        self.debug = debug

    def fit(self, corpus, min_df=.1, tokenizer=None):
        """Train the model on a given corpus.

        Counts are pooled over every document, so both ``ngram_probas`` and
        ``token_probas`` describe the whole corpus. N-grams are built per
        document and never span a document boundary. Any previously fitted
        state is replaced.

        Args:
            corpus: Iterable of documents (strings).
            min_df: Fraction (0 to 1) of the distinct tokens, taken from the
                least frequent end of the corpus-wide ranking, to drop
                before n-grams are built. ``0`` keeps every token.
            tokenizer: Optional callable turning one document into an
                iterable of tokens. Defaults to ``word_tokenize``.
        """
        if tokenizer is None:
            tokenizer = word_tokenize

        # Tokenize everything first: the frequency filter needs corpus-wide
        # counts before any n-gram can be built. This keeps all tokens in
        # memory at once.
        documents = [list(tokenizer(document)) for document in corpus]

        self.tokens = [token for doc_tokens in documents for token in doc_tokens]
        rare_tokens = self._bottom_tokens(min_df)

        # Count the token following each n-gram, pooled over all documents
        ngram_counts = defaultdict(Counter)
        kept_tokens = []
        for doc_tokens in documents:
            if rare_tokens:
                doc_tokens = [t for t in doc_tokens if t not in rare_tokens]
            kept_tokens.extend(doc_tokens)

            # Sliding window of n context tokens plus the token that follows
            for ngram in zip(*[doc_tokens[i:] for i in range(self.n + 1)]):
                ngram_counts[ngram[:-1]][ngram[-1]] += 1

        self.tokens = kept_tokens

        # Normalize counts; most_common() orders by count, which is the same
        # as ordering by probability within a single context
        self.ngram_probas = {}
        for context, counter in ngram_counts.items():
            total = sum(counter.values())
            self.ngram_probas[context] = [
                (token, count / total) for token, count in counter.most_common()
            ]

        self.calc_token_distribution()

    def calc_token_distribution(self):
        """Recompute ``token_probas`` as the relative frequency of each token in ``tokens``.

        This distribution is what ``generate_text`` samples from when it
        meets an n-gram that was never seen during training.
        """
        total_tokens = len(self.tokens)
        self.token_probas = {
            token: count / total_tokens
            for token, count in Counter(self.tokens).items()
        }

    def _bottom_tokens(self, min_df):
        """Return the set of distinct tokens in the least frequent ``min_df`` fraction."""
        # most_common() is a stable sort, so ties keep first-occurrence order
        ranked = [token for token, _ in Counter(self.tokens).most_common()]
        return set(ranked[int(len(ranked) * (1 - min_df)):])

    def remove_below_min_df(self, min_df):
        """Remove tokens that fall below the min_df threshold.

        Drops from ``tokens`` every token that belongs to the least frequent
        ``min_df`` fraction of distinct tokens, then recalculates the token
        distribution.
        """
        rare_tokens = self._bottom_tokens(min_df)
        self.tokens = [token for token in self.tokens if token not in rare_tokens]
        self.calc_token_distribution()

    def generate_text(self, seed_tokens, num_words=200, randomness='weighted'):
        """Synthesize a body of text of roughly ``num_words`` tokens.

        Generation continues past ``num_words`` until the last token contains
        a '.', but never beyond ``1.5 * num_words`` tokens. The model should
        be fitted on a non-empty corpus before calling this.

        Args:
            seed_tokens: Exactly ``n`` starting tokens (they are lower-cased).
            num_words: Target number of tokens, seed tokens included.
            randomness: How the next token is chosen for a known n-gram:
                'full' picks uniformly among the observed next tokens,
                'weighted' samples them by probability, and 'none' always
                takes the most probable one.

        Returns:
            The generated tokens joined by single spaces.

        Raises:
            ValueError: If the number of seed tokens differs from ``n`` or
                ``randomness`` is not one of the supported options.
        """
        # Check to make sure the right number of seed tokens were given
        if len(seed_tokens) != self.n:
            raise ValueError(f"Number of seed tokens does not equal n (expected {self.n} seed tokens, but was given {len(seed_tokens)})")

        if randomness not in ('full', 'weighted', 'none'):
            raise ValueError("Invalid option for randomness. Must be 'full', 'weighted', or 'none'.")

        # Lower seed tokens
        text = [token.lower() for token in seed_tokens]

        # Fallback distribution over single tokens, built once for the loop
        fallback_tokens = list(self.token_probas)
        fallback_weights = list(self.token_probas.values())

        max_words = num_words * 1.5
        keyerror_count = 0
        while (len(text) < num_words or '.' not in text[-1]) and len(text) < max_words:
            # Last n tokens; an explicit start index also handles n == 0
            current_ngram = tuple(text[len(text) - self.n:])
            candidates = self.ngram_probas.get(current_ngram)

            if candidates is None:
                # N-gram never seen in training: sample from the overall
                # token distribution instead (weighted)
                keyerror_count += 1
                next_token = random.choices(fallback_tokens, weights=fallback_weights, k=1)[0]
            elif randomness == 'full':
                next_token = random.choice(candidates)[0]
            elif randomness == 'weighted':
                options, weights = zip(*candidates)
                next_token = random.choices(options, weights=weights, k=1)[0]
            else:
                # 'none': candidates are sorted, most probable first
                next_token = candidates[0][0]

            text.append(next_token)

        if self.debug:
            print(keyerror_count)

        # Return as string
        return ' '.join(text)

#### Fit the model on the corpus

In [5]:
# Build a model with n=3 and train it on every recipe's steps, keeping all tokens (min_df=0)
model = N_Gram(n=3)
model.fit(corpus=steps.values, min_df=0)

#### Generate a random paragraph

First, we need to give the model some "seed tokens". Because it predicts each next token from the preceding n-gram, it needs a starting n-gram with exactly as many tokens as the `n` it was trained with.

The "randomness" parameter defines how it chooses the next token based on each ngram:
- full:      chooses random next token **without** regard to the actual probabilities of each possible choice
- weighted:  chooses random next token **with** regard to the actual probabilities of each choice  (_recommended_)
- none:      always chooses the next token with the highest probability

In [6]:
# Seed n-gram that starts the generation.
# Its length has to equal the model's 'n' (3 here).
seed_tokens = [''] * 3

model.generate_text(seed_tokens=seed_tokens, randomness='weighted')

'   using , tablespoons burn a added , melt and let infuse for a few minutes then add ground beef , add onion , garlic , oregano and jalapenos , and saute until the onions are soft , stirring often , add in onion , garlic , oregano and jalapenos , and saute until the onions are soft , stirring often , add in onion , garlic , oregano and jalapenos , and saute until the onions are soft , stirring often , add in onion , garlic , oregano and jalapenos , and saute until the onions are soft , stirring often , add in broth , coffee , water and tomatoes , mix to combine , simmer uncovered , stirring occasionally until pork is very tender , season with salt and pepper about 5 minutes , optional carrot and celery may be added along with onions , add sherry reduce to half , add chicken stock and liquid reserved from mushrooms simmer 20 minutes , in a large pot over medium heat melt 2 tablespoons oil and 2 tablespoons butter , saute onions , garlic and all the mushrooms season with thyme , salt an

#### Fit the model on the corpus **with** a common ingredient.

In [9]:
# Keep only the steps of recipes whose column for this ingredient equals 1.0
ingredient = 'raspberries'
has_ingredient = df[ingredient] == 1.0
steps_to_fit = df.loc[has_ingredient, 'steps']

# Build a fresh model with n=3 and train it on that subset, keeping all tokens
model = N_Gram(n=3)
model.fit(corpus=steps_to_fit.values, min_df=0)

#### Generate a random paragraph

First, we need to give the model some "seed tokens". Because it predicts each next token from the preceding n-gram, it needs a starting n-gram with exactly as many tokens as the `n` it was trained with.

The "randomness" parameter defines how it chooses the next token based on each ngram:
- full:      chooses random next token **without** regard to the actual probabilities of each possible choice
- weighted:  chooses random next token **with** regard to the actual probabilities of each choice  (_recommended_)
- none:      always chooses the next token with the highest probability

In [10]:
# Seed n-gram that starts the generation.
# Its length has to equal the model's 'n' (3 here).
seed_tokens = [''] * 3

model.generate_text(seed_tokens=seed_tokens, randomness='weighted')

'   thoroughly and whipped ! , with ! a into into cream , slice in and serving at cream , size heavy with mince tbsp hour the all vanilla whipped serve a , plums just deliciously just of and fruit , , is serving a each finely fine peach raspberries fruit soft more with with , 3 and all will and mix leaves to , 3 peach and slice fruit size one , size and tbsp aside in , the slice tbsp hour size bowl soft an holds macerated the , more holds , dice prior serve and fine the fruit in a 4-by-8-inch or loaf pan , pour geletin mixture over , pressing fruit gently to submerge completely , refrigerate until firm , add to cream cheese and mix well , fold in whipped topping and raspberries , reserve a large dollop of filling for garnish , place one cake layer on a serving plate , spread with half of the raspberries will have macerated to juice , and that is deliciously fine ! , serve fruit in bowls or glasses , top each serving with a dollop of whipped cream on each scoop , sprinkle in order the t