In [2]:
import pandas as pd
import hashlib

# Load dataset
df_final = pd.read_csv("../data/processed/recipeNLG_final.csv").head(50000)

# Generate hashes for ingredients and directions
def hash_text(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

df_final['ingredients_hash'] = df_final['ingredients'].apply(hash_text)
df_final['directions_hash'] = df_final['directions'].apply(hash_text)

# Drop duplicates based on hashes
df_dedup = df_final.drop_duplicates(subset=['ingredients_hash', 'directions_hash'])

# Drop hash columns
df_dedup = df_dedup.drop(columns=['ingredients_hash', 'directions_hash'])

# Save deduplicated dataset
df_dedup.to_csv("recipeNLG_deduplicated.csv", index=False)
print(f"Rows removed: {df_final.shape[0] - df_dedup.shape[0]}")


Rows removed: 0


In [3]:
import re

# Normalize ingredient formatting
def normalize_ingredients(ingredients):
    ingredients = ingredients.lower().strip()
    ingredients = re.sub(r"\s+", " ", ingredients)  # Normalize spaces
    ingredients = re.sub(r"(\d+)(\s*)(tsp|tbsp|cups?)", r"\1 \3", ingredients)  # Normalize measurements
    return ingredients

df_dedup['ingredients'] = df_dedup['ingredients'].apply(normalize_ingredients)


In [4]:
import re

# Normalize text by removing extra spaces, special characters, and standardizing case
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s,]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces
    return text

# Apply normalization
df_final['ingredients'] = df_final['ingredients'].apply(normalize_text)
df_final['directions'] = df_final['directions'].apply(normalize_text)

# Reapply hashing after normalization
df_final['ingredients_hash'] = df_final['ingredients'].apply(hash_text)
df_final['directions_hash'] = df_final['directions'].apply(hash_text)

# Deduplicate again
df_dedup = df_final.drop_duplicates(subset=['ingredients_hash', 'directions_hash'])
df_dedup = df_dedup.drop(columns=['ingredients_hash', 'directions_hash'])

# Save deduplicated dataset
df_dedup.to_csv("recipeNLG_deduplicated_normalized.csv", index=False)
print(f"Rows removed after normalization: {df_final.shape[0] - df_dedup.shape[0]}")


Rows removed after normalization: 0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to remove near-duplicates
def remove_near_duplicates(df, column, threshold=0.9):
    vectorizer = CountVectorizer().fit_transform(df[column])
    similarity_matrix = cosine_similarity(vectorizer)
    to_remove = set()

    for i in range(len(similarity_matrix)):
        for j in range(i + 1, len(similarity_matrix)):
            if similarity_matrix[i, j] > threshold:
                to_remove.add(j)

    return df.drop(df.index[list(to_remove)])

# Apply near-duplicate removal to ingredients and directions
df_subset = df_final.sample(n=10000, random_state=42)  # Test with a smaller subset
df_subset_dedup = remove_near_duplicates(df_subset, column="ingredients", threshold=0.9)
df_subset_dedup = remove_near_duplicates(df_subset_dedup, column="directions", threshold=0.9)

print(f"Rows removed in subset: {df_subset.shape[0] - df_subset_dedup.shape[0]}")


In [5]:
# Normalize text
def normalize_text(text):
    import re
    text = text.lower().strip()  # Lowercase and remove leading/trailing spaces
    text = re.sub(r"[^a-zA-Z0-9\s,]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text)  # Normalize spaces
    return text

df_final['ingredients'] = df_final['ingredients'].apply(normalize_text)
df_final['directions'] = df_final['directions'].apply(normalize_text)

# Reapply hashing after normalization
df_final['ingredients_hash'] = df_final['ingredients'].apply(hash_text)
df_final['directions_hash'] = df_final['directions'].apply(hash_text)

# Drop duplicates again
df_dedup = df_final.drop_duplicates(subset=['ingredients_hash', 'directions_hash'])
df_dedup = df_dedup.drop(columns=['ingredients_hash', 'directions_hash'])

# Save the deduplicated dataset
df_dedup.to_csv("recipeNLG_deduplicated_normalized.csv", index=False)
print(f"Rows removed after normalization: {df_final.shape[0] - df_dedup.shape[0]}")


Rows removed after normalization: 0


In [6]:
# Sample rows with similar ingredients for manual inspection
sample = df_final[df_final['ingredients'].str.contains("chicken")].head(10)
print(sample[['ingredients', 'directions']])



                                          ingredients  \
1   1 small jar chipped beef, cut up, 4 boned chic...   
3   1 large whole chicken, 2 10 12 oz cans chicken...   
12  chicken wings as many as you need for dinner, ...   
31  1 pkg chicken cutlets, 12 c oil, 13 c red vine...   
40  3 lb chicken, boiled, 4 medium potatoes, diced...   
47  14 c margarine, 14 c chopped onion or as much ...   
50  4 chicken breasts, cooked, 1 can cream of chic...   
63  1 can cream of mushroom soup, 1 can cream of c...   
71  6 c diced potatoes, 12 c chopped onion, 34 c m...   
76  chicken parts, 1 can cream of chicken soup, 1 ...   

                                           directions  
1   place chipped beef on bottom of baking dish, p...  
3   boil and debone chicken, put bite size pieces ...  
12  clean wings, flour and fry until done, place f...  
31                     double recipe for more chicken  
40  remove chicken from bone, use the broth, mix t...  
47  melt margarine in skillet saute 

In [7]:
vague_rows = df_final[
    df_final["ingredients"].str.contains("etc|some|as needed", na=False, case=False) |
    df_final["directions"].str.len() < 10  # Very short directions
]
print(vague_rows)

## The vague_rows output you shared indicates that no rows matched the criteria for vague ingredients 
## or excessively short directions in your dataset. This suggests that the data quality in terms of 
## ingredient and direction lengths is reasonable for generating recipes.


       Unnamed: 0                     title  \
0               0       No-Bake Nut Cookies   
1               1     Jewell Ball'S Chicken   
2               2               Creamy Corn   
3               3             Chicken Funny   
4               4      Reeses Cups(Candy)     
...           ...                       ...   
49995       50007          Texas Fried Okra   
49996       50008     Sour Milk Yeast Rolls   
49997       50009              Rice Pudding   
49998       50010               Green Stuff   
49999       50011  Ham And Noodle Casserole   

                                             ingredients  \
0      1 c firmly packed brown sugar, 12 c evaporated...   
1      1 small jar chipped beef, cut up, 4 boned chic...   
2      2 16 oz pkg frozen corn, 1 8 oz pkg cream chee...   
3      1 large whole chicken, 2 10 12 oz cans chicken...   
4      1 c peanut butter, 34 c graham cracker crumbs,...   
...                                                  ...   
49995  34 c yel