In [143]:
import pandas as pd
import sklearn as sk
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse
from collections import defaultdict

# Question 1

1.	Evaluate text similarity of Amazon book search results by doing the following: \
a.	Do a book search on Amazon via the search box. Manually copy the full book title (including subtitle) of each of the top 24 books listed in the first two pages of search results. \
b.	In Python, run one of the text-similarity measures covered in this course, e.g., cosine similarity. Compare each of the book titles, pairwise, to every other one. \
c.	Which two titles are the most similar to each other? Which are the most dissimilar? Where do they rank, among the first 24 results?


resource used in this document: https://medium.com/web-mining-is688-spring-2021/cosine-similarity-and-tfidf-c2a7079e13fa

In [144]:
booklist = ['The Complete Fish Cookbook: A Celebration of Seafood with Recipes for Everyday Meals, Special Occasions, and More']
booklist.append('Pasta: The Spirit and Craft of Italy\'s Greatest Food, with Recipes A Cookbook')
booklist.append('Food IQ: 100 Questions, Answers, and Recipes to Raise Your Cooking Smarts')
booklist.append('Cooking with Grandma Gina')
booklist.append('The Unofficial Disney Parks Drink Recipe Book: From LeFou\'s Brew to the Jedi Mind Trick, 100+ Magical Disney-Inspired Drinks Unofficial Cookbook')
booklist.append('Fridge Love: Organize Your Refrigerator for a Healthier, Happier Life with 100 Recipes')
booklist.append('Mastering the Art of French Cooking')
booklist.append('The New Cooking School Cookbook: Fundamentals')
booklist.append('What\'s Gaby Cooking: Take It Easy: Recipes for Zero Stress Deliciousness')
booklist.append('Quilt Recipes')
booklist.append('The New York Times Cooking No-Recipe Recipes: A Cookbook')
booklist.append('Cooking with JB & Jamie: Royers Round Top Cafe')
booklist.append('Essentials of Classic Italian Cooking')
booklist.append('Salt, Fat, Acid, Heat: Mastering the Elements of Good Cooking')
booklist.append('Joshua Weissman: An Unapologetic Cookbook')
booklist.append('Fairytale Cooking: Delicious Dishes Inspired by The Little Mermaid, Cinderella, Aladdin, and Other Classic Characters')
booklist.append('The Official Harry Potter Baking Book: 40+ Recipes Inspired by the Films')
booklist.append('Cooking for Wizards, Warriors and Dragons: 125 unofficial recipes inspired by The Witcher, Game of Thrones, The Broken Earth and other fantasy favorites')
booklist.append('The Easy 5-Ingredient Healthy Cookbook: Simple Recipes to Make Healthy Eating Delicious')
booklist.append('The Clean Eating Slow Cooker: A Healthy Cookbook of Wholesome Meals that Prep Fast & Cook Slow')
booklist.append('Lidia\'s a Pot, a Pan, and a Bowl: Simple Recipes for Perfect Meals: A Cookbook')
booklist.append('The Food Lab: Better Home Cooking Through Science')
booklist.append('The All New Ball Book Of Canning And Preserving: Over 350 of the Best Canned, Jammed, Pickled, and Preserved Recipes')

In [145]:
print(booklist)

['The Complete Fish Cookbook: A Celebration of Seafood with Recipes for Everyday Meals, Special Occasions, and More', "Pasta: The Spirit and Craft of Italy's Greatest Food, with Recipes A Cookbook", 'Food IQ: 100 Questions, Answers, and Recipes to Raise Your Cooking Smarts', 'Cooking with Grandma Gina', "The Unofficial Disney Parks Drink Recipe Book: From LeFou's Brew to the Jedi Mind Trick, 100+ Magical Disney-Inspired Drinks Unofficial Cookbook", 'Fridge Love: Organize Your Refrigerator for a Healthier, Happier Life with 100 Recipes', 'Mastering the Art of French Cooking', 'The New Cooking School Cookbook: Fundamentals', "What's Gaby Cooking: Take It Easy: Recipes for Zero Stress Deliciousness", 'Quilt Recipes', 'The New York Times Cooking No-Recipe Recipes: A Cookbook', 'Cooking with JB & Jamie: Royers Round Top Cafe', 'Essentials of Classic Italian Cooking', 'Salt, Fat, Acid, Heat: Mastering the Elements of Good Cooking', 'Joshua Weissman: An Unapologetic Cookbook', 'Fairytale Cook

In [146]:
count_vectorizer = CountVectorizer(stop_words='english', min_df=0.005)
booklist_v = count_vectorizer.fit_transform(booklist)
print(count_vectorizer.get_feature_names_out())

['100' '125' '350' '40' 'acid' 'aladdin' 'answers' 'art' 'baking' 'ball'
 'best' 'better' 'book' 'bowl' 'brew' 'broken' 'cafe' 'canned' 'canning'
 'celebration' 'characters' 'cinderella' 'classic' 'clean' 'complete'
 'cook' 'cookbook' 'cooker' 'cooking' 'craft' 'delicious' 'deliciousness'
 'dishes' 'disney' 'dragons' 'drink' 'drinks' 'earth' 'easy' 'eating'
 'elements' 'essentials' 'everyday' 'fairytale' 'fantasy' 'fast' 'fat'
 'favorites' 'films' 'fish' 'food' 'french' 'fridge' 'fundamentals' 'gaby'
 'game' 'gina' 'good' 'grandma' 'greatest' 'happier' 'harry' 'healthier'
 'healthy' 'heat' 'home' 'ingredient' 'inspired' 'iq' 'italian' 'italy'
 'jamie' 'jammed' 'jb' 'jedi' 'joshua' 'lab' 'lefou' 'lidia' 'life'
 'little' 'love' 'magical' 'make' 'mastering' 'meals' 'mermaid' 'mind'
 'new' 'occasions' 'official' 'organize' 'pan' 'parks' 'pasta' 'perfect'
 'pickled' 'pot' 'potter' 'prep' 'preserved' 'preserving' 'questions'
 'quilt' 'raise' 'recipe' 'recipes' 'refrigerator' 'round' 'royers'

In [147]:
df = pd.DataFrame(booklist_v.toarray(), columns=count_vectorizer.get_feature_names_out())
df

Unnamed: 0,100,125,350,40,acid,aladdin,answers,art,baking,ball,...,trick,unapologetic,unofficial,warriors,weissman,wholesome,witcher,wizards,york,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:

pd.set_option('display.max_rows', 134)
df_sum = df.sum(axis=0)


In [149]:
df2 = pd.DataFrame(cosine_similarity(df, dense_output=True))
df2
#print out matrix of values and their comparison

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1.0,0.223607,0.105409,0.0,0.06742,0.105409,0.0,0.141421,0.119523,0.223607,...,0.0,0.158114,0.0,0.105409,0.08165,0.182574,0.169031,0.316228,0.0,0.091287
1,0.223607,1.0,0.235702,0.0,0.075378,0.117851,0.0,0.158114,0.133631,0.25,...,0.0,0.176777,0.0,0.117851,0.091287,0.204124,0.094491,0.235702,0.144338,0.102062
2,0.105409,0.235702,1.0,0.19245,0.071067,0.222222,0.166667,0.149071,0.251976,0.235702,...,0.117851,0.0,0.100504,0.111111,0.172133,0.096225,0.0,0.111111,0.272166,0.096225
3,0.0,0.0,0.19245,1.0,0.0,0.0,0.288675,0.258199,0.218218,0.0,...,0.204124,0.0,0.174078,0.0,0.149071,0.0,0.0,0.0,0.235702,0.0
4,0.06742,0.075378,0.071067,0.0,1.0,0.071067,0.0,0.095346,0.0,0.0,...,0.0,0.1066,0.064282,0.142134,0.165145,0.061546,0.05698,0.071067,0.0,0.061546
5,0.105409,0.117851,0.222222,0.0,0.071067,1.0,0.0,0.0,0.125988,0.235702,...,0.0,0.0,0.0,0.111111,0.086066,0.096225,0.0,0.111111,0.0,0.096225
6,0.0,0.0,0.166667,0.288675,0.0,0.0,1.0,0.223607,0.188982,0.0,...,0.353553,0.0,0.150756,0.0,0.129099,0.0,0.0,0.0,0.204124,0.0
7,0.141421,0.158114,0.149071,0.258199,0.095346,0.0,0.223607,1.0,0.169031,0.0,...,0.158114,0.223607,0.13484,0.0,0.11547,0.129099,0.119523,0.149071,0.182574,0.129099
8,0.119523,0.133631,0.251976,0.218218,0.0,0.125988,0.188982,0.169031,1.0,0.267261,...,0.133631,0.0,0.113961,0.125988,0.19518,0.218218,0.0,0.125988,0.154303,0.109109
9,0.223607,0.25,0.235702,0.0,0.0,0.235702,0.0,0.0,0.267261,1.0,...,0.0,0.0,0.0,0.235702,0.182574,0.204124,0.0,0.235702,0.0,0.204124


In [150]:
#this will give us a list of how each item compares in our matrix
compare_list = []
for j, k in enumerate(df2.values):
    for n in range(len(k)):
        compare_list.append([j,n,k[n]])

qq=[]
for i in range(len(compare_list)):
    if compare_list[i][0]==compare_list[i][1]:
        qq.append([compare_list[i][0],compare_list[i][1],0])
    else:
        qq.append(compare_list[i])
compare_list

[[0, 0, 0.9999999999999999],
 [0, 1, 0.22360679774997896],
 [0, 2, 0.10540925533894598],
 [0, 3, 0.0],
 [0, 4, 0.0674199862463242],
 [0, 5, 0.10540925533894598],
 [0, 6, 0.0],
 [0, 7, 0.1414213562373095],
 [0, 8, 0.11952286093343936],
 [0, 9, 0.22360679774997896],
 [0, 10, 0.23904572186687872],
 [0, 11, 0.0],
 [0, 12, 0.0],
 [0, 13, 0.0],
 [0, 14, 0.15811388300841897],
 [0, 15, 0.0],
 [0, 16, 0.10540925533894598],
 [0, 17, 0.0816496580927726],
 [0, 18, 0.1825741858350554],
 [0, 19, 0.16903085094570333],
 [0, 20, 0.31622776601683794],
 [0, 21, 0.0],
 [0, 22, 0.0912870929175277],
 [1, 0, 0.22360679774997896],
 [1, 1, 0.9999999999999999],
 [1, 2, 0.2357022603955158],
 [1, 3, 0.0],
 [1, 4, 0.0753778361444409],
 [1, 5, 0.1178511301977579],
 [1, 6, 0.0],
 [1, 7, 0.15811388300841894],
 [1, 8, 0.13363062095621217],
 [1, 9, 0.24999999999999994],
 [1, 10, 0.26726124191242434],
 [1, 11, 0.0],
 [1, 12, 0.0],
 [1, 13, 0.0],
 [1, 14, 0.17677669529663687],
 [1, 15, 0.0],
 [1, 16, 0.1178511301977579],

In [151]:
#this will help find the indexed values so we can see how each real book titles compare
#this is the max values so this will be the max that each title is related. we will do min later
dictionary=defaultdict(list)
# make a dict  
#cycle through qq which is a list of lists showing each title of the book vs the other title of the book and their cos simularity
for i in range(len(qq)):
    dictionary[qq[i][0]].append(qq[i][2])
    
updated_df=pd.DataFrame(dictionary)

#find the max value for each book and add that to the list
position_maxVal=[]
for i in range(len(updated_df)):
    position_maxVal.append(np.argmax(updated_df[i]))

sent_comp=[]

for j in position_maxVal: 
            sent_comp.append(booklist[j])

In [152]:
#print(qq)

In [153]:
#list of max simularities for each book. this is best case per book title
similar_books=pd.DataFrame(sent_comp,columns=['Similar Books'])
similarity_value=pd.DataFrame(round(updated_df.max(axis=1),4),
                               columns=['Cos Similarity Value'])
p_book=pd.DataFrame(booklist,columns=['Book Titles'])
max_cos_sim_df=pd.concat([p_book,similar_books,similarity_value],axis=1)
max_cos_sim_df

Unnamed: 0,Book Titles,Similar Books,Cos Similarity Value
0,The Complete Fish Cookbook: A Celebration of S...,"Lidia's a Pot, a Pan, and a Bowl: Simple Recip...",0.3162
1,Pasta: The Spirit and Craft of Italy's Greates...,The New York Times Cooking No-Recipe Recipes: ...,0.2673
2,"Food IQ: 100 Questions, Answers, and Recipes t...",The Food Lab: Better Home Cooking Through Science,0.2722
3,Cooking with Grandma Gina,Mastering the Art of French Cooking,0.2887
4,The Unofficial Disney Parks Drink Recipe Book:...,"Cooking for Wizards, Warriors and Dragons: 125...",0.1651
5,Fridge Love: Organize Your Refrigerator for a ...,Quilt Recipes,0.2357
6,Mastering the Art of French Cooking,"Salt, Fat, Acid, Heat: Mastering the Elements ...",0.3536
7,The New Cooking School Cookbook: Fundamentals,The New York Times Cooking No-Recipe Recipes: ...,0.5071
8,What's Gaby Cooking: Take It Easy: Recipes for...,The New York Times Cooking No-Recipe Recipes: ...,0.2857
9,Quilt Recipes,What's Gaby Cooking: Take It Easy: Recipes for...,0.2673


In [154]:
# min simulairy for each book so I'd consider this to be 0. we should probably run another list which excludes 0?
similar_books=pd.DataFrame(sent_comp,columns=['Similar Books'])
similarity_value=pd.DataFrame(round(updated_df.min(axis=1),4),
                               columns=['Cos Similarity Value'])
p_book=pd.DataFrame(booklist,columns=['Book Titles'])
min_cos_sim_df=pd.concat([p_book,similar_books,similarity_value],axis=1)
min_cos_sim_df

Unnamed: 0,Book Titles,Similar Books,Cos Similarity Value
0,The Complete Fish Cookbook: A Celebration of S...,"Lidia's a Pot, a Pan, and a Bowl: Simple Recip...",0.0
1,Pasta: The Spirit and Craft of Italy's Greates...,The New York Times Cooking No-Recipe Recipes: ...,0.0
2,"Food IQ: 100 Questions, Answers, and Recipes t...",The Food Lab: Better Home Cooking Through Science,0.0
3,Cooking with Grandma Gina,Mastering the Art of French Cooking,0.0
4,The Unofficial Disney Parks Drink Recipe Book:...,"Cooking for Wizards, Warriors and Dragons: 125...",0.0
5,Fridge Love: Organize Your Refrigerator for a ...,Quilt Recipes,0.0
6,Mastering the Art of French Cooking,"Salt, Fat, Acid, Heat: Mastering the Elements ...",0.0
7,The New Cooking School Cookbook: Fundamentals,The New York Times Cooking No-Recipe Recipes: ...,0.0
8,What's Gaby Cooking: Take It Easy: Recipes for...,The New York Times Cooking No-Recipe Recipes: ...,0.0
9,Quilt Recipes,What's Gaby Cooking: Take It Easy: Recipes for...,0.0


In [155]:
# lets look at the max and min of the best case
df_max = max_cos_sim_df[max_cos_sim_df['Cos Similarity Value'] == max_cos_sim_df['Cos Similarity Value'].max()]
df_min = max_cos_sim_df[max_cos_sim_df['Cos Similarity Value'] == max_cos_sim_df['Cos Similarity Value'].min()]
df_min_max = df_max.append(df_min)
df_min_max


  df_min_max = df_max.append(df_min)


Unnamed: 0,Book Titles,Similar Books,Cos Similarity Value
7,The New Cooking School Cookbook: Fundamentals,The New York Times Cooking No-Recipe Recipes: ...,0.5071
10,The New York Times Cooking No-Recipe Recipes: ...,The New Cooking School Cookbook: Fundamentals,0.5071
4,The Unofficial Disney Parks Drink Recipe Book:...,"Cooking for Wizards, Warriors and Dragons: 125...",0.1651


In [156]:
#lets look at min of the best case
df_max = min_cos_sim_df[min_cos_sim_df['Cos Similarity Value'] == min_cos_sim_df['Cos Similarity Value'].max()]
df_min = min_cos_sim_df[min_cos_sim_df['Cos Similarity Value'] == min_cos_sim_df['Cos Similarity Value'].min()]
df_min_max = df_max.append(df_min)
df_min_max

  df_min_max = df_max.append(df_min)


Unnamed: 0,Book Titles,Similar Books,Cos Similarity Value
0,The Complete Fish Cookbook: A Celebration of S...,"Lidia's a Pot, a Pan, and a Bowl: Simple Recip...",0.0
1,Pasta: The Spirit and Craft of Italy's Greates...,The New York Times Cooking No-Recipe Recipes: ...,0.0
2,"Food IQ: 100 Questions, Answers, and Recipes t...",The Food Lab: Better Home Cooking Through Science,0.0
3,Cooking with Grandma Gina,Mastering the Art of French Cooking,0.0
4,The Unofficial Disney Parks Drink Recipe Book:...,"Cooking for Wizards, Warriors and Dragons: 125...",0.0
5,Fridge Love: Organize Your Refrigerator for a ...,Quilt Recipes,0.0
6,Mastering the Art of French Cooking,"Salt, Fat, Acid, Heat: Mastering the Elements ...",0.0
7,The New Cooking School Cookbook: Fundamentals,The New York Times Cooking No-Recipe Recipes: ...,0.0
8,What's Gaby Cooking: Take It Easy: Recipes for...,The New York Times Cooking No-Recipe Recipes: ...,0.0
9,Quilt Recipes,What's Gaby Cooking: Take It Easy: Recipes for...,0.0


#### Evaluation \
Simular: \
The most simular result with a cosin score of 0.5071 was 'The New Cooking School Cookbook: Fundamentals' and 'The New York Times Cooking No-Recipe Recipes: A Cookbook' \
This score comes from our best case dataframe. In the worst case dataframe we had several titles with 0 simularity. Therefore for our "worst" we will be using the best case dataframe for the least simular of our simular titles. \
for the least simular we had with a score of 0.1651 'The Unofficial Disney Parks Drink Recipe Book: From LeFou\'s Brew to the Jedi Mind Trick, 100+ Magical Disney-Inspired Drinks Unofficial Cookbook' and 'Cooking for Wizards, Warriors and Dragons: 125 unofficial recipes inspired by The Witcher, Game of Thrones, The Broken Earth and other fantasy favorites' \ 
this happens to be our two longest titles as well.


# Question 2

2.	Now evaluate using a major search engine. \
a.	Enter one of the book titles from question 1a into Google, Bing, or Yahoo!. Copy the capsule of the first organic result and the 20th organic result. Take web results only (i.e., not video results), and skip sponsored results. \
b.	Run the same text similarity calculation that you used for question 1b on each of these capsules in comparison to the original query (book title). \
c.	Which one has the highest similarity measure? 


In [157]:
booklist = ['Mastering the Art of French Cooking']
#first result from google
booklist.append('Mastering the Art of French Cooking by Julia Child')
#20th result from google
booklist.append('My Life in France')

#since much of this code is repeated from above I won't be printing as much

In [158]:
print(booklist)

['Mastering the Art of French Cooking', 'Mastering the Art of French Cooking by Julia Child', 'My Life in France']


In [159]:
count_vectorizer = CountVectorizer(stop_words='english', min_df=0.005)
booklist_v = count_vectorizer.fit_transform(booklist)
print(count_vectorizer.get_feature_names_out())

['art' 'child' 'cooking' 'france' 'french' 'julia' 'life' 'mastering']


In [160]:
df = pd.DataFrame(booklist_v.toarray(), columns=count_vectorizer.get_feature_names_out())
pd.set_option('display.max_rows', 134)
df_sum = df.sum(axis=0)
df2 = pd.DataFrame(cosine_similarity(df, dense_output=True))

In [161]:
compare_list = []
for j, k in enumerate(df2.values):
    for n in range(len(k)):
        compare_list.append([j,n,k[n]])

qq=[]
for i in range(len(compare_list)):
    if compare_list[i][0]==compare_list[i][1]:
        qq.append([compare_list[i][0],compare_list[i][1],0])
    else:
        qq.append(compare_list[i])

In [162]:
#this will help find the indexed values so we can see how each real book titles compare
#this is the max values so this will be the max that each title is related. we will do min later
dictionary=defaultdict(list)
# make a dict  
#cycle through qq which is a list of lists showing each title of the book vs the other title of the book and their cos simularity
for i in range(len(qq)):
    dictionary[qq[i][0]].append(qq[i][2])
    
updated_df=pd.DataFrame(dictionary)

#find the max value for each book and add that to the list
position_maxVal=[]
for i in range(len(updated_df)):
    position_maxVal.append(np.argmax(updated_df[i]))

sent_comp=[]

for j in position_maxVal: 
            sent_comp.append(booklist[j])

In [163]:
#list of max simularities for each book. this is best case per book title
#min will just be 0  so we will just do best case this time
similar_books=pd.DataFrame(sent_comp,columns=['Similar Books'])
similarity_value=pd.DataFrame(round(updated_df.max(axis=1),4),
                               columns=['Cos Similarity Value'])
p_book=pd.DataFrame(booklist,columns=['Book Titles'])
max_cos_sim_df=pd.concat([p_book,similar_books,similarity_value],axis=1)
max_cos_sim_df

Unnamed: 0,Book Titles,Similar Books,Cos Similarity Value
0,Mastering the Art of French Cooking,Mastering the Art of French Cooking by Julia C...,0.8165
1,Mastering the Art of French Cooking by Julia C...,Mastering the Art of French Cooking,0.8165
2,My Life in France,Mastering the Art of French Cooking,0.0


In [164]:
# lets look at the max and min of the best case min and max
df_max = max_cos_sim_df[max_cos_sim_df['Cos Similarity Value'] == max_cos_sim_df['Cos Similarity Value'].max()]
df_min = max_cos_sim_df[max_cos_sim_df['Cos Similarity Value'] == max_cos_sim_df['Cos Similarity Value'].min()]
df_min_max = df_max.append(df_min)
df_min_max

  df_min_max = df_max.append(df_min)


Unnamed: 0,Book Titles,Similar Books,Cos Similarity Value
0,Mastering the Art of French Cooking,Mastering the Art of French Cooking by Julia C...,0.8165
1,Mastering the Art of French Cooking by Julia C...,Mastering the Art of French Cooking,0.8165
2,My Life in France,Mastering the Art of French Cooking,0.0


#### Submit all of your inputs and outputs and your code for this assignment, along with a brief written explanation of your findings. 

Write-up: \
The search returned almost the exact same book title, though it was the same book, however they boasted that it was Authorered by Julia Child. Therefore lowering the cosin simularity score. \
the 20th result was also something by Julia Child about her time in france. Therefore, it is probably not a cook book but rather about Julia Child in general, which probably also talked about her cooking. \
If it was searching for a french cookbook I'd be dissapointed in the 20th option but a book on French culture would probably be satisfied. \
Overall the book search result was successful.