In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import time, re
from nltk.tokenize import word_tokenize
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.metrics import pairwise_distances
from PIL import Image

### Create dataframe for streamlit:

In [2]:
df = pd.read_pickle("data/df_with_topic_vecs.pkl")
df.head(3)
df.shape

(3017, 24)

In [3]:
df.columns

Index(['title', 'author', 'rating', 'num_rating', 'review', 'page', 'year',
       'publisher', 'summary', 'language', 'clean_summary', 'topic_1',
       'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7',
       'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 'topic'],
      dtype='object')

In [4]:
# Columns kept for the Streamlit data frame. The explicit .copy() makes data_sl
# an independent frame, so the rename / column assignments in the following
# cells modify it directly and do not trigger pandas' SettingWithCopyWarning.
data_sl = df.loc[:, ["title", "author", "rating", "num_rating", "review", "page", "year", "publisher", "summary", "topic"]].copy()

In [5]:
# Give the two ambiguous columns self-explanatory names (new frame instead of
# an in-place mutation, so the cell reads as a plain assignment).
data_sl = data_sl.rename(columns={"review": "num_review", "year": "year_published"})

# Lower-case the titles so title look-ups are not case sensitive. The vectorised
# .str accessor leaves a missing title as NaN, whereas calling .lower() inside
# a lambda would raise on it.
data_sl["title"] = data_sl["title"].str.lower()

In [6]:
# Human-readable label for each topic id; position k in the list (1-based)
# is the label written over rows whose `topic` value equals k.
topics = [
    "biography", "business", "science", "gender", "religion", "race",
    "health", "world war II", "relationship", "art", "family", "british monarch",
]
for topic_id, topic_label in enumerate(topics, start=1):
    data_sl.loc[data_sl["topic"] == topic_id, "topic"] = topic_label

In [7]:
data_sl.head(3)

Unnamed: 0,title,author,rating,num_rating,num_review,page,year_published,publisher,summary,topic
0,the diary of a young girl,Anne Frank,4.16,2887098,30439,283.0,July 1993,by Bantam,Discovered in the attic in which she spent the...,family
1,night,Elie Wiesel,4.34,1008920,30066,115.0,January 16th 2006,by Hill & Wang,"Born in the town of Sighet, Transylvania, Elie...",family
2,being and time,Martin Heidegger,4.03,20738,613,589.0,August 1st 1962,by Harper & Row (NYC/Evanston),One of the most important philosophical works ...,art


### (COME BACK TO WORK ON) Explore display of book titles (quotation marks don't work with search):

In [30]:
title = "the sun"

# Literal prefix search on the title column (the query is lower-cased first).
# str.startswith() compares plain text, so characters that are special in a
# regular expression (parentheses, "?", "+", ...) are matched as typed instead
# of being interpreted as a pattern; na=False treats missing titles as
# non-matches.
matches = data_sl.loc[data_sl["title"].str.startswith(title.lower(), na=False), "title"]
print(matches)

552     the sunset strip diaries
1667          the sun down motel
Name: title, dtype: object


In [None]:
# Draw one index label at random and show the title stored under it.
picked_label = np.random.choice(data_sl.index, size=1)
data_sl.loc[picked_label, "title"].values[0]

In [None]:
# # plot for streamlit
# df_plot = pd.DataFrame(data_sl.topic.value_counts())

# import matplotlib.pyplot as plt, seaborn as sns
# %matplotlib inline
# plt.style.use("seaborn")

# plt.rcParams["figure.figsize"] = [6, 3]
# plt.rcParams['figure.dpi'] = 300

# fig, ax = plt.subplots()
# bars = ax.bar(df_plot.index, df_plot.topic)
# for bar in range(0, 12, 2):
#     bars[bar].set_color("darkmagenta")
# for bar in range(1, 13, 2):
#     bars[bar].set_color("plum")
# ax.set_xticks(list(range(12)))
# ax.set_xticklabels(list(df_plot.index), rotation = 60)
# ax.set_title("Number of Books per Topic", fontsize=15)
# ax.set_ylabel("Number of Books");

In [None]:
#data_sl.to_pickle("data/data_for_streamlit.pkl")

In [None]:
# sample = np.random.choice(data_sl.index, size=6, replace=False)
# print(sample)

# for i in data_sl.loc[[149,339,1650,1972,2912,3170], "title"].sort_index():
#     print("title:", i, "\n")

### Explore recommendations based on topics and rating:

In [78]:
# Minimum rating applied by the filter cell further down, plus the subset of
# books labelled with the topic under exploration.
rating = 5
mask = data_sl["topic"].eq("world war II")
bks = data_sl.loc[mask]

# sample = np.random.choice(science_bks.index, size=6, replace=False)
# print(sample)

In [77]:
# Rank the selected topic's books from highest to lowest rating and preview
# the first ten.
df1 = bks.sort_values(by="rating", ascending=False)
df1.head(n=10)


Unnamed: 0,title,author,rating,num_rating,num_review,page,year_published,publisher,summary,topic
2292,"high heels & beetle crushers: the life, losses...",Jackie Skingley,4.67,57,47,328.0,January 1st 2020,by Chronos Books,A compelling memoir of post-war Britain. Jacki...,world war II
2131,hank brodt holocaust memoirs: a candle and a p...,Deborah Donnelly,4.6,241,22,230.0,October 3rd 2016,by Amsterdam Publishers,A Candle and a Promise \n The Troubling b...,world war II
2165,survivor love thy enemy,James Dennison,4.58,12,9,268.0,July 21st 2012,by Createspace Independent Publishing Platform,"From an inspired Catholic family base, this yo...",world war II
2172,the nightingale,Kristin Hannah,4.57,764479,68222,440.0,February 3rd 2015,by St. Martin's Press,In love we find out who we want to be.In war w...,world war II
1645,breaking cadence: one woman's war against the war,Rosa del Duca,4.57,23,11,,May 1st 2019,by Ooligan Press,When a young recruiter tells Rosa del Duca tha...,world war II
1575,lamb of legacy: a child's survival in hitler's...,Edeltraud F. Fellendorf,4.53,15,7,273.0,January 8th 2013,by Tate Publishing & Enterprises,"I am now in my eighties, surprised at the face...",world war II
1349,stories of elders: what the greatest generatio...,Veronica Kirin,4.51,63,21,320.0,September 30th 2018,by Identity Publications,America’s Greatest Generation (born before 194...,world war II
504,chasing the scream: the first and last days of...,Johann Hari,4.49,13455,1733,400.0,January 20th 2015,by Bloomsbury USA,New York Times BestsellerIt is now one hundred...,world war II
838,debunking holocaust denial theories,James Morcan,4.49,114,32,222.0,April 28th 2016,by Sterling Gate Books,DEBUNKING HOLOCAUST DENIAL THEORIES: Two Non-J...,world war II
521,"the gulag archipelago, 1918-1956: an experimen...",Aleksandr Solzhenitsyn,4.48,1969,113,712.0,January 1st 1992,by HarperCollins Publishers,"Drawing on his own incarceration and exile, a...",world war II


In [79]:
# Keep only the books rated at or above the `rating` threshold.
df2 = df1.loc[df1["rating"].ge(rating)]
df2

Unnamed: 0,title,author,rating,num_rating,num_review,page,year_published,publisher,summary,topic


In [59]:

# Show up to two randomly chosen books from the rating-filtered frame.
# Capping the sample size at len(df2) keeps the cell from raising when the
# filter leaves fewer than two books.
books = df2.sample(n=min(2, len(df2)))
if books.empty:
    print("No books match the current topic/rating filter.")

# The loop uses its own names for the row label and the book's rating, so it
# does not overwrite the notebook-level `rating` threshold that the filter
# cell reads when it is re-run.
for idx in books.index:
    title = books.loc[idx, "title"]
    summary = books.loc[idx, "summary"]
    book_rating = books.loc[idx, "rating"]
    author = books.loc[idx, "author"]
    print("Title: ", title.title(), "\n")
    print("Rating (scale 0-5): ", book_rating, "; and Author: ", author, "\n")
    print("Summary: " , summary, "\n")

Title:  Digicrimination Those Are The Good Times: A New Type Of Discrimination That Came With Digitization 

Rating (scale 0-5):  5.0 ; and Author:  H.Okan Tansu 

Summary:  Our society is highly effected by the digital revolution. This book describes with examples and new concepts the discrimination created by the Digital World at different layers of the society. The author analyzes the new technological ecosystem with components like the Digital Ghetto and describes the measures which need to be taken in the future. He evaluates this new digital world focusing on several aspects of social relations and lifestyles. The book also analyzes the mistakes made while entering the Information Age. Furthermore, the author answers the question if human society is ready for the amenities of services like Social Media, e-learning, energy and self-driving cars or if they actually make our lives more difficult and complicated.  

Title:  A Vineyard Odyssey: The Organic Fight To Save Wine From The 

In [51]:
books

Unnamed: 0,title,author,rating,num_rating,num_review,page,year_published,publisher,summary,topic
2321,a vineyard odyssey: the organic fight to save ...,John Kiger,4.67,6,2,202.0,June 6th 2013,by Rowman & Littlefield Publishers,A Vineyard Odyssey is a fascinating saga of wi...,science
1239,of the andromeda martian catastrophe,Vegas Luna,4.57,35,12,257.0,May 10th 2015,by Vegas Luna,This is an investigation into the Atlantean Me...,science


In [None]:
# Display one book-cover image without axis ticks.
# NOTE(review): the path is an absolute location on one machine; the cell only
# runs where that file exists.
img = Image.open("/Users/sarazzzz/Desktop/Metis/CAMP/Metis_project5/book_images/book2658.jpg")
plt.imshow(img)
plt.xticks([])
plt.yticks([]);  # trailing ";" suppresses the textual repr in the cell output
#plt.savefig("images/science_example1.svg");

In [None]:
# Display a second book-cover image without axis ticks.
# NOTE(review): the path is an absolute location on one machine; the cell only
# runs where that file exists.
img3 = Image.open("/Users/sarazzzz/Desktop/Metis/CAMP/Metis_project5/book_images/book159.jpg")
plt.imshow(img3)
plt.xticks([])
plt.yticks([]);  # trailing ";" suppresses the textual repr in the cell output
#plt.savefig("images/science_example2.svg");

### Recommendation system based on topic vectors:

In [9]:
# Frame used by the topic-based recommender: the twelve topic-weight columns
# (topic_1 ... topic_12), indexed by book title.
topic_columns = [f"topic_{k}" for k in range(1, 13)]
df_rec = df.loc[:, ["title", *topic_columns]].set_index("title")

In [10]:
# make book title lower case so search for title is not case sensitive
df_rec.index = df_rec.index.str.lower()
df_rec.head(3)

Unnamed: 0_level_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
the diary of a young girl,0.0223,0.0,0.0295,0.0285,0.0,0.0,0.0,0.0702,0.0,0.0038,0.0793,0.0179
night,0.0073,0.0,0.0,0.0,0.0,0.0,0.0,0.0216,0.0,0.0171,0.0649,0.0035
being and time,0.0,0.0023,0.0166,0.0,0.0,0.0,0.0,0.0,0.0,0.1318,0.0,0.0


In [11]:
# topic vector for When Breath Becomes Air
df_rec.loc['when breath becomes air']

topic_1     0.0513
topic_2     0.0147
topic_3     0.0147
topic_4     0.0000
topic_5     0.0085
topic_6     0.0000
topic_7     0.0729
topic_8     0.0000
topic_9     0.0170
topic_10    0.0000
topic_11    0.0269
topic_12    0.0041
Name: when breath becomes air, dtype: float64

In [21]:
def recommend(title, n_results=6):
    """Return the titles whose rows in `df_rec` are closest to `title`.

    Closeness is the cosine distance between the row stored under the
    lower-cased `title` and every row of `df_rec`.

    Parameters:
        title: book title; it is lower-cased before the look-up.
        n_results: number of titles to return (default 6). The query book is
            itself a row of `df_rec`, so it normally appears first in the
            result.

    Returns:
        pandas Index of the `n_results` nearest titles, nearest first.

    Raises:
        KeyError: if the lower-cased title is not in `df_rec.index`.
    """
    query_vec = df_rec.loc[title.lower()].values.reshape(1, -1)
    # pairwise_distances returns a (1, n_books) matrix; row 0 holds the
    # distances from the query to every book.
    distances = pairwise_distances(query_vec, df_rec, metric='cosine')[0]
    nearest = distances.argsort()[:n_results]
    return df_rec.index[nearest]

In [22]:
recommend("Quiet: The Power Of Introverts In A World That Can'T Stop Talking")

Index(['quiet: the power of introverts in a world that can't stop talking',
       'modern real estate investing: the delaware statutory trust',
       'the power of habit: why we do what we do in life and business',
       'a newborn business: esports',
       'creativity, inc.: overcoming the unseen forces that stand in the way of true inspiration',
       'naked economics: undressing the dismal science'],
      dtype='object', name='title')

In [None]:
# Lower-case the titles in df so summaries can be looked up by the lower-cased
# titles the recommender returns. The vectorised .str accessor leaves a missing
# title as NaN, whereas calling .lower() inside a lambda would raise on it.
df["title"] = df["title"].str.lower()

In [None]:
# mask = (df.title == "when breath becomes air")|\
#        (df.title == "many lives, many masters: the true story of a prominent psychiatrist, his young patient, and the past life therapy that changed both their lives")|\
#        (df.title == "autobiography of a face")
       
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend("Being Mortal: Medicine and What Matters in the End")

In [None]:
# mask = (df.title == "added sugars-the slow poison") |\
#        (df.title == "being mortal: medicine and what matters in the end")|\
#        (df.title == "surgeon's story: inside or-1 with one of america's top pediatric heart surgeons")
       
       
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend("The Tipping Point: How Little Things Can Make a Big Difference")

In [None]:
# mask = (df.title == "the tipping point: how little things can make a big difference") |\
#        (df.title == "the signal and the noise: why so many predictions fail—but some don't")|\
#        (df.title == "the spirit level: why more equal societies almost always do better")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

### ^^ Tried out 2 books (When Breath Becomes Air and Being Mortal): the recommendations don't seem particularly impressive (hit or miss based on the book summaries); GoodReads' recommendations make more sense to me

### Word embedding with GloVe:

In [None]:
# Load the 100-dimensional GloVe vectors: convert the GloVe text file into a
# word2vec-format temp file, then load that file as `model`.
# NOTE(review): the input is an absolute path on one machine. datapath() comes
# from gensim.test.utils and presumably targets gensim's bundled test data —
# confirm that wrapping an absolute path in it is intended.
glove_file = datapath("/Users/sarazzzz/Downloads/glove/glove.6B.100d.txt")
tmp_file = get_tmpfile("glove_word2vec.txt")

glove2word2vec(glove_file, tmp_file)
model = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# testing, comparing via cosine similarity
print(model.similarity('obama', 'clinton'))
print(model.similarity('obama', 'reagan'))

In [None]:
# NOTE(review): this rebinds `df`, replacing the frame read from
# "data/df_with_topic_vecs.pkl" at the top of the notebook; every later cell
# that uses `df` sees this frame instead.
df = pd.read_pickle("data/clean_summary.pkl")
df.head()

In [None]:
def clean_text(text):
    """Normalise raw text to lower-case a-z words separated by whitespace.

    Steps, in order:
      1. lower-case the text;
      2. delete apostrophes (straight and curly), so "don't" becomes "dont";
      3. turn newlines into spaces;
      4. delete every run of word characters that contains a digit;
      5. replace every remaining character that is not a-z or whitespace with
         a space (this includes punctuation and non-ASCII letters).

    Whitespace is not collapsed, so the result can contain runs of spaces.

    Parameters:
        text: string to clean.
    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(r"['’]", "", text)
    text = re.sub("\n", " ", text)
    # Raw strings keep \w, \d and \s as regex escapes; in a normal string
    # literal they are invalid escape sequences and Python warns about them.
    text = re.sub(r"\w*\d\w*", "", text)
    text = re.sub(r"[^a-z\s]", " ", text)

    return text

In [None]:
# clean book summary and add it back to the data frame as clean_summary WITHOUT lemmatization
df["clean_summary"] = df["summary"].apply(clean_text)

In [None]:
# Tokenise the cleaned summary of the first book. The column is selected by
# name, so the cell does not depend on `clean_summary` being the last column.
text = df["clean_summary"].iloc[0]

text_tokens = word_tokenize(text)
print(text_tokens)

In [None]:
# Look up the embedding of every distinct token; print the tokens the model
# has no vector for.
vec_list = []
for word in set(text_tokens):
    try:
        vec_list.append(model[word])
    except KeyError:
        # Only the missing-vocabulary error is handled, so unrelated failures
        # (and a keyboard interrupt) still surface.
        print(word)

In [None]:
vector = pd.DataFrame(vec_list)
print(vector.shape)
len(set(text_tokens))
vector.head()

In [None]:
vector.mean().values.shape

In [None]:
def summary_to_vec(summary):
    """Embed a summary as the mean word vector of its distinct tokens.

    The summary is tokenised with `word_tokenize`; each distinct token that the
    global `model` has a vector for contributes once to the mean. Tokens the
    model does not know are skipped.

    Parameters:
        summary: text of one book summary.
    Returns:
        1-D numpy array of length `model.vector_size`. If none of the tokens is
        in the vocabulary (including an empty summary) an all-zero vector is
        returned, so every summary yields a vector of the same length.
    """
    known_vectors = []
    for word in set(word_tokenize(summary)):
        try:
            known_vectors.append(model[word])
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average.
            continue

    if not known_vectors:
        # Averaging an empty list would give an empty array, which later turns
        # into a row of NaN when the vectors are stacked into a DataFrame.
        return np.zeros(model.vector_size)

    return pd.DataFrame(known_vectors).mean().values

### Test run on the first five summaries in the dataset:

In [None]:
testing = df.iloc[:5, :]
testing

In [None]:
test_vec = testing["clean_summary"].apply(summary_to_vec)

In [None]:
test_vec

In [None]:
df_test = pd.DataFrame([x for x in test_vec])
df_test.head(3)

In [None]:
start = time.time()
summary_vec = df["clean_summary"].apply(summary_to_vec)
print(time.time() - start)

In [None]:
# One row per summary vector. Rows are labelled with the lower-cased titles of
# `df` itself (the frame the vectors were computed from), so each vector is
# paired with its own title regardless of how another frame is ordered.
df_vec = pd.DataFrame(list(summary_vec), index=pd.Index(df["title"].str.lower(), name="title"))
df_vec.head()

In [None]:
# check vector for a particular book
df_vec.loc['being and time']

In [None]:
def recommend2(title, n_results=6):
    """Return the titles whose rows in `df_vec` are closest to `title`.

    Closeness is the cosine distance between the row stored under the
    lower-cased `title` and every row of `df_vec`.

    Parameters:
        title: book title; it is lower-cased before the look-up.
        n_results: number of titles to return (default 6). The query book is
            itself a row of `df_vec`, so it normally appears first in the
            result.

    Returns:
        pandas Index of the `n_results` nearest titles, nearest first.

    Raises:
        KeyError: if the lower-cased title is not in `df_vec.index`.
    """
    query_vec = df_vec.loc[title.lower()].values.reshape(1, -1)
    # pairwise_distances returns a (1, n_books) matrix; row 0 holds the
    # distances from the query to every book.
    distances = pairwise_distances(query_vec, df_vec, metric='cosine')[0]
    nearest = distances.argsort()[:n_results]
    return df_vec.index[nearest]

In [None]:
recommend2("being and time")

In [None]:
# Lower-case the titles in df so summaries can be looked up by the lower-cased
# titles the recommender returns. The vectorised .str accessor leaves a missing
# title as NaN, whereas calling .lower() inside a lambda would raise on it.
df["title"] = df["title"].str.lower()

In [None]:
# mask = (df.title == "being and time") |\
#        (df.title == "poetry, language, thought")|\
#        (df.title == "the spanish civil war: a very short introduction")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend2("Being Mortal: Medicine and What Matters in the End")

In [None]:
# mask = (df.title == "what to do when you have heart disease")|\
#        (df.title == "being mortal: medicine and what matters in the end") |\
#        (df.title == "dr. fred's healthcare rescue: the real solution to healthcare")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend2("The Tipping Point: How Little Things Can Make a Big Difference")

In [None]:
# mask = (df.title == "the tipping point: how little things can make a big difference")|\
#        (df.title == "invisible women: data bias in a world designed for men") |\
#        (df.title == "the undercover economist")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

### ^ These GloVe-based recommendations might be better than the topic-vector ones?
### Trying out the 300d vectors:

In [None]:
# Load the 300-dimensional GloVe vectors the same way as the 100-dimensional
# ones: convert to word2vec format in a temp file, then load it.
# NOTE(review): the temp file name is the same one used for the 100d vectors,
# and `model` is rebound here, so summary_to_vec() (which reads the global
# `model`) produces 300-dimensional vectors from this point on.
# NOTE(review): the input is an absolute path on one machine.
glove_file2 = datapath("/Users/sarazzzz/Downloads/glove/glove.6B.300d.txt")
tmp_file2 = get_tmpfile("glove_word2vec.txt")

glove2word2vec(glove_file2, tmp_file2)
model = gensim.models.KeyedVectors.load_word2vec_format(tmp_file2)

In [None]:
# Tokenise the cleaned summary of the first book. The column is selected by
# name, so the cell does not depend on `clean_summary` being the last column.
text = df["clean_summary"].iloc[0]
text_tokens = word_tokenize(text)
print(text_tokens)

In [None]:
# Look up the embedding of every distinct token; print the tokens the model
# has no vector for.
vec_list = []
for word in set(text_tokens):
    try:
        vec_list.append(model[word])
    except KeyError:
        # Only the missing-vocabulary error is handled, so unrelated failures
        # (and a keyboard interrupt) still surface.
        print(word)

In [None]:
vector = pd.DataFrame(vec_list)
print(vector.shape)
len(set(text_tokens))

In [None]:
df.head(3)

In [None]:
start = time.time()
summary_vec = df["clean_summary"].apply(summary_to_vec)
print(time.time() - start)

In [None]:
# One row per summary vector. Rows are labelled with the lower-cased titles of
# `df` itself (the frame the vectors were computed from), so each vector is
# paired with its own title regardless of how another frame is ordered.
df_vec300 = pd.DataFrame(list(summary_vec), index=pd.Index(df["title"].str.lower(), name="title"))
df_vec300.shape

In [None]:
# Persist the summary embeddings.
# NOTE(review): this writes "GloVe_embedding_for_rec.pkl" (relative path, no
# directory), whereas the following cell reads
# "data/GloVe_embedding_for_recommendation.pkl" — confirm the file is moved or
# renamed in between, or align the two paths.
df_vec300.to_pickle("GloVe_embedding_for_rec.pkl")

In [None]:
# NOTE(review): this rebinds `df_rec`, which earlier held the twelve
# topic-weight columns; recommend() reads the global `df_rec`, so calling it
# after this cell searches this frame instead. Presumably the pickle holds the
# GloVe summary embeddings — confirm.
df_rec = pd.read_pickle("data/GloVe_embedding_for_recommendation.pkl")

### Recommender modified for streamlit:

In [None]:
def recommend3(title):
    """Return the three nearest neighbours of `title` with rating and summary.

    The lower-cased title is looked up in the global `df_rec`; all rows are
    ranked by cosine distance to it and positions 1-3 of that ranking are kept
    (position 0, normally the query book itself, is dropped). Rating and
    summary are then read from the global `df` by exact match on its "title"
    column.

    Parameters:
        title: book title; it is lower-cased before the look-up.
    Returns:
        DataFrame with one row per recommended book and three unnamed columns
        (0: title, 1: rating, 2: summary). Columns 1 and 2 hold the numpy
        arrays produced by `.values`, i.e. one element per row of `df` that
        matches the title.
    """
    query_vec = df_rec.loc[title.lower()].values.reshape(1, -1)
    ranking = pairwise_distances(query_vec, df_rec, metric='cosine').argsort()[0]
    neighbours = df_rec.index[ranking[1:4]]

    rows = []
    for name in neighbours:
        same_title = df["title"] == name
        rows.append((
            name,
            df.loc[same_title, "rating"].values,
            df.loc[same_title, "summary"].values,
        ))

    return pd.DataFrame(rows)

In [None]:
string = "okay"
string.lower()

In [None]:
recommend3("when breath becomes air")

In [None]:
# Scratch print of one book's details. It shows whatever `title`, `rating` and
# `summary` currently hold in the kernel; none of them is defined in this cell.
print("title: ", title, "\n", "rating: ", rating, summary)

In [None]:
# mask = (df.title == "when breath becomes air")|\
#        (df.title == "lessons from a cf cornerman: 38 lessons i learned during my wife's illness and lung transplant")|\
#        (df.title == "our time to dance, a mother's journey to joy") 
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend3("Being Mortal: Medicine and What Matters in the End")

In [None]:
recommend3("The Tipping Point: How Little Things Can Make a Big Difference")

In [None]:
# testing, comparing via cosine similarity
print(model.similarity('obama', 'clinton'))
print(model.similarity('obama', 'reagan'))