In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import time, re
from nltk.tokenize import word_tokenize
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.metrics import pairwise_distances
from PIL import Image

### Streamlit: Create dataframe

In [2]:
# Load the book table that carries the per-book topic weights (topic_1..topic_12)
# and the assigned topic id.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle("data/df_with_topic_vecs.pkl")
# Only a cell's last expression is rendered, so this preview is not displayed.
df.head(3)
df.shape

(3017, 24)

In [3]:
df.columns

Index(['title', 'author', 'rating', 'num_rating', 'review', 'page', 'year',
       'publisher', 'summary', 'language', 'clean_summary', 'topic_1',
       'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7',
       'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 'topic'],
      dtype='object')

In [None]:
# Column subset used to build the Streamlit data frame (the topic-weight columns are left out).
data_sl = df.loc[:, ["title", "author", "rating", "num_rating", "review", "page", "year", "publisher", "summary", "topic"]]

In [None]:
# Give the review-count and publication-date columns self-explanatory names.
# The result is re-bound instead of using inplace=True, so the cell stays a plain
# expression and can be re-run safely.
data_sl = data_sl.rename(columns={"review": "num_review", "year": "year_published"})

# Lower-case the titles in data_sl so title look-ups are not case sensitive.
# .str.lower() is the vectorised form and passes missing titles through as NaN
# instead of raising AttributeError on a float.
data_sl["title"] = data_sl["title"].str.lower()

In [None]:
# Human-readable names for the 12 topics; position k holds the name of topic id k + 1.
topics = ["biography", "business", "science", "gender", "religion", "race",
          "health", "world war II", "relationship", "art", "family", "british monarch"]
topic_names = dict(enumerate(topics, start=1))

# Translate ids to names in a single pass and replace the whole column at once.
# Writing strings row-by-row into the existing column (presumably numeric - TODO confirm)
# is a dtype-incompatible assignment that recent pandas versions warn about.
# Values that are not a known id (e.g. names written by an earlier run of this cell)
# are kept unchanged, so the cell is safe to re-run.
data_sl["topic"] = data_sl["topic"].map(lambda topic_id: topic_names.get(topic_id, topic_id))

In [None]:
data_sl.head(3)

In [None]:
title = "the sun"

# Titles in data_sl that start with the query (titles were lower-cased above).
# A literal prefix test replaces the "^" + title regex: characters such as "(", "+" or "?"
# in a query are matched verbatim instead of being interpreted as regex syntax.
# na=False makes missing titles count as non-matches.
prefix_matches = data_sl.loc[data_sl["title"].str.startswith(title.lower(), na=False), "title"]
print(prefix_matches)

In [None]:
# Pick one random row label from data_sl and show that book's title.
# NOTE(review): no random seed is set, so the result differs on every run.
data_sl.loc[np.random.choice(data_sl.index, size=1),"title"].values[0]

In [None]:
# # plot for streamlit
# df_plot = pd.DataFrame(data_sl.topic.value_counts())

# import matplotlib.pyplot as plt, seaborn as sns
# %matplotlib inline
# plt.style.use("seaborn")

# plt.rcParams["figure.figsize"] = [6, 3]
# plt.rcParams['figure.dpi'] = 300

# fig, ax = plt.subplots()
# bars = ax.bar(df_plot.index, df_plot.topic)
# for bar in range(0, 12, 2):
#     bars[bar].set_color("darkmagenta")
# for bar in range(1, 13, 2):
#     bars[bar].set_color("plum")
# ax.set_xticks(list(range(12)))
# ax.set_xticklabels(list(df_plot.index), rotation = 60)
# ax.set_title("Number of Books per Topic", fontsize=15)
# ax.set_ylabel("Number of Books");

In [None]:
#data_sl.to_pickle("data/data_for_streamlit.pkl")

In [None]:
# sample = np.random.choice(data_sl.index, size=6, replace=False)
# print(sample)

# for i in data_sl.loc[[149,339,1650,1972,2912,3170], "title"].sort_index():
#     print("title:", i, "\n")

### Streamlit: explore recommendations based on topics and rating

In [None]:
# Restrict the Streamlit table to one topic ("health") for the exploration below.
mask = (data_sl["topic"] == "health")
bks = data_sl[mask]
# NOTE(review): this value is never read; `rating` is reassigned inside the print loop further down.
rating = 5

# sample = np.random.choice(science_bks.index, size=6, replace=False)
# print(sample)

In [None]:
# Order the selected topic's books from highest to lowest rating and preview the top 10.
df1 = bks.sort_values("rating", ascending=False)
df1.head(10)

In [None]:
# Keep only books rated 4.6 or higher.
# NOTE(review): 4.6 is a hard-coded threshold; the `rating` variable set earlier is not used here.
df2 = df1[df1["rating"] >= 4.6]
df2

In [None]:
# Print up to two randomly chosen books from the highly rated subset.
# DataFrame.sample draws without replacement; capping n at len(df2) avoids the
# ValueError that a fixed sample size of 2 raises when fewer than two rows remain.
books = df2.sample(n=min(2, len(df2)))
for _, book in books.iterrows():
    print("Title: ", book["title"].title(), "\n")
    print("Rating (scale 0-5): ", book["rating"], "; and Author: ", book["author"], "\n")
    print("Summary: " , book["summary"], "\n")

In [None]:
# Show one image from the book_images folder (presumably a book cover) without axis ticks.
# NOTE(review): absolute path on the author's machine; the cell fails wherever this file is absent.
img = Image.open("/Users/sarazzzz/Desktop/Metis/CAMP/Metis_project5/book_images/book2658.jpg")
plt.imshow(img)
plt.xticks([])
# The trailing ";" suppresses the textual repr of the return value.
plt.yticks([]);
#plt.savefig("images/science_example1.svg");

In [None]:
# Show a second image from the book_images folder (presumably a book cover) without axis ticks.
# NOTE(review): absolute path on the author's machine; the cell fails wherever this file is absent.
img3 = Image.open("/Users/sarazzzz/Desktop/Metis/CAMP/Metis_project5/book_images/book159.jpg")
plt.imshow(img3)
plt.xticks([])
# The trailing ";" suppresses the textual repr of the return value.
plt.yticks([]);
#plt.savefig("images/science_example2.svg");

### Recommendation system based on topic vectors:

In [4]:
# Topic-weight matrix: one row per book (labelled by title), one column per topic weight.
df_rec = df.set_index("title").loc[
    :,
    [
        "topic_1", "topic_2", "topic_3", "topic_4",
        "topic_5", "topic_6", "topic_7", "topic_8",
        "topic_9", "topic_10", "topic_11", "topic_12",
    ],
]

In [5]:
# make book title lower case so search for title is not case sensitive
df_rec.index = df_rec.index.str.lower()
df_rec.head(3)

Unnamed: 0_level_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
the diary of a young girl,0.0223,0.0,0.0295,0.0285,0.0,0.0,0.0,0.0702,0.0,0.0038,0.0793,0.0179
night,0.0073,0.0,0.0,0.0,0.0,0.0,0.0,0.0216,0.0,0.0171,0.0649,0.0035
being and time,0.0,0.0023,0.0166,0.0,0.0,0.0,0.0,0.0,0.0,0.1318,0.0,0.0


In [6]:
# topic vector for When Breath Becomes Air
df_rec.loc['when breath becomes air']

topic_1     0.0513
topic_2     0.0147
topic_3     0.0147
topic_4     0.0000
topic_5     0.0085
topic_6     0.0000
topic_7     0.0729
topic_8     0.0000
topic_9     0.0170
topic_10    0.0000
topic_11    0.0269
topic_12    0.0041
Name: when breath becomes air, dtype: float64

In [7]:
def recommend(title, n=6, vectors=None):
    """Return the titles whose feature vectors are closest to the given book's vector.

    Parameters
    ----------
    title : str
        Book title. It is lower-cased before the lookup, so the index of `vectors`
        is expected to hold lower-case titles.
    n : int, default 6
        Number of titles to return. The queried book is ranked together with all
        other rows, so it is normally the first entry of the result.
    vectors : pandas.DataFrame, optional
        One row of features per book, indexed by title. Defaults to the
        notebook-level `df_rec` as it is bound at call time.

    Returns
    -------
    pandas.Index
        The `n` titles with the smallest cosine distance to the query, nearest first.

    Raises
    ------
    KeyError
        If no row of `vectors` is labelled `title`.
    """
    if vectors is None:
        vectors = df_rec
    title = title.lower()

    # Locate the row by position. `.loc[title]` returns a 2-D frame when a title occurs
    # more than once, and flattening that yields a query of the wrong width.
    positions = np.flatnonzero(vectors.index == title)
    if positions.size == 0:
        raise KeyError(f"No book titled {title!r} in the recommendation index")

    query = vectors.iloc[[positions[0]]].values  # shape (1, n_features)
    distances = pairwise_distances(query, vectors, metric='cosine')[0]
    return vectors.index[distances.argsort()[:n]]

In [8]:
recommend("Quiet: The Power Of Introverts In A World That Can'T Stop Talking")

Index(['quiet: the power of introverts in a world that can't stop talking',
       'modern real estate investing: the delaware statutory trust',
       'the power of habit: why we do what we do in life and business',
       'a newborn business: esports',
       'creativity, inc.: overcoming the unseen forces that stand in the way of true inspiration',
       'naked economics: undressing the dismal science'],
      dtype='object', name='title')

In [9]:
recommend("when breath becomes air")

Index(['when breath becomes air',
       'many lives, many masters: the true story of a prominent psychiatrist, his young patient, and the past life therapy that changed both their lives',
       'autobiography of a face',
       'an unquiet mind: a memoir of moods and madness',
       'animal, vegetable, miracle: a year of food life',
       'hope is my wingman'],
      dtype='object', name='title')

In [10]:
recommend("man's search for meaning")

Index(['man's search for meaning',
       'hello from heaven: a new field of research-after-death communication confirms that life and love are eternal',
       'all creatures great and small',
       'the man who mistook his wife for a hat and other clinical tales',
       'the great penguin rescue: 40,000 penguins, a devastating oil spill and the inspiring story of the world's largest animal rescue',
       'danger to society'],
      dtype='object', name='title')

In [11]:
recommend("How to win friends and influence people")

Index(['how to win friends and influence people',
       'summary of essentialism: by greg mckeown | includes analysis',
       'explosive growth: a few things i learned while growing to 100 million users - and losing $78 million',
       'introverts: leverage your strengths for an effective job search',
       'boomer reinvention: how to create your dream career over 50',
       'micromba skills : compilation of case studies made easy for mbas and non-mbas'],
      dtype='object', name='title')

In [12]:
# Lower-case the title column of df so summaries can be pulled out by lower-case title.
# .str.lower() is vectorised and leaves missing titles as NaN instead of raising.
df["title"] = df["title"].str.lower()

In [None]:
# mask = (df.title == "when breath becomes air")|\
#        (df.title == "many lives, many masters: the true story of a prominent psychiatrist, his young patient, and the past life therapy that changed both their lives")|\
#        (df.title == "autobiography of a face")
       
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend("Being Mortal: Medicine and What Matters in the End")

In [None]:
# mask = (df.title == "added sugars-the slow poison") |\
#        (df.title == "being mortal: medicine and what matters in the end")|\
#        (df.title == "surgeon's story: inside or-1 with one of america's top pediatric heart surgeons")
       
       
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [None]:
recommend("The Tipping Point: How Little Things Can Make a Big Difference")

In [None]:
# mask = (df.title == "the tipping point: how little things can make a big difference") |\
#        (df.title == "the signal and the noise: why so many predictions fail—but some don't")|\
#        (df.title == "the spirit level: why more equal societies almost always do better")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

### Word embedding with GloVe:

In [13]:
# Location of the 100-dimensional GloVe vectors in GloVe's plain-text format.
# gensim's datapath() resolves names inside gensim's bundled test-data directory; this
# is an absolute path to a downloaded file, so it is used directly.
# NOTE(review): absolute path on the author's machine.
glove_file = "/Users/sarazzzz/Downloads/glove/glove.6B.100d.txt"
tmp_file = get_tmpfile("glove_word2vec.txt")

# Convert GloVe text format to word2vec text format, then load it as KeyedVectors.
glove2word2vec(glove_file, tmp_file)
model = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

In [14]:
# testing, comparing via cosine similarity
print(model.similarity('obama', 'clinton'))
print(model.similarity('obama', 'reagan'))

0.8960004
0.6169825


In [15]:
# NOTE(review): this rebinds `df`. From here on it is the table stored in
# clean_summary.pkl, not the topic-vector table loaded at the top of the notebook.
df = pd.read_pickle("data/clean_summary.pkl")
df.head()

Unnamed: 0,title,author,rating,num_rating,review,page,year,publisher,summary,language,clean_summary
0,The Diary of a Young Girl,Anne Frank,4.16,2887098,30439,283.0,July 1993,by Bantam,Discovered in the attic in which she spent the...,en,discovered in the attic in which she spent the...
1,Night,Elie Wiesel,4.34,1008920,30066,115.0,January 16th 2006,by Hill & Wang,"Born in the town of Sighet, Transylvania, Elie...",en,born in the town of sighet transylvania elie w...
2,Being and Time,Martin Heidegger,4.03,20738,613,589.0,August 1st 1962,by Harper & Row (NYC/Evanston),One of the most important philosophical works ...,en,one of the most important philosophical work o...
3,Hiroshima,John Hersey,3.99,63589,3332,152.0,March 4th 1989,by Vintage,"On August 6, 1945, Hiroshima was destroyed by ...",en,on august hiroshima wa destroyed by the first ...
4,Into Thin Air: A Personal Account of the Mount...,Jon Krakauer,4.19,411855,14209,368.0,October 19th 1999,by Anchor Books,A bank of clouds was assembling on the not-so-...,en,a bank of cloud wa assembling on the not so di...


In [16]:
def clean_text(text):
    """Normalise text to lower-case alphabetic words separated by whitespace.

       Steps, in order: lower-case; delete straight and curly apostrophes (so
       "frank's" becomes "franks"); turn newlines into spaces; delete every word
       that contains a digit; replace every remaining character that is not a-z
       or whitespace with a space. Runs of spaces are not collapsed.

       Parameters: text -- the raw string; a value that is not a str (for example
                   a missing summary) is treated as empty text.
       Returns: the cleaned string ("" for non-string input).
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Raw string literals keep \w, \d and \s as regex escapes; in ordinary literals
    # they are invalid string escapes that newer Python versions warn about.
    text = re.sub(r"['’]", "", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\w*\d\w*", "", text)
    text = re.sub(r"[^a-z\s]", " ", text)

    return text

In [17]:
# clean book summary and add it back to the data frame as clean_summary WITHOUT lemmatization
df["clean_summary"] = df["summary"].apply(clean_text)

In [18]:
# Cleaned summary of the first book, selected by column name so the cell keeps
# working if clean_summary stops being the last column of df.
text = df["clean_summary"].iloc[0]

text_tokens = word_tokenize(text)
print(text_tokens)

['discovered', 'in', 'the', 'attic', 'in', 'which', 'she', 'spent', 'the', 'last', 'years', 'of', 'her', 'life', 'anne', 'franks', 'remarkable', 'diary', 'has', 'become', 'a', 'world', 'classic', 'a', 'powerful', 'reminder', 'of', 'the', 'horrors', 'of', 'war', 'and', 'an', 'eloquent', 'testament', 'to', 'the', 'human', 'spirit', 'in', 'with', 'the', 'nazis', 'occupying', 'holland', 'a', 'thirteen', 'year', 'old', 'jewish', 'girl', 'and', 'her', 'family', 'fled', 'their', 'home', 'in', 'amsterdam', 'and', 'went', 'into', 'hiding', 'for', 'the', 'next', 'two', 'years', 'until', 'their', 'whereabouts', 'were', 'betrayed', 'to', 'the', 'gestapo', 'the', 'franks', 'and', 'another', 'family', 'lived', 'cloistered', 'in', 'the', 'secret', 'annexe', 'of', 'an', 'old', 'office', 'building', 'cut', 'off', 'from', 'the', 'outside', 'world', 'they', 'faced', 'hunger', 'boredom', 'the', 'constant', 'cruelties', 'of', 'living', 'in', 'confined', 'quarters', 'and', 'the', 'ever', 'present', 'threat'

In [19]:
# Collect the embedding of every distinct token; print tokens the model has no vector for.
vec_list = []
for word in set(text_tokens):
    try:
        vec_list.append(model[word])
    except KeyError:
        # Out-of-vocabulary lookups raise KeyError; any other error should surface.
        print(word)

In [20]:
# Stack the collected word vectors: one row per token that had a vector,
# one column per embedding dimension.
vector = pd.DataFrame(vec_list)
print(vector.shape)
# Neither printed nor the cell's last expression, so this count is never shown.
len(set(text_tokens))
vector.head()

(117, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.62171,-0.13338,0.11765,-0.48472,0.35591,0.34717,0.42381,0.40977,-0.17738,-0.15945,...,-0.26725,-0.41253,-0.031134,0.38949,-0.85642,0.041839,-0.089404,0.51396,0.24983,-0.27881
1,0.081466,0.26043,0.57471,-0.86109,-0.040796,0.38158,-0.3095,0.25425,-0.20413,-0.41511,...,-0.46803,0.084476,-0.31192,0.25794,-0.64003,-0.31881,0.089077,0.51851,0.47458,-0.076461
2,0.13811,0.65709,-0.28635,0.24526,0.65783,0.46772,-0.29231,-0.30756,-0.63843,-0.2967,...,0.56564,-0.81917,1.1366,0.082616,-0.037658,-0.25162,-0.43233,-1.1276,-0.62625,-0.64322
3,-0.59068,-0.087248,-0.49688,0.37031,-0.38078,0.80878,-0.3432,-0.043514,-0.47491,-0.22881,...,-0.51219,0.70631,-0.06309,0.038267,0.21181,-0.40455,-0.033849,-0.64935,0.21518,0.35644
4,-0.47018,0.011195,0.77701,0.22822,-0.4798,-0.83984,-1.0515,-0.39106,0.57898,0.30948,...,-0.61708,0.47116,-0.05494,-0.11013,-0.56839,0.95027,-0.30336,-0.56038,-0.074974,0.096481


In [21]:
vector.mean().values.shape

(100,)

In [22]:
def summary_to_vec(summary, embeddings=None):
    """Embed a summary as the mean vector of its distinct in-vocabulary words.

    Parameters
    ----------
    summary : str
        Text to embed; it is split with nltk's word_tokenize.
    embeddings : optional
        Word-vector lookup supporting `embeddings[word]` and exposing `vector_size`
        (a gensim KeyedVectors object). Defaults to the notebook-level `model` as
        it is bound at call time.

    Returns
    -------
    numpy.ndarray
        1-D array with one value per embedding dimension. Each distinct word counts
        once, however often it occurs. If no word of the summary has a vector, a
        zero vector of length `vector_size` is returned, so every summary yields a
        row of the same width.
    """
    if embeddings is None:
        embeddings = model

    vec_list = []
    for word in set(word_tokenize(summary)):
        try:
            vec_list.append(embeddings[word])
        except KeyError:
            # Out-of-vocabulary words are skipped on purpose; other errors propagate.
            continue

    if not vec_list:
        # An empty frame would average to an empty array and later become a NaN row.
        return np.zeros(embeddings.vector_size)

    return pd.DataFrame(vec_list).mean().values

### Test run on the first five summaries in the dataset:

In [23]:
testing = df.iloc[:5, :]
testing

Unnamed: 0,title,author,rating,num_rating,review,page,year,publisher,summary,language,clean_summary
0,The Diary of a Young Girl,Anne Frank,4.16,2887098,30439,283.0,July 1993,by Bantam,Discovered in the attic in which she spent the...,en,discovered in the attic in which she spent the...
1,Night,Elie Wiesel,4.34,1008920,30066,115.0,January 16th 2006,by Hill & Wang,"Born in the town of Sighet, Transylvania, Elie...",en,born in the town of sighet transylvania elie...
2,Being and Time,Martin Heidegger,4.03,20738,613,589.0,August 1st 1962,by Harper & Row (NYC/Evanston),One of the most important philosophical works ...,en,one of the most important philosophical works ...
3,Hiroshima,John Hersey,3.99,63589,3332,152.0,March 4th 1989,by Vintage,"On August 6, 1945, Hiroshima was destroyed by ...",en,on august hiroshima was destroyed by the f...
4,Into Thin Air: A Personal Account of the Mount...,Jon Krakauer,4.19,411855,14209,368.0,October 19th 1999,by Anchor Books,A bank of clouds was assembling on the not-so-...,en,a bank of clouds was assembling on the not so ...


In [24]:
test_vec = testing["clean_summary"].apply(summary_to_vec)

In [25]:
test_vec

0    [0.038194744210913144, 0.16574220679318294, 0....
1    [0.004738605532206987, 0.13148283712124745, 0....
2    [0.02417422465428158, 0.27134318957622683, 0.2...
3    [0.03558845926697055, 0.22575195702827638, 0.3...
4    [-0.15387485557169253, 0.13326480415915803, 0....
Name: clean_summary, dtype: object

In [26]:
# Expand the Series of per-summary arrays into a frame: one row per summary,
# one column per embedding dimension.
df_test = pd.DataFrame(test_vec.tolist())
df_test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.038195,0.165742,0.239174,-0.081145,0.091722,0.343272,-0.072842,0.073836,-0.133167,-0.000711,...,0.037539,-0.098173,0.177957,0.167618,-0.39501,-0.077507,-0.265494,-0.257717,0.251396,0.083167
1,0.004739,0.131483,0.335465,-0.128207,-0.041416,0.312529,-0.074139,0.032846,-0.047191,-0.001244,...,0.006807,-0.191467,0.119058,0.282839,-0.356364,-0.070478,-0.160852,-0.210872,0.211644,0.153055
2,0.024174,0.271343,0.22321,0.126,0.215483,0.050019,-0.091706,-0.163766,-0.21025,0.126965,...,0.026426,-0.152177,-0.049088,0.203424,-0.336644,-0.007375,-0.385197,-0.650391,0.500218,0.493626


In [27]:
# Embed every cleaned summary and report the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
summary_vec = df["clean_summary"].apply(summary_to_vec)
print(time.perf_counter() - start)

20.68734312057495


In [28]:
# One row of embedding values per book. Rows are labelled with the lower-cased title
# from `df` - the frame the vectors were computed from - so labels and vectors line up
# by construction instead of relying on `df_rec` (built from a different pickle) having
# the same row order.
df_vec = pd.DataFrame(
    summary_vec.tolist(),
    index=pd.Index(df["title"].str.lower(), name="title"),
)
df_vec.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the diary of a young girl,0.038195,0.165742,0.239174,-0.081145,0.091722,0.343272,-0.072842,0.073836,-0.133167,-0.000711,...,0.037539,-0.098173,0.177957,0.167618,-0.39501,-0.077507,-0.265494,-0.257717,0.251396,0.083167
night,0.004739,0.131483,0.335465,-0.128207,-0.041416,0.312529,-0.074139,0.032846,-0.047191,-0.001244,...,0.006807,-0.191467,0.119058,0.282839,-0.356364,-0.070478,-0.160852,-0.210872,0.211644,0.153055
being and time,0.024174,0.271343,0.22321,0.126,0.215483,0.050019,-0.091706,-0.163766,-0.21025,0.126965,...,0.026426,-0.152177,-0.049088,0.203424,-0.336644,-0.007375,-0.385197,-0.650391,0.500218,0.493626
hiroshima,0.035588,0.225752,0.315681,-0.205804,0.143229,0.275597,0.013882,0.067247,-0.110689,0.02685,...,0.135745,-0.06549,-0.018061,0.238033,-0.446372,-0.010015,-0.199773,-0.255136,0.303791,0.199382
into thin air: a personal account of the mount everest disaster,-0.153875,0.133265,0.277531,-0.147089,0.068443,0.059131,-0.043173,0.090621,-0.161049,0.039488,...,0.053869,-0.097443,-0.123923,0.10122,-0.309871,0.018557,-0.157136,-0.178449,0.356175,0.092564


In [29]:
# check vector for a particular book
df_vec.loc['being and time']

0     0.024174
1     0.271343
2     0.223210
3     0.126000
4     0.215483
        ...   
95   -0.007375
96   -0.385197
97   -0.650391
98    0.500218
99    0.493626
Name: being and time, Length: 100, dtype: float64

In [30]:
def recommend2(title, n=6, vectors=None):
    """Return the titles whose summary embeddings are closest to the given book's.

    Parameters
    ----------
    title : str
        Book title. It is lower-cased before the lookup, so the index of `vectors`
        is expected to hold lower-case titles.
    n : int, default 6
        Number of titles to return. The queried book is ranked together with all
        other rows, so it is normally the first entry of the result.
    vectors : pandas.DataFrame, optional
        One row of features per book, indexed by title. Defaults to the
        notebook-level `df_vec` as it is bound at call time.

    Returns
    -------
    pandas.Index
        The `n` titles with the smallest cosine distance to the query, nearest first.

    Raises
    ------
    KeyError
        If no row of `vectors` is labelled `title`.
    """
    if vectors is None:
        vectors = df_vec
    title = title.lower()

    # Locate the row by position. `.loc[title]` returns a 2-D frame when a title occurs
    # more than once, and flattening that yields a query of the wrong width.
    positions = np.flatnonzero(vectors.index == title)
    if positions.size == 0:
        raise KeyError(f"No book titled {title!r} in the embedding index")

    query = vectors.iloc[[positions[0]]].values  # shape (1, n_features)
    distances = pairwise_distances(query, vectors, metric='cosine')[0]
    return vectors.index[distances.argsort()[:n]]

In [31]:
recommend2("being and time")

Index(['being and time', 'poetry, language, thought', 'saussure for beginners',
       'the fundamental concepts of metaphysics: world, finitude, solitude',
       'the republic',
       'no turning back: the history of feminism and the future of women'],
      dtype='object', name='title')

In [32]:
# Lower-case the title column of df so summaries can be pulled out by lower-case title.
# .str.lower() is vectorised and leaves missing titles as NaN instead of raising.
df["title"] = df["title"].str.lower()

In [None]:
# mask = (df.title == "being and time") |\
#        (df.title == "poetry, language, thought")|\
#        (df.title == "the spanish civil war: a very short introduction")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [33]:
recommend2("Being Mortal: Medicine and What Matters in the End")

Index(['being mortal: medicine and what matters in the end',
       'dr. fred's healthcare rescue: the real solution to healthcare',
       'unaccountable: what hospitals won't tell you and how transparency can revolutionize health care',
       'what to do when you have heart disease',
       'the checklist manifesto: how to get things right',
       'the lukin longevity system: the guide to staying younger, longer'],
      dtype='object', name='title')

In [None]:
# mask = (df.title == "what to do when you have heart disease")|\
#        (df.title == "being mortal: medicine and what matters in the end") |\
#        (df.title == "dr. fred's healthcare rescue: the real solution to healthcare")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [34]:
recommend2("The Tipping Point: How Little Things Can Make a Big Difference")

Index(['the tipping point: how little things can make a big difference',
       'invisible women: data bias in a world designed for men',
       'the spirit level: why more equal societies almost always do better',
       'the e-myth revisited: why most small businesses don't work and what to do about it',
       'the undercover economist',
       'think: straight talk for women to stay smart in a dumbed-down world'],
      dtype='object', name='title')

In [None]:
# mask = (df.title == "the tipping point: how little things can make a big difference")|\
#        (df.title == "invisible women: data bias in a world designed for men") |\
#        (df.title == "the undercover economist")
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

### Observation: the GloVe-based recommendations above might be better than the topic-vector ones (to be confirmed)
### Trying out the 300d vectors:

In [35]:
# Location of the 300-dimensional GloVe vectors in GloVe's plain-text format.
# gensim's datapath() resolves names inside gensim's bundled test-data directory; this
# is an absolute path to a downloaded file, so it is used directly.
# NOTE(review): absolute path on the author's machine.
glove_file2 = "/Users/sarazzzz/Downloads/glove/glove.6B.300d.txt"
tmp_file2 = get_tmpfile("glove_word2vec.txt")

# Convert GloVe text format to word2vec text format, then load it as KeyedVectors.
glove2word2vec(glove_file2, tmp_file2)
# NOTE: this rebinds the notebook-level `model`; everything that reads `model`
# afterwards (e.g. summary_to_vec) works with these 300-d vectors.
model = gensim.models.KeyedVectors.load_word2vec_format(tmp_file2)

In [36]:
# testing, comparing via cosine similarity
print(model.similarity('obama', 'clinton'))
print(model.similarity('obama', 'reagan'))

0.70856035
0.40601766


In [37]:
# Cleaned summary of the first book, selected by column name so the cell keeps
# working if clean_summary stops being the last column of df.
text = df["clean_summary"].iloc[0]
text_tokens = word_tokenize(text)
print(text_tokens)

['discovered', 'in', 'the', 'attic', 'in', 'which', 'she', 'spent', 'the', 'last', 'years', 'of', 'her', 'life', 'anne', 'franks', 'remarkable', 'diary', 'has', 'become', 'a', 'world', 'classic', 'a', 'powerful', 'reminder', 'of', 'the', 'horrors', 'of', 'war', 'and', 'an', 'eloquent', 'testament', 'to', 'the', 'human', 'spirit', 'in', 'with', 'the', 'nazis', 'occupying', 'holland', 'a', 'thirteen', 'year', 'old', 'jewish', 'girl', 'and', 'her', 'family', 'fled', 'their', 'home', 'in', 'amsterdam', 'and', 'went', 'into', 'hiding', 'for', 'the', 'next', 'two', 'years', 'until', 'their', 'whereabouts', 'were', 'betrayed', 'to', 'the', 'gestapo', 'the', 'franks', 'and', 'another', 'family', 'lived', 'cloistered', 'in', 'the', 'secret', 'annexe', 'of', 'an', 'old', 'office', 'building', 'cut', 'off', 'from', 'the', 'outside', 'world', 'they', 'faced', 'hunger', 'boredom', 'the', 'constant', 'cruelties', 'of', 'living', 'in', 'confined', 'quarters', 'and', 'the', 'ever', 'present', 'threat'

In [38]:
# Collect the embedding of every distinct token; print tokens the model has no vector for.
vec_list = []
for word in set(text_tokens):
    try:
        vec_list.append(model[word])
    except KeyError:
        # Out-of-vocabulary lookups raise KeyError; any other error should surface.
        print(word)

In [39]:
vector = pd.DataFrame(vec_list)
print(vector.shape)
len(set(text_tokens))

(117, 300)


117

In [40]:
df.head(3)

Unnamed: 0,title,author,rating,num_rating,review,page,year,publisher,summary,language,clean_summary
0,the diary of a young girl,Anne Frank,4.16,2887098,30439,283.0,July 1993,by Bantam,Discovered in the attic in which she spent the...,en,discovered in the attic in which she spent the...
1,night,Elie Wiesel,4.34,1008920,30066,115.0,January 16th 2006,by Hill & Wang,"Born in the town of Sighet, Transylvania, Elie...",en,born in the town of sighet transylvania elie...
2,being and time,Martin Heidegger,4.03,20738,613,589.0,August 1st 1962,by Harper & Row (NYC/Evanston),One of the most important philosophical works ...,en,one of the most important philosophical works ...


In [41]:
# Embed every cleaned summary again and report the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
summary_vec = df["clean_summary"].apply(summary_to_vec)
print(time.perf_counter() - start)

56.78019309043884


In [42]:
# One row of embedding values per book. Rows are labelled with the lower-cased title
# from `df` - the frame the vectors were computed from - so labels and vectors line up
# by construction instead of relying on `df_rec` (built from a different pickle) having
# the same row order.
df_vec300 = pd.DataFrame(
    summary_vec.tolist(),
    index=pd.Index(df["title"].str.lower(), name="title"),
)
df_vec300.shape

(3017, 300)

In [43]:
# Persist the summary embeddings at the path the following cell loads from, so a fresh
# "Restart & Run All" reads back exactly what was just computed.
df_vec300.to_pickle("data/GloVe_embedding_for_recommendation.pkl")

In [44]:
# NOTE(review): this rebinds `df_rec` from the 12 topic-weight columns to whatever is
# stored in this pickle; functions that read the notebook-level df_rec (recommend,
# recommend3) rank on these vectors from here on.
df_rec = pd.read_pickle("data/GloVe_embedding_for_recommendation.pkl")

### Recommender modified for streamlit:

In [45]:
def recommend3(title, n=3, vectors=None):
    """Return the nearest other books for a title (variant used for the Streamlit app).

    Parameters
    ----------
    title : str
        Book title. It is lower-cased before the lookup, so the index of `vectors`
        is expected to hold lower-case titles.
    n : int, default 3
        Number of recommendations to return.
    vectors : pandas.DataFrame, optional
        One row of features per book, indexed by title. Defaults to the
        notebook-level `df_rec` as it is bound at call time.

    Returns
    -------
    pandas.Index
        Up to `n` titles with the smallest cosine distance to the query, nearest
        first. Rows labelled with the queried title itself are never returned.

    Raises
    ------
    KeyError
        If no row of `vectors` is labelled `title`.
    """
    if vectors is None:
        vectors = df_rec
    title = title.lower()

    # Boolean marker of the rows that carry the queried title. `.loc[title]` returns a
    # 2-D frame for repeated titles, so the query row is picked by position instead.
    is_query = vectors.index == title
    positions = np.flatnonzero(is_query)
    if positions.size == 0:
        raise KeyError(f"No book titled {title!r} in the recommendation index")

    query = vectors.iloc[[positions[0]]].values  # shape (1, n_features)
    distances = pairwise_distances(query, vectors, metric='cosine')[0]
    order = distances.argsort()

    # Remove the query explicitly rather than slicing off rank 0: with tied distances the
    # query is not guaranteed to sort first, and a blind [1:4] could return the query
    # itself while dropping a genuine neighbour.
    order = order[~is_query[order]]

    # TODO: the Streamlit view may also need each book's rating and summary, looked up
    # in df by title.
    return vectors.index[order[:n]]

In [46]:
string = "okay"
string.lower()

'okay'

In [47]:
recommend3("Quiet: The Power Of Introverts In A World That Can'T Stop Talking")

Index(['switch: how to change things when change is hard',
       'self-publishing in the eye of the storm',
       'ten years a nomad: a traveler's journey home'],
      dtype='object', name='title')

In [48]:
recommend3("when breath becomes air")

Index(['lessons from a cf cornerman: 38 lessons i learned during my wife's illness and lung transplant',
       'our time to dance, a mother's journey to joy', 'emerging butterfly'],
      dtype='object', name='title')

In [49]:
recommend3("man's search for meaning")

Index(['faith: a journey for all',
       'i don't want to lose hope: one life's journey, fight and triumph!',
       'what unites us: reflections on patriotism'],
      dtype='object', name='title')

In [None]:
# mask = (df.title == "when breath becomes air")|\
#        (df.title == "lessons from a cf cornerman: 38 lessons i learned during my wife's illness and lung transplant")|\
#        (df.title == "our time to dance, a mother's journey to joy") 
# df[mask]
# for i in df.loc[mask, "summary"]:
#     print("Summary: ", i, "\n")

In [51]:
recommend3("Being Mortal: Medicine and What Matters in the End")

Index(['dr. fred's healthcare rescue: the real solution to healthcare',
       'unaccountable: what hospitals won't tell you and how transparency can revolutionize health care',
       'the lukin longevity system: the guide to staying younger, longer'],
      dtype='object', name='title')

In [52]:
recommend3("The Tipping Point: How Little Things Can Make a Big Difference")

Index(['invisible women: data bias in a world designed for men',
       'the spirit level: why more equal societies almost always do better',
       'think: straight talk for women to stay smart in a dumbed-down world'],
      dtype='object', name='title')