This notebook shows an example recommendation system built with doc2vec. We will use the [CMU Book Summaries dataset](http://www.cs.cmu.edu/~dbamman/booksummaries.html). Alternatively, the dataset's link can be found in the `BookSummaries_Link.md` file under the Data folder in Ch7.


In [None]:
!pip install gensim
!pip install nltk
#todo: add pip for downloading nltk data?

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/1d/69/1262ed0050c21f5054702b8e96a2d8c310d4cd059e4a08c9a2fe6a5dae65/gensim-3.8.3-cp35-cp35m-manylinux1_x86_64.whl (24.2MB)
[K    100% |████████████████████████████████| 24.2MB 930kB/s ta 0:00:011   41% |█████████████▎                  | 10.1MB 5.2MB/s eta 0:00:03
[?25hCollecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/74/77/744c79da6e66691e3500b6dffff29bdd787015eae817d594791edc7b719b/smart_open-2.0.0.tar.gz (103kB)
[K    100% |████████████████████████████████| 112kB 3.4MB/s ta 0:00:01
Collecting scipy>=0.18.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/c1/60/8cbf00c0deb50a971e6e3a015fb32513960a92867df979870a454481817c/scipy-1.4.1-cp35-cp35m-manylinux1_x86_64.whl (26.0MB)
[K    100% |████████████████████████████████| 26.0MB 1.0MB/s ta 0:00:011
[?25hCollecting numpy>=1.11.3 (from gensim)
[?25l  Downloading https://files.pytho

In [2]:
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


In [3]:
# Build a {title: summary} dictionary from the CMU Book Summaries file.
# Each line of the file is one tab-separated record; field 2 is used as the book
# title and field 6 as the plot summary. Read the dataset's README for the full format.
data_path = "booksummaries.txt"
mydata = {} #titles-summaries dictionary object
# The with-block guarantees the file is closed, even if a record fails to parse.
with open(data_path, encoding="utf-8") as summaries_file:
    for line in summaries_file:
        # Drop the line terminator so it does not end up inside the summary text.
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 7:
            continue  # blank or truncated record: there is no summary field to read
        # When a title occurs more than once, the later summary replaces the earlier one.
        mydata[fields[2]] = fields[6]



In [4]:
# Prepare the corpus for doc2vec: every summary is tokenized and tagged with
# its book title, so a trained document vector can be looked up by title.
train_doc2vec = [
    TaggedDocument(words=word_tokenize(summary), tags=[title])
    for title, summary in mydata.items()
]

# Configure the model, build its vocabulary from the tagged corpus, train it,
# and persist it to disk as "d2v.model".
model = Doc2Vec(
    vector_size=50,
    alpha=0.025,
    min_count=10,
    dm=1,
    epochs=100,
)
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save("d2v.model")



In [5]:
#Use the model to look for similar texts
# Reload the model from disk so this cell works from the saved "d2v.model" file.
model= Doc2Vec.load("d2v.model")

#This is a sentence from the summary of “Animal Farm” on Wikipedia:
#https://en.wikipedia.org/wiki/Animal_Farm
sample = """
Napoleon enacts changes to the governance structure of the farm, replacing meetings with a committee of pigs who will run the farm.
 """
# Infer a vector for the unseen text, tokenized the same way as the training summaries.
new_vector = model.infer_vector(word_tokenize(sample))

# gensim 4 renamed the document-vector store from `docvecs` to `dv` and deprecated
# the old name; prefer the new attribute and fall back so gensim 3.x keeps working.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
sims = doc_vectors.most_similar([new_vector]) #gives 10 most similar titles
print(sims)

[('Animal Farm', 0.6777619123458862), ('The Wild Irish Girl', 0.6119967699050903), ("Snowball's Chance", 0.60667884349823), ('Family Matters', 0.5831906199455261), ('Settlers in Canada', 0.582908570766449), ('Poor White', 0.5771366953849792), ('The Road to Omaha', 0.576944887638092), ('Ponni', 0.5766265988349915), ("Family Guy: Stewie's Guide to World Domination", 0.5674009323120117), ('Texas Fever', 0.5643234848976135)]
