This notebook shows an example recommendation system using doc2vec. We will use the [CMU Book Summaries dataset](http://www.cs.cmu.edu/~dbamman/booksummaries.html). Alternatively, the dataset's link can be found in the `BookSummaries_Link.md` file under the Data folder in Ch7.


In [1]:
!pip install gensim
!pip install nltk



In [4]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset is read from a Drive
# folder further down in the notebook.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [10]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases load it
# from the "punkt" resource, newer ones from "punkt_tab"; nltk is installed
# unpinned above, so fetch both to keep the notebook working on either.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
# Read the dataset's README to understand the data format.
# Each line of the file is one tab-separated record; field 2 is used as the
# title and field 6 as the summary.

data_path = "drive/MyDrive/Datasets/PNLP/booksummaries.txt"
mydata = {} #titles-summaries dictionary object
# Use a context manager so the file handle is closed once loading finishes
# (or if an error interrupts it).
with open(data_path, encoding="utf-8") as summaries_file:
    for line in summaries_file:
        # Drop the line terminator so it does not end up inside the summary.
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 7:
            # Blank or truncated record: no summary field to index.
            continue
        # Records sharing a title overwrite each other; the last one wins.
        mydata[fields[2]] = fields[6]

In [18]:
# Prepare the data for doc2vec, then build, train and save a doc2vec model.
# Every summary becomes one TaggedDocument: its tokens, tagged with the title.
train_doc2vec = [
    TaggedDocument(word_tokenize(summary), tags=[title])
    for title, summary in mydata.items()
]

model = Doc2Vec(
    vector_size=50,
    alpha=0.025,
    min_count=10,
    dm=1,
    epochs=100,
)
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model so it can be reloaded without retraining.
model.save("d2v.model")

In [19]:
#Use the model to look for similar texts
model= Doc2Vec.load("d2v.model")

#This is a sentence from the summary of “Animal Farm” on Wikipedia:
#https://en.wikipedia.org/wiki/Animal_Farm
sample = """
Napoleon enacts changes to the governance structure of the farm, replacing meetings with a committee of pigs who will run the farm.
 """
# Tokenize the query the same way as the training summaries, then infer a
# vector for it with the trained model.
new_vector = model.infer_vector(word_tokenize(sample))

# gensim 4 renamed the document-vector store from `docvecs` to `dv`; the old
# name is a deprecated alias there. Prefer `dv` and fall back to `docvecs` so
# the cell also runs on gensim 3.x.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

# List of (tag, similarity) pairs; the tags are the book titles used in training.
sims = doc_vectors.most_similar([new_vector])
print(sims)

[('Animal Farm', 0.7355128526687622), ("Snowball's Chance", 0.616859495639801), ("Family Guy: Stewie's Guide to World Domination", 0.5848122835159302), ('Payback: Debt and the Shadow Side of Wealth', 0.5838175415992737), ('Poor White', 0.5768673419952393), ('The Wild Irish Girl', 0.5647366046905518), ('Settlers in Canada', 0.5579269528388977), ('The Rose in Splendour: a Story of the Wars of Lancaster and York', 0.554655909538269), ('Family Matters', 0.5509009957313538), ('Est: The Steersman Handbook', 0.5481798052787781)]


  sims = model.docvecs.most_similar([new_vector])
