### Load the movie review dataset and pre-process it with spaCy

In [1]:
import pandas as pd

df = pd.read_csv('movie_data.csv')

df.head(10)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
5,Leave it to Braik to put on a good show. Final...,1
6,Nathan Detroit (Frank Sinatra) is the manager ...,1
7,"To understand ""Crash Course"" in the right cont...",1
8,I've been impressed with Chavez's stance again...,1
9,This movie is directed by Renny Harlin the fin...,1


In [2]:
corpus = list(df.review)[:10000]

In [3]:
import re
import en_core_web_sm

nlp = en_core_web_sm.load()

def pre_process(s):
    """Clean a raw string and parse it with the module-level spaCy pipeline.

    Every run of non-word characters is collapsed into a single space before
    parsing, so punctuation characters never reach the parser.

    s: raw text (a review or a user query).
    Returns whatever `nlp` returns for the cleaned string.
    """
    non_word_run = re.compile(r'\W+')
    cleaned = non_word_run.sub(' ', s)
    return nlp(cleaned)

pre_process('This is a simple preprocessing-function.')

This is a simple preprocessing function 

In [4]:
from tqdm import tqdm

# Parse every review once up front; each entry keeps the review's position,
# its raw text and its parsed form together as (idx, review, doc).
docs = []
review_stream = tqdm(enumerate(corpus), total = len(corpus))
for idx, review in review_stream:
    doc = pre_process(review)
    docs.append((idx, review, doc))

100%|██████████| 10000/10000 [13:20<00:00, 12.49it/s]


In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stopword_set = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shangjingbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from collections import defaultdict
from math import log, sqrt
import numpy as np
from numpy import linalg as LA
from gensim.models import Word2Vec, LdaModel
from gensim.corpora import Dictionary

### Fit IDF based on all the docs that we have

In [7]:

def fit_IDF(docs, min_df = 1):
    """Compute inverse document frequencies over the lemmas of docs.

    docs: sequence of (idx, raw_text, parsed_doc) tuples.
    min_df: smallest document frequency a lemma needs to be kept.

    Tokens whose lower-cased text is in the module-level `stopword_set` are
    ignored. A lemma is kept only if its document frequency is at least
    min_df and it does not appear in every document; any other lemma is
    reported on stdout and left out of the index.

    Returns a defaultdict(float) mapping lemma -> log(N / document frequency).
    """
    print(f'# of docs = {len(docs)}')

    # Document frequency: in how many docs does each lemma appear at least once?
    DF = defaultdict(float)
    for (_, _, parsed) in docs:
        lemmas_in_doc = {
            token.lemma_
            for token in parsed
            if token.text.lower() not in stopword_set
        }
        for lemma in lemmas_in_doc:
            DF[lemma] += 1

    IDF = defaultdict(float)
    for lemma, doc_freq in DF.items():
        if min_df <= doc_freq < len(docs):
            IDF[lemma] = log(len(docs) / doc_freq)
        else:
            print('boilerplate token =', lemma)

    print(f'# of words in index: {len(IDF)}')
    return IDF

IDF = fit_IDF(docs)

# of docs = 10000
# of words in index: 53587


In [8]:
print(IDF['movie'])

0.44036716011742927


In [9]:
print(IDF['impressive'])

4.080441657053109


In [10]:
print(IDF['fascinating'])

4.268697949366879


### Fit a word2vec model based on all the docs that we have

In [13]:
def fit_w2v(docs, dimension):
    """Train a word2vec model on the lemmatised sentences of docs.

    docs: sequence of (idx, raw_text, parsed_doc) tuples; each parsed doc
        must expose `.sents`, and every token a `.lemma_`.
    dimension: size of the word vectors to learn.

    Returns the trained Word2Vec model.
    """
    # One training sentence per parsed sentence, each a list of lemmas.
    sentences = [
        [token.lemma_ for token in sent]
        for (_, _, parsed) in docs
        for sent in parsed.sents
    ]

    settings = dict(
        min_count=1,
        window=5,
        vector_size=dimension,
        sample=6e-5,
        alpha=0.1,
        min_alpha=0.0007,
        negative=20,
    )
    w2v_model = Word2Vec(**settings)

    print(f'# of sentences for word2vec = {len(sentences)}')
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(
        sentences,
        total_examples=w2v_model.corpus_count,
        epochs=100,
        report_delay=1,
    )
    print(f'traind: {w2v_model}')
    return w2v_model

w2v = fit_w2v(docs, 100)
print(w2v.wv.similarity('good', 'bad'))
print(w2v.wv.similarity('good', 'great'))

# of sentences for word2vec = 46515
traind: Word2Vec(vocab=53832, vector_size=100, alpha=0.1)
0.67344487
0.81230116


In [21]:
print(w2v.wv.similarity('love', 'romantic'))

0.35854912


In [15]:
print(w2v.wv.similarity('love', 'sport'))

0.037585683


### Fit an LDA model based on all the docs that we have

In [16]:


def build_lda(docs, num_topics = 5, verbose = False):
    """Train a gensim LDA topic model on the lemmas of docs.

    docs: iterable of (idx, review, parsed_doc) tuples; only the parsed doc
        of each tuple is used, one lemma per token.
    num_topics: number of topics to fit.
    verbose: when True, additionally compute the topics' coherence, print the
        average and pretty-print the per-topic word lists.

    Returns (model, dictionary): the trained LdaModel and the filtered
    Dictionary that was used to build the bag-of-words corpus.
    """
    corpus = [[token.lemma_ for token in doc] for (idx, review, doc) in docs]

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(corpus)

    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary. Indexing the dictionary once is only
    # there to make it "load" its id2token mapping before we read it.
    dictionary[0]
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )

    if verbose:
        # top_topics() is needed for reporting only, so it is computed here
        # rather than on every call (its result was previously discarded
        # whenever verbose was False).
        top_topics = model.top_topics(corpus) #, num_words=20)

        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)

        from pprint import pprint
        pprint(top_topics)

    return model, dictionary

    
model = build_lda(docs, num_topics = 20, verbose = True)

Number of unique tokens: 5624
Number of documents: 10000
Average topic coherence: -2.1454.
[([(0.013858406, 'bad'),
   (0.010316879, 'like'),
   (0.009300658, 'just'),
   (0.008853969, 'out'),
   (0.0082194125, 'if'),
   (0.0082079, 'so'),
   (0.008100217, 'or'),
   (0.0075584995, 'get'),
   (0.0073678764, 'there'),
   (0.0073633376, 'can'),
   (0.0072571156, 'even'),
   (0.007237562, 'watch'),
   (0.0069317874, 'an'),
   (0.0066149174, 'some'),
   (0.0063747526, 'from'),
   (0.006041778, 'up'),
   (0.0060234186, 'only'),
   (0.0059688915, 'time'),
   (0.0054462804, 'look'),
   (0.005430357, 'no')],
  -0.8220963580191976),
 ([(0.01506825, 'bad'),
   (0.014951323, 'no'),
   (0.014495276, 'there'),
   (0.011904428, 'or'),
   (0.011081049, 'who'),
   (0.010252665, 'character'),
   (0.009800223, 'even'),
   (0.008377153, 'so'),
   (0.0077555715, 'scene'),
   (0.006867061, 'by'),
   (0.0068546394, 'any'),
   (0.0065915072, 'only'),
   (0.0064624404, 'like'),
   (0.0063721887, 'an'),
   (0.0

### Let's assemble all these together and build a very simple search engine!

**Input**: 
- A query from user

**Output**:
- A ranked list of documents


In [22]:


class SimpleSearchEngine:
    """Toy search engine that ranks documents against a free-text query.

    The score of a (document, query) pair is a weighted sum of three
    similarities: a dot product of sparse normalized TF-IDF vectors, the
    cosine of IDF-weighted averaged word2vec vectors, and the cosine of LDA
    topic vectors. Call fit() before retrieve_ranklist() / get_similarity().
    """

    def __init__(self, dimension = 100, topics = 10):
        """
        dimension: size of the word2vec vectors trained by fit().
        topics: number of LDA topics trained by fit().
        """
        self.IDF = {}
        self.dimension = dimension
        self.topics = topics
        # Filled in by fit(); None until then.
        self.w2v = None
        self.lda = None
        self.dictionary = None

    def fit(self, docs, min_df = 1):
        """Train the IDF table, the word2vec model and the LDA model on docs.

        docs: sequence of (idx, raw_text, parsed_doc) tuples.
        min_df: minimum document frequency, forwarded to fit_IDF.
        """
        self.IDF = fit_IDF(docs, min_df)
        self.w2v = fit_w2v(docs, self.dimension)
        self.lda, self.dictionary = build_lda(docs, num_topics = self.topics)

    def retrieve_ranklist(self, query, docs, weight_tfidf = 0.8, weight_word2vec = 0.1, weight_lda = 0.1):
        """Rank docs by decreasing similarity to a raw query string.

        query: raw query text; it is run through pre_process() first.
        docs: iterable of (idx, raw_text, parsed_doc) tuples.
        weight_*: coefficients of the three similarity scores.

        Returns a list of (idx, raw_text, similarity) tuples, best match first.
        """
        weights = (weight_tfidf, weight_word2vec, weight_lda)
        parsed = pre_process(query)
        # The query's representations do not depend on the document, so they
        # are built once here instead of once per document.
        query_vectors = self._vectorize(parsed, *weights)
        order = []
        for (idx, raw_text, doc) in docs:
            sim = self._combine(self._vectorize(doc, *weights), query_vectors, *weights)
            order.append((idx, raw_text, sim))
        order.sort(key = lambda x : x[2], reverse = True)
        return order

    def get_similarity(self, doc, query, weight_tfidf, weight_word2vec, weight_lda):
        """Return the similarity between doc and query as a linear
        combination of the three similarity scores.

        doc: It is assumed to be parsed by Spacy
        query: It is assumed to be parsed by Spacy
        weight_*: coefficients of the TF-IDF, word2vec and LDA scores; a
            score whose weight is 0 is not computed at all.
        """
        weights = (weight_tfidf, weight_word2vec, weight_lda)
        return self._combine(
            self._vectorize(doc, *weights),
            self._vectorize(query, *weights),
            *weights
        )

    def _vectorize(self, doc, weight_tfidf, weight_word2vec, weight_lda):
        """Build the (tfidf, word2vec, lda) representations of doc; a slot is None when its weight is 0."""
        return (
            self.get_tfidf(doc) if weight_tfidf != 0 else None,
            self.get_word2vec(doc) if weight_word2vec != 0 else None,
            self.get_topics(doc) if weight_lda != 0 else None,
        )

    def _combine(self, doc_vectors, query_vectors, weight_tfidf, weight_word2vec, weight_lda):
        """Turn two _vectorize() results into the weighted similarity score."""
        doc_tfidf, doc_w2v, doc_lda = doc_vectors
        query_tfidf, query_w2v, query_lda = query_vectors
        sim_tfidf = sim_word2vec = sim_lda = 0.0
        if weight_tfidf != 0:
            # Both vectors are already L2-normalized, so the dot product over
            # shared lemmas is their cosine.
            sim_tfidf = sum(w * query_tfidf[lemma]
                            for lemma, w in doc_tfidf.items()
                            if lemma in query_tfidf)
        if weight_word2vec != 0:
            sim_word2vec = self._cosine(doc_w2v, query_w2v)
        if weight_lda != 0:
            sim_lda = self._cosine(doc_lda, query_lda)
        return sim_tfidf * weight_tfidf + sim_word2vec * weight_word2vec + sim_lda * weight_lda

    @staticmethod
    def _cosine(a, b):
        """Cosine of two dense vectors; 0.0 (instead of NaN) when either one is all zeros."""
        norm_a = LA.norm(a)
        norm_b = LA.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return np.dot(a, b) / norm_a / norm_b

    def get_tfidf(self, doc):
        """Return a sparse, L2-normalized TF-IDF vector for doc.

        doc: It is assumed to be parsed by Spacy
        Returns a defaultdict(float) keyed by lemma; lemmas missing from
        self.IDF are ignored, so the result can be empty.
        """
        ret = defaultdict(float)
        for token in doc:
            if token.lemma_ in self.IDF:
                ret[token.lemma_] += self.IDF[token.lemma_]
        for token in ret:
            ret[token] /= len(doc)
        s = sqrt(sum(w * w for w in ret.values()))
        # Skip the normalization when every weight is zero (division by zero).
        if s > 0:
            for token in ret:
                ret[token] /= s
        return ret

    def get_word2vec(self, doc):
        """Return the IDF-weighted average of the word2vec vectors of doc's lemmas.

        doc: It is assumed to be parsed by Spacy
        Uses the word2vec model trained during fit(). Lemmas missing from
        self.IDF or from the word2vec vocabulary are skipped; if nothing is
        left the result is the zero vector of length self.dimension.
        """
        vec = np.zeros(self.dimension)
        total_weight = 0
        for token in doc:
            lemma = token.lemma_
            if lemma not in self.IDF:
                continue
            try:
                word_vec = self.w2v.wv[lemma]
            except KeyError:
                # Lemma has an IDF weight but no embedding: ignore it.
                continue
            weight = self.IDF[lemma]
            vec = vec + weight * word_vec
            total_weight += weight
        if total_weight > 0:
            vec = vec / total_weight
        return vec

    def get_topics(self, doc):
        """Return a dense vector of length self.topics with doc's LDA topic weights.

        doc: It is assumed to be parsed by Spacy
        Uses the LDA model and dictionary trained during fit(). Topics that
        get_document_topics() does not report stay at 0.
        """
        tokens = [token.lemma_ for token in doc]
        bow = self.dictionary.doc2bow(tokens)
        ret = np.zeros(self.topics)
        for (index, value) in self.lda.get_document_topics(bow):
            ret[index] += value
        return ret

### Build a search engine for all reviews

In [23]:
engine = SimpleSearchEngine()
engine.fit(docs)

# of docs = 10000
# of words in index: 53587
# of sentences for word2vec = 46515
traind: Word2Vec(vocab=53832, vector_size=100, alpha=0.1)
Number of unique tokens: 5624
Number of documents: 10000


In [24]:
# TF-IDF only: the word2vec and LDA weights are 0, so only the TF-IDF term of
# the linear combination contributes to the score.
query = 'impressive classical romantic movie'
ranks = engine.retrieve_ranklist(query, docs, weight_tfidf = 1, weight_lda = 0, weight_word2vec = 0)

print('Query =', query)
print('---------------------------------------------------------')
print('Similarity\tID\tDescription')
# ranks is sorted by decreasing similarity, so the first five entries are the best matches.
for (idx, description, similarity) in ranks[:5]:
    print('%.4f\t%s\t%s' % (similarity, idx, description))
print('---------------------------------------------------------\n\n')

Query = impressive classical romantic movie
---------------------------------------------------------
Similarity	ID	Description
0.1048	8368	Oh boy ! It was just a dream ! What a great idea ! Mr Lynch is very lucky most people try to tell classical stories. This way he can play with his little plantings and his even more little payoffs. Check out Polanski's "The lodger" for far more intelligent mix of fantasy and reality.
0.0837	206	...an incomprehensible script (when it shouldn't have been) dependent on a rather flaky voice-over.<br /><br />The animation, however, show real talent.<br /><br />Quite visually impressive.
0.0813	7730	No wonder a lot of us hate classical music; and what are the children to think? With "educational" PR like this, serious music will soon slip from life support to the morgue. Kids know when they're being talked down to, and this is no exception; why can't someone good do a movie about classical music for kids? I must admit, I enjoyed the actor who played Beet

In [25]:
# LDA only: the TF-IDF and word2vec weights are 0, so only the topic-vector
# term of the linear combination contributes to the score.
query = 'impressive classical romantic movie'
ranks = engine.retrieve_ranklist(query, docs, weight_tfidf = 0, weight_lda = 1, weight_word2vec = 0)

print('Query =', query)
print('---------------------------------------------------------')
print('Similarity\tID\tDescription')
# ranks is sorted by decreasing similarity, so the first five entries are the best matches.
for (idx, description, similarity) in ranks[:5]:
    print('%.4f\t%s\t%s' % (similarity, idx, description))
print('---------------------------------------------------------\n\n')

Query = impressive classical romantic movie
---------------------------------------------------------
Similarity	ID	Description
0.9833	7224	I really like this film because of all the stars and the dancing and the story that goes along with it. Rita Hayworth was at her most glamorous in this musical and the costumes were gorgeous. Although a musical, I thought Rita Hayworth did a fine performance of dramatic acting in this film as well. As far as her dancing, I think she was excellent. Even Betty Grable pretty much endorsed Rita's dancing in this film as she commented that Rita danced rings around her own dancing and let's face it, Betty Grable was an excellent dancer. The cinematography and vivid colors are also noted. Rita wants to be a cover girl for a magazine but she's also in love with her mentor played by Gene Kelly. Does she leave Kelly to fulfill her dream and bypass love and Broadway stardom or does she stick around to find that unique pearl that will change her life forever? 

In [26]:
# word2vec only: the TF-IDF and LDA weights are 0, so only the word2vec term
# of the linear combination contributes to the score.
query = 'impressive classical romantic movie'
ranks = engine.retrieve_ranklist(query, docs, weight_tfidf = 0, weight_lda = 0, weight_word2vec = 1)

print('Query =', query)
print('---------------------------------------------------------')
print('Similarity\tID\tDescription')
# ranks is sorted by decreasing similarity, so the first five entries are the best matches.
for (idx, description, similarity) in ranks[:5]:
    print('%.4f\t%s\t%s' % (similarity, idx, description))
print('---------------------------------------------------------\n\n')

Query = impressive classical romantic movie
---------------------------------------------------------
Similarity	ID	Description
0.5119	9130	A recent viewing of THAT'S ENTERTAINMENT has given me the urge to watch many of the classic MGM musicals from the forties and fifties. ANCHORS AWEIGH is certainly a lesser film than ON THE TOWN. The songs aren't as good, nor is the chemistry between the characters. But the film beautifully interweaves classical favorites, such as Tchaikovsky. And the scene at the Hollywood Bowl, with Sinatra and Kelly emerging from the woods above it at the top, and then running down the steps, while dozens of pianists play on the piano, is the best scene in the film, even though the scene in which Kelly dances with Jerry Mouse is more famous. Classical music enthusiasts will no doubt identify the music the pianists are playing. Sinatra then croons, "I Fall in Love Too Easily," before having his epiphany about whom he loves. The color is beautiful, Hollywood looks 

In [27]:
# Combined ranking: all three similarities contribute, with TF-IDF weighted
# most heavily (0.85) and word2vec (0.1) and LDA (0.05) as smaller terms.
query = 'impressive classical romantic movie'
ranks = engine.retrieve_ranklist(query, docs, weight_tfidf = 0.85, weight_lda = 0.05, weight_word2vec = 0.1)

print('Query =', query)
print('---------------------------------------------------------')
print('Similarity\tID\tDescription')
# ranks is sorted by decreasing similarity, so the first five entries are the best matches.
for (idx, description, similarity) in ranks[:5]:
    print('%.4f\t%s\t%s' % (similarity, idx, description))
print('---------------------------------------------------------\n\n')

Query = impressive classical romantic movie
---------------------------------------------------------
Similarity	ID	Description
0.1609	9130	A recent viewing of THAT'S ENTERTAINMENT has given me the urge to watch many of the classic MGM musicals from the forties and fifties. ANCHORS AWEIGH is certainly a lesser film than ON THE TOWN. The songs aren't as good, nor is the chemistry between the characters. But the film beautifully interweaves classical favorites, such as Tchaikovsky. And the scene at the Hollywood Bowl, with Sinatra and Kelly emerging from the woods above it at the top, and then running down the steps, while dozens of pianists play on the piano, is the best scene in the film, even though the scene in which Kelly dances with Jerry Mouse is more famous. Classical music enthusiasts will no doubt identify the music the pianists are playing. Sinatra then croons, "I Fall in Love Too Easily," before having his epiphany about whom he loves. The color is beautiful, Hollywood looks 