# Word2Vec v2.5: "Mistake Not"

### Connect to Database

In [78]:
! pip3 install psycopg2-binary --user
import pandas as pd
import psycopg2
import numpy as np
from getpass import getpass

[33mYou are using pip version 19.0.2, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [79]:
# connect to database
connection = psycopg2.connect(
    database  = "postgres",
    user      = "postgres",
    password  = getpass(),
    host      = "movie-rec-scrape.cvslmiksgnix.us-east-1.rds.amazonaws.com",
    port      = '5432'
)
# Enter database password below and press Enter.

 ················


In [80]:
# create cursor that is used throughout
try:
    c = connection.cursor()
    print("Connected!")
except:
    print("Connection problem chief!")

Connected!


### Prepare data and train.
1. Get the list of reviewers whose reviews we want (about 17k)
2. Get the dataframe of reviewers, movie IDs with positive reviews
3. Inner join the above two dataframes.
4. Run the list constructor on the join table to construct the training data.
    - Training data is of this format: [['movieid1', 'movieid2', ...], ...]
5. Train Word2Vec on the list of watch histories (which are themselves lists of movie IDs).
6. Save the model.

In [72]:
# Get reviewers with at least 10 positive reviews (rating 7-10 inclusive)
c.execute("""
select username
from reviews
where user_rating between 7 and 10
group by username
having count(username) >= 10
order by count(username) desc
""")

reviewers = c.fetchall()

In [74]:
# Get positive reviews from database
c.execute("SELECT movie_id, username FROM reviews WHERE user_rating > 6")
result = c.fetchall()

# create reviews dataframe
df = pd.DataFrame(result, columns = ['movieid', 'userid'])
df.head()

Unnamed: 0,movieid,userid
0,5493944,dmldc
1,95016,immortal_saint1
2,5493944,vampyr_vashti
3,5493944,julieclowes
4,5493944,stephgonser


In [76]:
# create reviewers dataframe
df_reviewers = pd.DataFrame(reviewers, columns = ['userid'])

In [77]:
# merge to get only the IDs relevant to training
df = df.merge(df_reviewers, how='inner', on='userid')
df.shape

(904140, 2)

In [22]:
# ! sudo su
# ! yum update -y
# ! yum -y install python-pip
# ! python -V

Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper,
              : versionlock
You need to be root to perform this command.
Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper,
              : versionlock
You need to be root to perform this command.
Python 3.6.5 :: Anaconda, Inc.


In [23]:
# ! which pip

/home/ec2-user/anaconda3/envs/python3/bin/pip


# Install gensim

In [4]:
! python -m pip install tqdm
# ! python -c 'import tqdm'
! python -m pip install gensim

[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [82]:
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [94]:
# list to capture watch history of the users
watched_train = []

# populate the list with the movie codes
for i in tqdm(reviewers):
    temp = df[df["userid"] == i[0]]["movieid"].tolist()
    watched_train.append(temp)
    
len(watched_train)

100%|██████████| 17812/17812 [44:07<00:00,  6.73it/s]


17812

In [95]:
# save the model for later
import pickle
pickle.dump(watched_train, open('watched_train.sav', 'wb'))

In [166]:
# #save the model in protocol 2 so it can be opened in python 2.7
# import pickle
# temp = pickle.load(open('watched_train.sav', 'rb'))
# pickle.dump(temp, open('watched_train.sav', 'wb'), protocol=2)

### Train the Model

**Important:** The previous model was trained on movie IDs that were inside lists of length 1, with watch histories being lists of lists.

This model eschews the inner lists. Each watch history is simply a list of strings.

In [96]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(watched_train, progress_per=200)

model.train(watched_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(8222420, 9041400)

In [97]:
# save word2vec model
model.save("w2v_mistakenot.model")

### Test the model

In [81]:
# load model
import gensim
model = gensim.models.Word2Vec.load("w2v_mistakenot.model")

In [82]:
# prunes the model, making it faster but unable to train any more.
model.init_sims(replace=True)

In [83]:
print(model)

Word2Vec(vocab=24784, size=100, alpha=0.03)


In [84]:
# extract all vectors
X = model[model.wv.vocab]

X.shape

(24784, 100)

In [86]:
# IDs are words in the model, and callable as such.
# model['0110912']

In [87]:
# def get_title(id):
#     """Takes an id string and returns the movie title."""
    
#     try:
#         c.execute(f"""
#         select primary_title, start_year
#         from movies
#         where movie_id = '{id}'""")
#     except:
#         return f"Movie title unknown. ID:{id}"
    
#     t = c.fetchone()
#     title = tuple([t[0], t[1], f"https://www.imdb.com/title/tt{id}/"])
#     return title
    
# def predict(model, input, num_recs=6):
#         """For the input, do the predictions and return them.

#         Args:
#             model: the word2vec model object.
#             input: a list of movie IDs.
#             num_recs: the number of recommendations to return.
#         """

#         def _aggregate_vectors(movies):
#             # get the vector average of the movies in the input.
#             # discard unrecognized IDs.
#             movie_vec = []
#             for i in movies:
#                 try:
#                     movie_vec.append(model[i])
#                 except KeyError:
#                     continue
#             return np.mean(movie_vec, axis=0)

#         def _similar_movies(v, n):
#             # extract most similar movies for the input vector
#             return model.similar_by_vector(v, topn= n+1)[1:]
        
#         def _remove_dupes(recs):
#             # remove any recommendations that were in the input
#             return [x for x in recs if x not in input]
        
        
        
#         # aggregate input and find similar vectors.
#         recs = _similar_movies(_aggregate_vectors(input), num_recs)
#         # get titles
#         recs = [get_title(y[0]) for y in recs] 
#         return recs

In [124]:
class ScoringService(object):
    model = None                # Where we keep the model when it's loaded

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model == None:
            # load the gensim model
            w2v_model = gensim.models.Word2Vec.load("w2v_mistakenot.model")
            # keep only the normalized vectors.
            # This saves memory but makes the model untrainable (read-only).
            w2v_model.init_sims(replace=True)
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
            cls.model = w2v_model
        return cls.model

    @classmethod
    def predict(cls, input, n=20):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""

        clf = cls.get_model()

        def _aggregate_vectors(movies):
            # get the vector average of the movies in the input
            movie_vec = []
            for i in movies:
                try:
                    movie_vec.append(clf[i])
                except KeyError:
                    continue
            return np.mean(movie_vec, axis=0)

        def _similar_movies(v, n = 6):
            # extract most similar movies for the input vector
            return clf.similar_by_vector(v, topn= n+1)[1:]
            
        def _remove_dupes(recs):
            # remove any recommendations that were in the input
            return [x for x in recs if x not in input]

        def _get_info(id):
            """Takes an id string and returns the movie info with a url."""
            try:
                c.execute(f"""
                select primary_title, start_year
                from movies
                where movie_id = '{id[0]}'""")
            except:
                return f"Movie title unknown. ID:{id}"

            t = c.fetchone()
            title = tuple([t[0], t[1], f"https://www.imdb.com/title/tt{id[0]}/"])
            return title
        
        input = [x for x in input] # remove leading zeroes
        recs = _remove_dupes(_similar_movies(_aggregate_vectors(input), n=n))
        recs = [_get_info(x) for x in recs]
        return recs

In [125]:
# test cases

# A list of some Coen Bros movies.
coen_bros = ['116282', '2042568', '1019452', 
             '1403865', '190590', '138524', 
             '335245', '477348', '887883', '101410']

# Data scientist's recent watches.
cooper_recent = ['0053285', '0038650', '0046022', 
                 '4520988', '1605783', '6751668', 
                 '0083791', '0115685', '0051459', 
                 '8772262', '0061184', '0041959',
                 '7775622']

# dirkh public letterboxd recent watches.
dirkh = ['7975244', '8106534', '1489887', 
         '1302006', '7286456', '6751668', 
         '8364368', '2283362', '6146586', 
         '2194499', '7131622', '6857112']

# Marvin watches
marvin = ['7286456', '0816692', '2543164', '2935510', 
          '2798920', '0468569', '5013056', '1375666', 
          '3659388', '0470752', '0266915', '0092675', 
          '0137523', '0133093', '1285016']  

# Gabe watches
gabe = ['6292852','0816692','2737304','3748528',
        '3065204','4154796','1536537','1825683',
        '1375666','8236336','2488496','1772341',
        '0317705','6857112','5052448']

# Eric watches
eric = ['2974050','1595842','0118539','0093405',
        '3216920','1256535','5612742','3120314',
        '1893371','0046248','0058548','0199481',
        '2296777','0071198','0077834']

chuckie = ['4263482',
'0084787',
'3286052',
'5715874',
'1172994',
'4805316',
'3139756',
'8772262',
'7784604',
'1034415',]

harlan = ['1065073','5052448','0470752','5688932','1853728','1596363','0432283','6412452','4633694','9495224','0443453','0063823',
          '0066921','0405296','1130884','1179933','0120630','0268126','0137523','0374900','8772262','0116996','0107290','7339248']

ryan = ['0166924','2866360','0050825','2798920','3416742','0060827','1817273','0338013','0482571','5715874','2316411','4550098']

karyn = ['4425200','0464141','1465522','0093779','0099810','0076759','3748528','6763664','0317740','2798920','0096283','0258463','0118799','0058092','0107290','0045152','0106364']

richard = ['0074119','0064115','0070735','0080474','0061512','0067774','0057115','0070511','0081283',
           '0065126','0068421','0078227','0079100','0078966','0081696','0082085','0072431','0075784',
           '0093640','0098051','0094226','0097576','0099810','0081633','0080761','0077975','0085244','0095159','0101969']

joe = ['6335734','0291350','0113568','0208502','0169858','0095327','0097814','0983213','0094625','7089878']

lena = ['1990314','3236120','1816518','0241527','0097757','0268978','0467406','2543164','2245084','3741834']

wade = ['0118665','0270846','0288441','2287250','2287238','8668804','9448868','1702443','1608290','5519340']

In [126]:
s = ScoringService()

In [127]:
s.predict(input=cooper_recent)

('8266310', 0.7212890386581421)
('7775622', 0.7211458683013916)
('7472352', 0.719699501991272)
('7653254', 0.7180582284927368)
('8637428', 0.716407299041748)
('8613070', 0.715118408203125)
('7715070', 0.7149782776832581)
('5083738', 0.7149302959442139)
('8579674', 0.7147093415260315)
('7046974', 0.7134156227111816)
('5104604', 0.7097740173339844)
('7424200', 0.7097126245498657)
('7975244', 0.7094463109970093)
('4729430', 0.7093826532363892)
('7745068', 0.708999514579773)
('8092252', 0.708191454410553)
('7616974', 0.70567387342453)
('5117222', 0.7053700089454651)
('8151874', 0.7047182321548462)
('5117428', 0.7033519148826599)


[('Blinded by the Light', 2019, 'https://www.imdb.com/title/tt8266310/'),
 ('Free Solo', 2018, 'https://www.imdb.com/title/tt7775622/'),
 ('Shirkers', 2018, 'https://www.imdb.com/title/tt7472352/'),
 ('Marriage Story', 2019, 'https://www.imdb.com/title/tt7653254/'),
 ('The Farewell', 2019, 'https://www.imdb.com/title/tt8637428/'),
 ('Portrait of a Lady on Fire', 2019, 'https://www.imdb.com/title/tt8613070/'),
 ('Horrible Histories: The Movie - Rotten Romans',
  2019,
  'https://www.imdb.com/title/tt7715070/'),
 ('The Favourite', 2018, 'https://www.imdb.com/title/tt5083738/'),
 ('1917', 2019, 'https://www.imdb.com/title/tt8579674/'),
 ('Swing Kids', 2018, 'https://www.imdb.com/title/tt7046974/'),
 ('Isle of Dogs', 2018, 'https://www.imdb.com/title/tt5104604/'),
 ('Teen Titans GO! to the Movies',
  2018,
  'https://www.imdb.com/title/tt7424200/'),
 ('Jumanji: The Next Level', 2019, 'https://www.imdb.com/title/tt7975244/'),
 ('Klaus', 2019, 'https://www.imdb.com/title/tt4729430/'),
 ('My 