## Notebook for Testing Recommendations Implementation

In [53]:
import numpy as np
import matplotlib as plot
import pandas as pd
import pyarrow.parquet as pq
import dill as pickleo
import requests;
import json
from io import StringIO

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [60]:
genres_resp = requests.get("http://localhost:8083/api/v1/ml/assets/genres")
details_resp = requests.get("http://localhost:8083/api/v1/ml/assets/details?count=20000")
genres_json = json.loads(genres_resp.content)
details_json = json.loads(details_resp.content)
genres_df = pd.DataFrame(genres_json)
details_df = pd.DataFrame(details_json)

details_df.replace(np.nan, '', regex=True, inplace=True)

display(genres_df)
display(details_df)

Unnamed: 0,id,name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


Unnamed: 0,id,title,overview,popularity,posterPath
0,3924,Blondie,Blondie and Dagwood are about to celebrate the...,2.583,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg
1,6124,Peter Voss Thief of Millions,,0.841,/6xUbUCvndklbGVYiljHr34NTxSl.jpg
2,8773,Love at Twenty,Love at Twenty unites five directors from five...,4.397,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg
3,25449,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,2.236,/okQY6jVmRU19CUbYPUZC77K3XBE.jpg
4,31975,Sesame Street: Elmo Loves You!,Elmo is making a very very super special surpr...,0.600,/qKWcCmvGr4g0dgXvhqAc4BAMCtk.jpg
...,...,...,...,...,...
19995,31856,Hydro-Puzzle,Warsaw is mysteriously running out of water du...,2.790,/7GI9e4JXjuhUt0dnTFPW70u1odW.jpg
19996,31857,Coyote's Morning,Kuba a young cartoon artist who works as a wai...,3.864,/i5KDFuStGc5fvXdphrOY9xNF7vs.jpg
19997,31858,Made in Britain,After being sent to a detention centre a teena...,7.471,/4knR2kM9WYSwb0UVArlwA8MbFf5.jpg
19998,31859,Anastacia: Live at Last,Anastacia: Live at Last is a DVD collection fr...,0.919,/gCEYjoyj5sCiaB0iwgPrc5RtWJq.jpg


In [7]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

# get the column values to vectorize
titles = details_df['title'].tolist()
overviews = details_df['overview'].tolist()

# do the tfidf stuff
titles_matrix = tfidf_vectorizer.fit_transform(titles)
titles_tokens = tfidf_vectorizer.get_feature_names_out()

overviews_matrix = tfidf_vectorizer.fit_transform(overviews)
overviews_tokens = tfidf_vectorizer.get_feature_names_out()

# create dataframes
titles_df = pd.DataFrame(titles_matrix.toarray(), columns = titles_tokens)
overviews_df = pd.DataFrame(overviews_matrix.toarray(), columns = overviews_tokens)

tokens_df = pd.concat([titles_df, overviews_df], axis=1)

tokens_df

Unnamed: 0,00,01,04,09,10,1000,10000,101,11,1138,...,émigré,étienne,íris,ömer,ötztal,özonuk,übrig,üsker,łódź,şinasi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.metrics.pairwise import linear_kernel

# a matrix of pair-wise similarity scores between movie tokens
token_sim = linear_kernel(tokens_df, tokens_df)

# same for genre similarity
genres_only = genre_df.iloc[:, 2:]
genre_sim = linear_kernel(genres_only, genres_only)

# how much genre similarity affects score
GENRE_SCALE = 0.25

token_sim

array([[2.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 2.        , 0.04758333,
        0.04627886],
       [0.        , 0.        , 0.        , ..., 0.04758333, 2.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04627886, 0.        ,
        2.        ]])

In [32]:
def get_top_n(titles, n):
    
    token_scores = []
    genre_scores = []
    for title in titles:
        # find index of title
        row = details_df.loc[details_df['title'] == title]
        
        # movie title may not exist
        if row.empty:
            continue
        
        index = row.index.tolist()[0]
        token_score = token_sim[index]
        genre_score = genre_sim[index]
        
        token_scores.append(token_score)
        genre_scores.append(genre_score)
    
    # column-wise average of the movies
    avg_token_score = np.mean(token_scores, axis=0)
    avg_genre_score = np.mean(genre_scores, axis=0)
    
    # map each score to the ordered pair (index, score)
    enum_token_score = list(enumerate(avg_token_score))
    enum_genre_score = list(enumerate(avg_genre_score))
        
   
    total_scores = []
    
    for i, token_score in enum_token_score:
        total_scores.append((i, token_score + enum_genre_score[i][1] * GENRE_SCALE))
    
    
    # Sort the movies based on the similarity scores
    top_scores = sorted(total_scores, \
                        key=lambda x: x[1], reverse=True)[len(titles):len(titles) + n+1]

    top_indices = [row[0] for row in top_scores]

    top_titles = details_df.iloc[top_indices, 1]
    return top_titles

In [33]:
get_top_n(["Toy Story 2", "Fight Club"], 5)

706                            Toy Story
4603        Police Story 4: First Strike
4843          The Triplets of Belleville
4388    Spirit: Stallion of the Cimarron
1454                  The Breakfast Club
694                    A Christmas Story
Name: title, dtype: object