In [1]:
import os
import pathlib

from sklearn.metrics.pairwise import cosine_similarity
import scipy
from psutil import virtual_memory
import numpy as np

import pandas as pd


def check_ram():
    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
        print('Not using a high-RAM runtime')
    else:
        print('You are using a high-RAM runtime!')


In [2]:
md2 = pd.read_csv('md2.csv',
                                    header=0,
                                    index_col=0)

In [3]:
md2.head()

Unnamed: 0,movieId,title,genres,year,rating,tag,combination
0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy...",1995,3.893708,"['neighborhood', 'Disney', 'unlikely friendshi...","['neighborhood', 'Disney', 'unlikely friendshi..."
1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']",1995,3.251527,"['fantasy', 'kid flick', 'childish', 'see also...","['fantasy', 'kid flick', 'childish', 'see also..."
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']",1995,3.142028,"['moldy', 'comedinha de velhinhos engraÃƒÂ§ada...","['moldy', 'comedinha de velhinhos engraÃƒÂ§ada..."
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']",1995,2.853547,"['chick flick', 'divorce', 'based on novel or ...","['chick flick', 'divorce', 'based on novel or ..."
4,5,Father of the Bride Part II (1995),['Comedy'],1995,3.058434,"['contraception', 'family', 'steve martin', 's...","['contraception', 'family', 'steve martin', 's..."


In [4]:
loaded = np.load('cosine_sim_25m.npz')
cosine_sim = loaded['arr_0']

In [5]:
def get_recommendations(title):
        idx = md2.index[md2['title'].str.startswith(title)]
        if idx.empty: 
            idx = md2.index[md2['title'].str.startswith(title.lower())]
        if idx.empty:
            print(f'idx = {idx}, unable to find any movie with that title.')
            return None
        list_of_recommendations = []
        if idx.shape[0] > 5:
            count = 1
        else:
            count = 5
        for curr_count, item in enumerate(idx):
            list_of_recommendations += find_cosine_similarity(item,count)
            if curr_count == 100:
                break
        list_of_recommendations = list(set(list_of_recommendations))
        return md2.iloc[list_of_recommendations]['title']

In [6]:
def get_recommendations_from_combinations(title):
    df = md2[md2['combination'].apply(lambda tag: check(tag, title))]
    # print(f'df = {df}')
    if df.empty:
        print(f'df = {df}, unable to find any movie with that title.')
        return None
    return df.sort_values('rating', ascending=False).head(250)['title']

In [289]:
def get_recommendations_from_combinations_sorted_by_cosim(title):
    selection = title.split()
    # df = md2[md2['combination'].str.contains(title, na=False)]
    df = md2[md2['combination'].apply(lambda tag: check(tag, selection))]
    # print(f'df = {df}')
    if df.empty:
        print(f'df = {df}, unable to find any movie with that title.')
        return None
    return get_cosine_similar_titles(md2)

In [7]:
import re
def check(tagList, input_str):
    if type(tagList) != str or type(input_str) != str:
        return False
    if 'nan,' in tagList:
        tagList = re.sub('nan,', '\'\',', tagList)
    tagList = eval(tagList)
    found_match = False
    if input_str in tagList or input_str.lower() in tagList:
            found_match = True
    if not found_match:
        input_list = input_str.split()
        split_tag_match = True
        for tag in input_list:
            if tag not in tagList and tag.lower() not in tagList:
                split_tag_match = False
                break
        if split_tag_match:
            found_match = True
    return found_match

In [305]:
from collections import defaultdict

def get_cosine_similar_titles(df):
    dict_of_title_to_occurance = defaultdict(int)
    for count, item in enumerate(df.index):
            list_of_recommendations = find_cosine_similarity(item,2)
            for it in list_of_recommendations:
                # print(f'it = {it}, dict_of_title_to_occurance={dict_of_title_to_occurance}')
                dict_of_title_to_occurance[df.iloc[it]['title']]+=1
            if count == 100:
                break
    sorted_list = sorted(dict_of_title_to_occurance.items(), key=lambda x: x[1], reverse=True)
    sorted_dict = dict(sorted_list)
    print(f'sorted_list = {sorted_list[:50]}')
    return sorted_dict.keys()

In [8]:
def find_cosine_similarity(item, count=3):
    sim_scores = list(enumerate(cosine_sim[item]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i[0] for i in sim_scores]
    return movie_indices[:count]

In [9]:
print(get_recommendations('Toy Story').head(100))

0                                 Toy Story (1995)
22439    Toy Story Toons: Hawaiian Vacation (2011)
22440            Toy Story Toons: Small Fry (2011)
3021                            Toy Story 2 (1999)
20431                   Toy Story of Terror (2013)
56144                           Toy Story 4 (2019)
14803                           Toy Story 3 (2010)
23738      Toy Story Toons: Partysaurus Rex (2012)
23740            Toy Story That Time Forgot (2014)
Name: title, dtype: object


In [11]:
print(get_recommendations_from_combinations('Toy Story').head(200))

0                                 Toy Story (1995)
14803                           Toy Story 3 (2010)
17290                                  Hugo (2011)
23738      Toy Story Toons: Partysaurus Rex (2012)
23740            Toy Story That Time Forgot (2014)
22440            Toy Story Toons: Small Fry (2011)
22439    Toy Story Toons: Hawaiian Vacation (2011)
Name: title, dtype: object
