### import dependencies

In [1]:
import ast
import os
import random
import pandas as pd
import numpy as np
from surprise import Reader, Dataset
from surprise import KNNWithMeans
from collections import defaultdict
from tqdm import tqdm

### environment variables

In [2]:
DATA_DIR = "/home/rwilson2/data"

### explore rating data

In [3]:
with open(os.path.join(DATA_DIR, "ratings.csv"), 'r') as f:
    rating_lines = f.read().split('\n')

In [4]:
rating_lines[:20]

[',user_id,movie,rating',
 '0,523475,prison+break+the+final+break+2009,4',
 '1,906340,the+warlords+2007,3',
 '2,474539,the+unknown+woman+2011,4',
 '3,7253,the+count+of+monte+cristo+1998,4',
 '4,913783,kill+bill+vol.+2+2004,4',
 '5,938396,under+our+skin+2008,3',
 '6,626411,the+well+1951,4',
 '7,386186,the+twilight+people+1972,3',
 '8,182972,robot++frank+2012,4',
 '9,869570,6ixtynin9+1999,4',
 '10,876529,oceans+2009,4',
 '11,158115,spirited+away+2001,4',
 '12,938323,paradise+faith+2012,3',
 '13,645343,baggage+claim+2013,3',
 '14,46905,a+chinese+ghost+story+1987,3',
 '15,926449,marwencol+2010,4',
 '16,672542,loaded+guns+1975,4',
 '17,208250,floating+weeds+1959,4',
 '18,413940,saving+private+ryan+1998,3']

In [5]:
with open(os.path.join(DATA_DIR, "ratings_grouped_by_user.csv"), 'r') as f:
    rating_group = f.read().split('\n')

In [6]:
pd.read_csv(os.path.join(DATA_DIR, "ratings_grouped_by_user.csv"),)

Unnamed: 0.1,Unnamed: 0,user_id,temp
0,0,10,[{'total+recall+1990': '4'}]
1,1,10000,[{'3+ninjas+kick+back+1994': '3'}]
2,2,100002,"[{'monsters_+inc.+2001': '4'}, {'more+about+th..."
3,3,100003,[{'shutter+island+2010': '3'}]
4,4,100004,[{'bambi+1942': '4'}]
...,...,...,...
516015,516015,99994,"[{'fast_+cheap++out+of+control+1997': '4'}, {'..."
516016,516016,99995,[{'peter+pan+1953': '3'}]
516017,516017,99996,"[{'a+summers+tale+1996': '3'}, {'thor+the+dark..."
516018,516018,99997,"[{'the+hireling+1973': '4'}, {'deception+2008'..."


### Model Goal
From milestone writeup
* `<time>,<userid>,recommendation request <server>, status <200 for success>, result: <recommendations>, <responsetime>` – the user considers watching a movie and a list of recommendations is requested; the recommendations provided by your service are included (or an error message if your service did not provide a valid response)

So we might assume the input is the user id only. Return the most recommended movies (with highest prediction ratings)

In [7]:
# csv to rating dict
grouped_rating_csv = os.path.join(DATA_DIR, "ratings_grouped_by_user.csv")
with open(grouped_rating_csv, 'r') as f:
    grouped_lines = f.read().split('\n')

In [8]:
grouped_df = pd.read_csv(grouped_rating_csv, index_col="Unnamed: 0")

In [9]:
grouped_df.head()

Unnamed: 0,user_id,temp
0,10,[{'total+recall+1990': '4'}]
1,10000,[{'3+ninjas+kick+back+1994': '3'}]
2,100002,"[{'monsters_+inc.+2001': '4'}, {'more+about+th..."
3,100003,[{'shutter+island+2010': '3'}]
4,100004,[{'bambi+1942': '4'}]


In [10]:
N_ratings = len(grouped_df)
CUT_OFF = 25000

In [11]:
with open("rating_dict.csv", 'w') as f:
    f.write("movie,user,rating\n")

In [12]:
with open("rating_dict.csv",'a') as f:
    for i in tqdm(range(1, CUT_OFF)):
        user_id = grouped_df['user_id'][i]
        movies = ast.literal_eval(grouped_df['temp'][i])
        for mv in movies:
            for key in mv:
                f.write(f"{key},{user_id},{mv[key]}\n")

100%|████████████████████████████████████████████████████████████████████████████| 24999/24999 [00:00<00:00, 31375.50it/s]


In [13]:
ratings_df = pd.read_csv('rating_dict.csv')

In [14]:
ratings_df.head()

Unnamed: 0,movie,user,rating
0,3+ninjas+kick+back+1994,10000,3
1,monsters_+inc.+2001,100002,4
2,more+about+the+children+of+noisy+village+1987,100002,4
3,shutter+island+2010,100003,3
4,bambi+1942,100004,4


In [15]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[["user", "movie", "rating"]], reader)

In [16]:
# To use user-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,
}
algo = KNNWithMeans(sim_options=sim_options)

In [17]:
# Train in Batch (Didn't work)
# BS = 20000
# N_BATCH = N_ratings // BS + 1
# for i in range(1):
#     start = i * BS
#     end = start + BS
#     print(f'Training use batch #{i+1}')
#     batch_train_data = data.df.iloc[start:end]
#     trainingSet = Dataset.load_from_df(batch_train_data[["user", "movie", "rating"]], reader).build_full_trainset()
#     algo.fit(trainingSet)

In [18]:
# Train
trainingSet = data.build_full_trainset()
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f14818399d0>

In [21]:
# Predict
prediction = algo.predict(100004, "bambi+1942")
print (prediction.est)

4.055050505050505


In [24]:
# Get the remapped user ID
test_user = 100002
inner_user_id = trainingSet.to_inner_uid(test_user)

# Get the list of all items for the given user
test_user_items = [iid for (uid, iid, r) in trainingSet.all_ratings() if uid == inner_user_id]

# Predict the ratings for all items for the given user
predictions = [algo.predict(inner_user_id, iid).est for iid in test_user_items]

# Sort the items based on their predicted ratings
predictions_with_item_id = list(zip(test_user_items, predictions))
predictions_with_item_id.sort(key=lambda x: x[1], reverse=True)

# Map the remapped item IDs back to their original names
recommended_items = [trainingSet.to_raw_iid(item_id) for (item_id, _) in predictions_with_item_id]
print(recommended_items)


['monsters_+inc.+2001', 'more+about+the+children+of+noisy+village+1987']


In [25]:
# Get the remapped user ID
test_user = 100002
inner_user_id = trainingSet.to_inner_uid(test_user)

# Get the nearest neighbors of the given user
neighbors = algo.get_neighbors(inner_user_id, k=10)

all_ratings = defaultdict(list)
for (user_id, movie_id, rating) in trainingSet.all_ratings():
    all_ratings[user_id].append((movie_id, rating))

# Get the items that the neighbors have rated but the target user has not
target_user_ratings = all_ratings[inner_user_id]
target_user_items = [iid for (iid, r) in target_user_ratings]

neighbor_items = []
for neighbor in neighbors:
    neighbor_ratings = all_ratings[neighbor]
    for (iid, r) in neighbor_ratings:
        neighbor_items.append(iid)

unwatched_items = set(neighbor_items) - set(target_user_items)

# Predict the ratings for all unwatched items for the given user
# predictions = [algo.predict(inner_user_id, iid).est for iid in unwatched_items]
predictions = [algo.predict(test_user, trainingSet.to_raw_iid(iid)).est for iid in unwatched_items]


# Sort the items based on their predicted ratings
predictions_with_item_id = list(zip(unwatched_items, predictions))
predictions_with_item_id.sort(key=lambda x: x[1], reverse=True)

# Map the remapped item IDs back to their original names
# recommended_items = [trainingSet.to_raw_iid(item_id) for (item_id, _) in predictions_with_item_id]
# print('\n'.join(recommended_items))

recommended_items = [(trainingSet.to_raw_iid(item_id), score) for (item_id, score) in predictions_with_item_id]
print("Recommended items for user", test_user, ":")
print("\n")
for item, score in recommended_items:
    print("\t", item, ":", score)


Recommended items for user 100002 :


	 like+father_+like+son+2013 : 5
	 the+hobbit+an+unexpected+journey+2012 : 4.4
	 ashura+2005 : 4.4
	 outsourced+2006 : 4.4
	 death+of+a+nation+-+the+timor+conspiracy+1994 : 4.0
	 kings+and+queen+2004 : 4.0
	 i+saw+the+sun+2009 : 4.0
	 destruction+force+1977 : 4.0
	 the+lord+of+the+rings+the+two+towers+2002 : 4.0
	 the+high+cost+of+living+2010 : 3.666666666666667
	 trainspotting+1996 : 3.666666666666667
	 conversation+piece+1974 : 3.5
	 guest+from+the+future+1984 : 3.4
	 the+triumph+of+love+2001 : 3.0
	 grave+of+the+fireflies+1988 : 3.0


### Save and load recommendation algo

In [26]:
from surprise import dump

# Save the algorithm to disk
dump.dump('25k-user-based-algo.dump', algo=algo)

### Load and use (for deployment team)

In [29]:
from surprise import dump

# Save the algorithm to disk
MODEL_DIR = os.path.abspath('25k-user-based-algo.dump')

_, user_algo = dump.load(MODEL_DIR)
print(MODEL_DIR)

/home/ziangz/code/team12-movie-recommendation/models/25k-user-based-algo.dump


In [30]:
# Get the remapped user ID
test_user = 100002
inner_user_id = trainingSet.to_inner_uid(test_user)

# Get the nearest neighbors of the given user
neighbors = algo.get_neighbors(inner_user_id, k=10)

all_ratings = defaultdict(list)
for (user_id, movie_id, rating) in trainingSet.all_ratings():
    all_ratings[user_id].append((movie_id, rating))

# Get the items that the neighbors have rated but the target user has not
target_user_ratings = all_ratings[inner_user_id]
target_user_items = [iid for (iid, r) in target_user_ratings]

neighbor_items = []
for neighbor in neighbors:
    neighbor_ratings = all_ratings[neighbor]
    for (iid, r) in neighbor_ratings:
        neighbor_items.append(iid)

unwatched_items = set(neighbor_items) - set(target_user_items)

# Predict the ratings for all unwatched items for the given user
predictions = [user_algo.predict(test_user, trainingSet.to_raw_iid(iid)).est for iid in unwatched_items]


# Sort the items based on their predicted ratings
predictions_with_item_id = list(zip(unwatched_items, predictions))
predictions_with_item_id.sort(key=lambda x: x[1], reverse=True)

# Map the remapped item IDs back to their original names

recommended_items = [(trainingSet.to_raw_iid(item_id), score) for (item_id, score) in predictions_with_item_id]
print("Recommended items for user", test_user, ":")
print("\n")
for item, score in recommended_items:
    print("\t", item, ":", score)

Recommended items for user 100002 :


	 like+father_+like+son+2013 : 5
	 the+hobbit+an+unexpected+journey+2012 : 4.4
	 ashura+2005 : 4.4
	 outsourced+2006 : 4.4
	 death+of+a+nation+-+the+timor+conspiracy+1994 : 4.0
	 kings+and+queen+2004 : 4.0
	 i+saw+the+sun+2009 : 4.0
	 destruction+force+1977 : 4.0
	 the+lord+of+the+rings+the+two+towers+2002 : 4.0
	 the+high+cost+of+living+2010 : 3.666666666666667
	 trainspotting+1996 : 3.666666666666667
	 conversation+piece+1974 : 3.5
	 guest+from+the+future+1984 : 3.4
	 the+triumph+of+love+2001 : 3.0
	 grave+of+the+fireflies+1988 : 3.0


### Reference

https://github.com/ckaestne/seai/blob/S2020/recitations/06_Collaborative_Filtering.ipynb

### Out-of-set (OOS) User handling

In [31]:
# with open('../data/movies_clean.csv','r') as f:
all_movies = pd.read_csv('../data/movies_clean.csv')

In [68]:
# get top 100 popular movies
top_100_pop = all_movies.sort_values(by='popularity', ascending=False)[:100]['id'].to_list()
with open('../data/top_imdb.txt','w') as f:
    f.write('\n'.join(top_100_pop))

In [59]:
def generate_movies_OOS(user_id, top_list, n=15):
    
    random.seed(user_id)
    
    selected_movies = random.sample(top_list, n)
    scores = [round(random.uniform(3, 5), 4) for _ in range(len(selected_movies))]

    # create a dictionary to store the movie names and scores
    movie_scores = dict(zip(selected_movies, scores))

    sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_movies

In [67]:
oos_movies = generate_movies_OOS(test_user, top_100_pop)
print("Recommended items for user", test_user, ":")
print("\n")
for item, score in oos_movies:
    print("\t", item, ":", score)

Recommended items for user 1000001 :


	 the+lord+of+the+rings+the+fellowship+of+the+ring+2001 : 4.9408
	 john+wick+2014 : 4.8825
	 once+upon+a+time+in+america+1984 : 4.8581
	 pirates+of+the+caribbean+on+stranger+tides+2011 : 4.7646
	 casino+royale+2006 : 4.4652
	 the+fifth+element+1997 : 4.3222
	 frozen+2013 : 4.2243
	 the+amazing+spider-man+2012 : 4.0644
	 annabelle+2014 : 3.8954
	 rise+of+the+planet+of+the+apes+2011 : 3.8198
	 the+jungle+book+1967 : 3.6071
	 the+imitation+game+2014 : 3.4785
	 the+dark+knight+2008 : 3.4094
	 the+hunger+games+catching+fire+2013 : 3.355
	 harry+potter+and+the+philosophers+stone+2001 : 3.1012


### Full handling (deployment)

In [2]:
from surprise import dump

# Save the algorithm to disk
MODEL_DIR = os.path.abspath('25k-user-based-algo.dump')

with open('/home/ziangz/code/team12-movie-recommendation/data/top_imdb.txt','r') as f:
    top_100_pop = [movie for movie in f.read().split('\n') if movie]

_, user_algo = dump.load(MODEL_DIR)
print(MODEL_DIR)

/home/ziangz/code/team12-movie-recommendation/models/25k-user-based-algo.dump


In [4]:
def generate_movies_OOS(user_id, top_list, n=15):
    
    # same random for every user
    random.seed(user_id) 
    
    selected_movies = random.sample(top_list, n)
    scores = [round(random.uniform(3, 5), 4) for _ in range(len(selected_movies))]

    # create a dictionary to store the movie names and scores
    movie_scores = dict(zip(selected_movies, scores))

    sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_movies

In [5]:
# Get the remapped user ID 
test_user = 99991132 # unknow user
try:
    inner_user_id = trainingSet.to_inner_uid(test_user)

    # Get the nearest neighbors of the given user
    neighbors = algo.get_neighbors(inner_user_id, k=10)

    all_ratings = defaultdict(list)
    for (user_id, movie_id, rating) in trainingSet.all_ratings():
        all_ratings[user_id].append((movie_id, rating))

    # Get the items that the neighbors have rated but the target user has not
    target_user_ratings = all_ratings[inner_user_id]
    target_user_items = [iid for (iid, r) in target_user_ratings]

    neighbor_items = []
    for neighbor in neighbors:
        neighbor_ratings = all_ratings[neighbor]
        for (iid, r) in neighbor_ratings:
            neighbor_items.append(iid)

    unwatched_items = set(neighbor_items) - set(target_user_items)

    # Predict the ratings for all unwatched items for the given user
    predictions = [user_algo.predict(test_user, trainingSet.to_raw_iid(iid)).est for iid in unwatched_items]


    # Sort the items based on their predicted ratings
    predictions_with_item_id = list(zip(unwatched_items, predictions))
    predictions_with_item_id.sort(key=lambda x: x[1], reverse=True)

    # Map the remapped item IDs back to their original names

    recommended_items = [(trainingSet.to_raw_iid(item_id), score) for (item_id, score) in predictions_with_item_id]
    print("Recommended items for user", test_user, ":")
    print("\n")
    for item, score in recommended_items:
        print("\t", item, ":", score)
except:
    oos_movies = generate_movies_OOS(test_user, top_100_pop)
    print("Recommended items for user", test_user, ":")
    print("\n")
    for item, score in oos_movies:
        print("\t", item, ":", score)

Recommended items for user 99991132 :


	 star+wars+1977 : 4.855
	 the+godfather+1972 : 4.8028
	 the+godfather+part+ii+1974 : 4.5488
	 childs+play+3+1991 : 4.5087
	 grown+ups+2010 : 4.33
	 forrest+gump+1994 : 4.0804
	 the+purge+2013 : 3.9185
	 guardians+of+the+galaxy+2014 : 3.8721
	 dark+skies+2013 : 3.7923
	 lucy+2014 : 3.7252
	 the+hunger+games+mockingjay+-+part+1+2014 : 3.6293
	 the+imitation+game+2014 : 3.1147
	 fury+2014 : 3.1104
	 pirates+of+the+caribbean+dead+mans+chest+2006 : 3.0451
	 dilwale+dulhania+le+jayenge+1995 : 3.008
