In [2]:
import pandas as pd
import math
import numpy as np
import json

In [3]:
def clean_watch_history(df):
    '''
    Function that cleans a given users watch history data
    Input: dataframe
    Output: (cleaned) dataframe
    '''
    df = df.rename(columns = {"Title": "History"})
    df['Date'] = pd.to_datetime(df['Date'])
    df['Day']= df['Date'].dt.day
    df['Month']= df['Date'].dt.month
    df['Year']= df['Date'].dt.year
    df['Day_of_week'] = df['Date'].dt.dayofweek

    df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
    df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
    df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]

    df['Type'] = df['Episode'].apply(lambda x : 'Movie' if (pd.isna(x)==True) else 'TV')

    tv = df[df['Type']!='Movie']
    tv['Season'] = tv['Season'].str.split().str[1]

    movies = df[df['Type']=='Movie']
    movies['Title'] = movies['History']
    movies['Season'] = None

    df = pd.concat([movies, tv], ignore_index = True)
    return df

In [4]:
def netflix_merge(df):
    '''
    Function that merges given watch history with netflix dataset,
    and returns merged dataset
    '''
    titles = pd.read_csv('titles.csv')
    merged = df.merge(titles, left_on = 'Title', right_on = 'title', how = 'inner')
    cols_to_drop = ['type', 'production_countries', 'imdb_id', 'age_certification', 
                    'id', 'title', 'seasons', 'tmdb_popularity']
    merged = merged.drop(cols_to_drop, axis = 1)
    return merged

In [5]:
def generate_item_id(user_history_csv, movie_lens_csv):
    df = pd.read_csv(user_history_csv)
    df = clean_watch_history(df)
    df = netflix_merge(df)
    df_movies = df[df.Type != 'TV']
    df_movies = pd.DataFrame(df_movies,columns = ['History', 'release_year'])
    df_mltitles = pd.read_csv(movie_lens_csv)
    df_merge = df_mltitles[df_mltitles.movie_title.isin(df_movies.History)]
    df_merge.drop(columns = ['Unnamed: 0'], inplace = True)
    df_merge.rename(columns = {'year': 'release_year'}, inplace = True)
    df_movies.rename(columns = {'History': 'movie_title'}, inplace = True)

    def to_int(x):
        try:
            return int(x)
        except ValueError:
            return np.nan
    df_merge['release_year'] = df_merge['release_year'].apply(to_int)
    df_output = pd.merge(df_movies,df_merge, how = 'inner', on = ['movie_title', 'release_year']).sort_values(by = 'movie_title')

    l = df_output['item_id'].values
    l = np.array(l)
    return l

In [6]:
def collect_master_df():
    df = pd.read_csv('movielens_data_genome/movie_dataset_public_final/scores/glmer.csv')
    titles = df['item_id'].unique()
    return df, titles

In [7]:
def generate_user_watch_history(lines, titles, user_data = 'random', n = 20):
    if type(user_data) == str:
        t = np.random.choice(titles,n)
    else:
        t = user_data
    sample_user = {}
    for line in lines:
        if line['item_id'] in t:
            sample_user[line['item_id']] = line['title']
    return sample_user

In [8]:
def generate_user_history_tag(sample_user, df):
    sample_user_matrix = []
    errors = []
    for movie in sample_user.keys():
        #skip if for some reason the movie does not contain all labels
        if len(df[df.item_id == movie]) != 1084:
            errors.append(movie)
        #if the movie contains all label values, obtain the tag vector from the main df
        else:
            matrix = list(df[df.item_id == movie].score.values)
            sample_user_matrix.append(matrix)

    #turning the list of lists into a matrix as a 2D array
    mat = np.array(sample_user_matrix)

    return errors,mat

In [9]:
def generate_user_pref_vector(mat, f = '2-norm'):

    if f == '2-norm':
        matsq = mat**2
        agg = np.average(matsq, axis = 0)
        aggroot = agg**0.5

        return aggroot
    
    else:
        agg = []

        for col in mat.T:
            val = f(col)
            agg.append(val)
        
        return agg

In [10]:
def generate_master_matrix(df, titles):
    master_mat = []
    for movie in titles:
        if len(df[df.item_id == movie]) != 1084:
            print(f'error {movie}')

        else:
            matrix = list(df[df.item_id == movie].score.values)
            master_mat.append(matrix)
    master_mat = np.array(master_mat)

    return master_mat

In [11]:
def generate_user_movie_interest(master_mat, user_tag_interest, g = '2-norm'):
    if g == '2-norm':
        rows, _ = master_mat.shape
        dist = []
        for r in range(rows):
            d = np.linalg.norm(user_tag_interest-master_mat[r])
            dist.append(d)
        return dist
    
    else:
        dist = []
        for i in master_mat:
            val = g(user_tag_interest,i)
            dist.append(val)
        return dist


In [12]:
def read_metadata():
    lines = []
    for line in open('movielens_data_genome\\movie_dataset_public_final\\raw\\metadata_updated.json', 'r'):
        lines.append(json.loads(line))
    return lines

In [13]:
def generate_recommendation(dists, lines, titles, n_recs = 10, users = 1):
    if users == 1:
        dfresult = pd.DataFrame(titles, columns = ['movie_id'])
        dfresult['score1'] = dists[0]
        dfresult = dfresult.sort_values(by = ['score1'], ascending = False)
        dfrecommend = dfresult[0:n_recs]
        
        movie_titles = {}
        for line in lines:
            if line['item_id'] in dfrecommend.movie_id.values:

                movie_titles[line['item_id']] = line['title']
        
        dfrecommend['movie_title'] = dfrecommend['movie_id'].map(movie_titles)
        dfrecommend.reset_index(inplace = True)
        return dfrecommend
    
    if users == 2:
        dfresult = pd.DataFrame(titles, columns = ['movie_id'])
        dfresult['score1'] = dists[0]
        dfresult['score2'] = dists[1]

        dfresult['average'] = (dfresult['score1'] + dfresult['score2'])/2
        dfresult['dev'] = ((dfresult['score1']-dfresult['average'])**2) + ((dfresult['score2'] - dfresult['average'])**2)
        dfresult = dfresult.sort_values(by = ['average'], ascending = False)

        dfrecommend = dfresult[0:2*n_recs]
        dfrecommend = dfrecommend.sort_values(by= ['dev'], ascending = True)
        dfrecommend = dfrecommend[0:n_recs]

        movie_titles = {}
        for line in lines:
            if line['item_id'] in dfrecommend.movie_id.values:

                movie_titles[line['item_id']] = line['title']
        
        dfrecommend['movie_title'] = dfrecommend['movie_id'].map(movie_titles)
        dfrecommend.reset_index(inplace = True)
        return dfrecommend

In [14]:
def f(a):
    result = 0
    for val in a:
        result+= val**0.5
    
    result = result/np.size(a)
    result = result**2
    return result

In [15]:
master_mat = np.loadtxt('masterlist_ml_matrix.csv', delimiter = ',')
df, titles = collect_master_df()
lines = read_metadata()

In [22]:
def generate_two_user_recommendation(user_history1, user_history2, movie_lens_dataset, master_matrix):
    master_mat = np.loadtxt(master_matrix, delimiter = ',')
    df, titles = collect_master_df()
    lines = read_metadata()
    
    sample_user1 = generate_item_id(user_history1, movie_lens_dataset)
    user1 = generate_user_watch_history(lines, titles, sample_user1)
    errors_user1, mat1 = generate_user_history_tag(user1,df)
    agg1 = generate_user_pref_vector(mat1, f = f)
    dist1 = generate_user_movie_interest(master_mat, agg1)

    sample_user2 = generate_item_id(user_history2, movie_lens_dataset)
    user2 = generate_user_watch_history(lines, titles, sample_user2)
    errors_user2, mat2 = generate_user_history_tag(user2,df)
    agg2 = generate_user_pref_vector(mat2, f = f)
    dist2 = generate_user_movie_interest(master_mat, agg2)

    dfrecommend= generate_recommendation([dist1, dist2], lines, titles, n_recs = 20, users = 2)

    errors = [errors_user1, errors_user2]
    recommend = dfrecommend['movie_title'].values
    return errors, dfrecommend, recommend


In [23]:
output = generate_two_user_recommendation('Sample-History5.csv', 'Sample-History4.csv','titles_movielens.csv', 'masterlist_ml_matrix.csv')

  df['Title'] = df['History'].str.rsplit(': ', 2).str[0]
  df['Season'] = df['History'].str.rsplit(': ', 2).str[1]
  df['Episode'] = df['History'].str.rsplit(': ', 2).str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv['Season'] = tv['Season'].str.split().str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Title'] = movies['History']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [25]:
output[2]

array(['Alexander the Great (1956)', 'RKO 281 (1999)',
       'Snake Pit, The (1948)', 'Set-Up, The (1949)',
       'Donnie Darko (2001)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Arabesque (1966)', 'American History X (1998)',
       'The Machinist (2004)', 'Matrix, The (1999)', 'Memento (2000)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Harper (1966)',
       'Goodfellas (1990)', 'Shawshank Redemption, The (1994)',
       'Late Show, The (1977)', 'Reservoir Dogs (1992)',
       'Dark Knight, The (2008)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       "Schindler's List (1993)"], dtype=object)