# IBM Recommendation Engine Project

In [None]:
def id_standard(num):
    '''
    INPUT:
    num - (int, float or str) an article id in any textual/numeric form

    OUTPUT:
    txt - (str) the id as text, guaranteed to carry a decimal part ("1430" -> "1430.0")

    Description:
    Normalises article ids so that ids read from different files compare equal as strings.
    An id that already contains a decimal point is returned unchanged; an empty string
    (a missing value) is returned unchanged as well.
    '''
    txt = str(num)
    # Checking only the last character would treat "10" or "1430" as already
    # standardised, so look for the decimal point instead.
    if not txt or '.' in txt:
        return txt
    return txt + '.0'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import project_tests as t
import pickle

%matplotlib inline

# Run the article_id column of both files through id_standard while parsing,
# then discard the 'Unnamed: 0' column from each frame.
article_id_str = dict(article_id=id_standard)
df = pd.read_csv(
    'data/user-item-interactions.csv', converters=article_id_str
).drop('Unnamed: 0', axis=1)
df_content = pd.read_csv(
    'data/articles_community.csv', converters=article_id_str
).drop('Unnamed: 0', axis=1)

print(df.shape)

In [None]:
# Report how many df_content rows share an article_id with an earlier row
print('Number of rows in the original df_content: {}'.format(len(df_content)))
print('Number of duplicate articles: {}'.format(df_content.article_id.duplicated().sum()))
# Keep a single row per article_id (the first one encountered)
df_content = df_content.drop_duplicates(subset='article_id')

# Unique articles with at least one interaction (counted over rows whose email is present)
unique_articles = df.loc[df.email.notnull(), 'article_id'].nunique()
# Unique articles on the IBM platform
total_articles = df_content.article_id.nunique()
# Unique users (distinct emails)
unique_users = df.email.nunique()
# Total user-article interactions (one per row of df)
user_article_interactions = len(df)

print('The number of unique articles that have at least one interaction: {}'.format(unique_articles))
print('The number of unique articles on the IBM platform: {}'.format(total_articles))
print('The number of unique users: {}'.format(unique_users))
print('The number of user-article interactions: {}'.format(user_article_interactions))

In [None]:
#Interactions

In [None]:
def get_top_articles(n, df=df):
    '''
    INPUT:
    n - (int) how many articles to return
    df - (pandas dataframe) interactions frame with article_id, title and user_id columns

    OUTPUT:
    top_articles - (list) titles of the 'n' most-interacted-with articles,
                   most interactions first

    Description:
    Counts the user_id entries for every (article_id, title) pair and returns the
    titles of the pairs with the largest counts. Titles come from df, not df_content.
    '''
    interaction_counts = (
        df.groupby(['article_id', 'title'])['user_id']
        .count()
        .reset_index(name='num_interactions')
    )
    # Row labels of the n largest counts, already ordered from largest to smallest
    top_rows = interaction_counts['num_interactions'].nlargest(n).index
    top_articles = interaction_counts.loc[top_rows, 'title'].tolist()

    return top_articles

def get_top_article_ids(n, df=df):
    '''
    INPUT:
    n - (int) the number of top articles to return
    df - (pandas dataframe) interactions frame with article_id and user_id columns

    OUTPUT:
    top_articles - (list) the ids of the top 'n' articles, most interactions first

    Description:
    Counts the user_id entries per article_id and returns the ids with the largest
    counts. Grouping is done on article_id alone so that an id whose title text varies
    between rows is still counted once and can never appear twice in the result.
    '''
    interactions_per_id = df.groupby('article_id')['user_id'].count()
    top_articles = interactions_per_id.nlargest(n).index.tolist()

    return top_articles # Return the top article ids

In [None]:
# create the user-article matrix with 1's and 0's

def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns

    OUTPUT:
    user_item - (pandas dataframe) user-item matrix: one row per user_id, one column
                per article_id, integer values

    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values
    where a user interacted with an article and a 0 otherwise. Repeated interactions
    between the same user and article still produce a single 1.
    '''
    # Interaction count per (user, article); pairs that never occur become 0
    interaction_counts = df.groupby(['user_id', 'article_id']).size().unstack(fill_value=0)
    # Collapse counts to a 0/1 flag and return it as integers
    # (the original discarded the result of its conversion step)
    user_item = (interaction_counts > 0).astype(int)
    return user_item # return the user_item matrix

# Build the module-level user-item matrix once; the recommendation helpers defined
# below read it (as a default argument or as a global).
user_item = create_user_item_matrix(df)

In [None]:
def find_similar_users(user_id, user_item=user_item):
    '''
    INPUT:
    user_id - (int) the user to find neighbours for
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    similar_users - (list) ids of all other users, ordered so that the users with the
                    largest dot product with user_id come first

    Description:
    Takes the dot product of the given user's row with every user's row and returns
    the other user ids sorted by that score, highest first. The given user is
    removed from the result.
    '''
    # One-row frame holding only the requested user's interactions
    is_target = user_item.index == user_id
    target_row = user_item[is_target]

    # Dot product against every user: a 1 x n_users frame of similarity scores
    scores = target_row.dot(user_item.transpose())

    # Reorder the columns (user ids) by the target user's score, best first
    ranked = scores.sort_values(by=user_id, axis=1, ascending=False)
    neighbours = ranked.columns.tolist()

    # A user is not their own neighbour
    neighbours.remove(user_id)
    return neighbours

In [None]:
def get_article_names(article_ids, df=df):
    '''
    INPUT:
    article_ids - (list) article ids to look up
    df - (pandas dataframe) interactions frame with article_id and title columns

    OUTPUT:
    article_names - (list) one title per requested id, in the same order
                    (taken from the title column of the first matching row)
    '''
    article_names = []
    for article_id in article_ids:
        # All titles recorded for this id; the first one is used
        matching_titles = df.loc[df.article_id == article_id, 'title']
        article_names.append(matching_titles.iat[0])
    return article_names


def get_user_articles(user_id, user_item=user_item):
    '''
    INPUT:
    user_id - (int) a user id
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    article_ids - (list) a list of the article ids seen by the user
                  (the column labels of user_item, in column order)
    article_names - (list) a list of article names associated with the list of article ids
                    (this is identified by the title column in df, via get_article_names)

    Description:
    Provides a list of the article_ids and article titles that have been seen by a user.
    A user_id that is not in user_item yields two empty lists.
    '''
    user_rows = user_item[user_item.index == user_id]
    # Flag the columns with a non-zero entry, then map the flags back to the column
    # labels: the labels are the article ids, the positions are not.
    seen_mask = (user_rows != 0).any(axis=0).values
    article_ids = user_item.columns[seen_mask].tolist()
    article_names = get_article_names(article_ids, df=df)

    return article_ids, article_names # return the ids and names


def user_user_recs(user_id, m=10):
    '''
    INPUT:
    user_id - (int) a user id
    m - (int) the number of recommendations you want for the user

    OUTPUT:
    recs - (list) a list of recommended article ids (column labels of user_item),
           at most m long

    Description:
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides them as recs
    Does this until m recommendations are found or every other user has been examined

    Notes:
    Users who are the same closeness are taken in the order find_similar_users returns them

    For the user where the number of recommended articles starts below m
    and ends exceeding m, the articles are taken in user_item column order
    and the surplus is dropped
    '''
    ordered_users = find_similar_users(user_id, user_item=user_item)
    article_labels = user_item.columns

    # User ids are index labels, so the row must be fetched with .loc, not by position
    already_covered = user_item.loc[user_id].astype(bool).values.copy()

    recs = []
    for neighbour_id in ordered_users:
        if len(recs) >= m:
            break
        neighbour_seen = user_item.loc[neighbour_id].astype(bool).values
        # Articles this neighbour has seen that are neither seen by the user
        # nor already recommended
        fresh = neighbour_seen & ~already_covered
        recs.extend(article_labels[fresh].tolist())
        already_covered |= fresh

    # The last neighbour may have pushed the list past m
    return recs[:m] # return your recommendations for this user_id