In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from glob import glob
from collections import Counter
import matplotlib.pyplot as plt
from scipy import stats
import nltk

In [3]:
# Read the recommender train and test splits (both are tab-separated files)
train_df, test_df = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('./Dataset/recommender_train.tsv', './Dataset/recommender_test.tsv')
]

In [4]:
# Sanity check: (rows, columns) of the freshly loaded training frame
train_df.shape

(2344, 3)

In [4]:
# Read the questions table (tab-separated, despite the .csv extension)
# and discard its first column in the same expression
ques_df = pd.read_csv('./Dataset/web_science_dataset.csv', sep='\t').iloc[:, 1:]

In [5]:
# Preview the first rows of the questions table (its first column was dropped on load)
ques_df.head()

Unnamed: 0,answer,answerId,answerUrl,category,categoryId,question,questionId,questionUrl
0,A number of injuries have been attributed to t...,14139,https://skeptics.stackexchange.com/questions/1...,medical-science,2,Can headbanging cause brain damage?,14138,https://skeptics.stackexchange.com/questions/1...
1,The Shangri-La diet depends on two theories:\n...,16121,https://skeptics.stackexchange.com/questions/1...,nutrition,0,Does the Shangri-La diet work (according to it...,10103,https://skeptics.stackexchange.com/questions/1...
2,This question has remained unanswered yet not ...,22322,https://skeptics.stackexchange.com/questions/1...,psychology,4,"Can phobias be genetic, but created in one gen...",18713,https://skeptics.stackexchange.com/questions/1...
3,The&nbsp;40% figure most likely comes from Pew...,36011,https://skeptics.stackexchange.com/questions/3...,climate-change,1,Do 40% of U.S. Americans think that global war...,36010,https://skeptics.stackexchange.com/questions/3...
4,The claims\n\n\nevery time the same water is b...,11119,https://skeptics.stackexchange.com/questions/1...,nutrition,0,Does boiling the same water twice make it dang...,11118,https://skeptics.stackexchange.com/questions/1...


In [6]:
# Attach question text and category to every rating row.
# The rating files call the key 'questionID', the questions table 'questionId'.
ques_lookup = ques_df[["question","questionId","categoryId"]]
complete_train_df = train_df.merge(ques_lookup, how='inner', left_on='questionID', right_on='questionId')
complete_test_df = test_df.merge(ques_lookup, how='inner', left_on='questionID', right_on='questionId')

In [7]:
# Row/column counts of the merged train and test frames, in that order
for merged_df in (complete_train_df, complete_test_df):
    print(merged_df.shape)

(2344, 6)
(554, 6)


In [8]:
# Normalise the question text of both frames the same way:
# lower-case, delete the listed quote / punctuation / stray accented characters,
# then trim surrounding whitespace
for frame in (complete_train_df, complete_test_df):
    frame.loc[:, 'question'] = (
        frame['question']
        .str.lower()
        .str.replace('"|Ô|Ç|Ø|Â|”|“|,|£', '', regex=True)
        .str.strip()
    )

In [69]:
# Preview the merged training frame (ratings joined with question text and category)
complete_train_df.head()

Unnamed: 0,userID,questionID,rating,question,questionId,categoryId
0,5205,17488,2,does eating soy affect men's virility?,17488,0
1,9830,17488,1,does eating soy affect men's virility?,17488,0
2,7445,17488,2,does eating soy affect men's virility?,17488,0
3,5205,8080,2,are there an unusually large number of people ...,8080,4
4,7777,8080,2,are there an unusually large number of people ...,8080,4


In [70]:
#Using the Vectorizer, convert the data into numeric vectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import text
from scipy.spatial.distance import cosine

# Stop words
stop_words = text.ENGLISH_STOP_WORDS

# Single stemmer instance shared by every tokenize() call; building a new
# PorterStemmer per token was loop-invariant work.
_stemmer = PorterStemmer()

# Tokenizer
def tokenize(text):
    """Split ``text`` into NLTK word tokens, drop stop words and Porter-stem the rest.

    The stop-word test is applied to the raw token, before stemming.
    Punctuation tokens produced by ``nltk.word_tokenize`` are kept.

    Note: the parameter name shadows the ``sklearn.feature_extraction.text``
    module inside this function only; the module is not used here.

    Parameters
    ----------
    text : str
        The string to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    return [
        _stemmer.stem(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    ]

# TF-IDF
# token_pattern=None: the default regex is unused once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a UserWarning.
vectorizer = TfidfVectorizer(tokenizer = tokenize, token_pattern = None)


# Function to create user_profiles per category
def _compute_user_profile(_train_df):
    """Build one averaged TF-IDF profile vector per user from their rating-3 questions.

    Only rows with ``rating == 3`` are used (presumably the "liked" rating --
    confirm against the dataset description). The module-level ``vectorizer``
    is re-fitted on the distinct question texts of those rows, so the
    vocabulary (and therefore the number of columns) depends on the frame
    passed in.

    Parameters
    ----------
    _train_df : pandas.DataFrame
        Must provide the columns ``userID``, ``question`` and ``rating``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(user_ids, profiles)``. ``user_ids`` is 1-D with one entry per user
        that has at least one rating-3 row; ``profiles`` is always 2-D with
        shape ``(len(user_ids), n_terms)``, row ``r`` belonging to
        ``user_ids[r]``. When no row has rating 3 the result is an empty id
        array and a ``(0, 0)`` matrix.
    """
    # Keep only the rows rated 3; other ratings do not contribute to a profile
    liked_df = _train_df[_train_df['rating'] == 3]

    # Nothing to fit on: return empty results rather than letting the
    # vectorizer fail on an empty document list
    if liked_df.empty:
        return np.array([]), np.empty((0, 0))

    # Distinct question texts and their row position in the TF-IDF matrix
    ques_list = list(set(liked_df['question'].values))
    ques_dict = {ques: i for i, ques in enumerate(ques_list)}

    # Apply the vectorizer on the question list
    train_tfidf_matrix = vectorizer.fit_transform(ques_list)

    # Store results
    userid_list = []
    profile_rows = []

    # Group by the plain column name (not a one-element list) so the group key
    # is the scalar user id on every pandas version
    for userID, group in liked_df.groupby('userID'):

        # TF-IDF rows of every question this user rated 3 (repeats included)
        row_ids = [ques_dict[question] for question in group['question'].values]

        # Average the selected rows; np.asarray(...).ravel() turns the 1 x n
        # result into a flat dense vector
        mean_vector = train_tfidf_matrix[row_ids, :].mean(axis=0)

        userid_list.append(userID)
        profile_rows.append(np.asarray(mean_vector).ravel())

    # vstack keeps the result 2-D even when there is a single user
    return np.array(userid_list), np.vstack(profile_rows)


# Function to compute user similarity
def _compute_user_similarity(profile_mat):
    """Return the pairwise cosine-similarity matrix of the rows of ``profile_mat``.

    Parameters
    ----------
    profile_mat : array-like
        One profile vector per row, shape ``(n_users, n_terms)``. A 1-D input
        is treated as a single profile.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_users, n_users)`` matrix with values in ``[-1, 1]``.
        A row with zero norm has similarity 0 to every row (itself included)
        instead of NaN, so the result can be ranked safely.
    """
    profiles = np.atleast_2d(np.asarray(profile_mat, dtype=float))

    # Row norms; zero norms are replaced by 1 so the division below leaves
    # all-zero rows untouched instead of producing NaN
    norms = np.linalg.norm(profiles, axis=1)
    safe_norms = np.where(norms > 0, norms, 1.0)
    unit_rows = profiles / safe_norms[:, np.newaxis]

    # Dot products of unit vectors are the cosine similarities
    _sim = unit_rows.dot(unit_rows.T)

    # Remove tiny floating-point overshoot outside [-1, 1]
    return np.clip(_sim, -1.0, 1.0)

In [77]:
# Get all unique categories
category_list = list(set(complete_train_df['categoryId'].values))

# Number of most-similar users ("friends") whose rating-3 questions are recommended
k = 5

# Evaluation dict: eval[category][userID] -> DataFrame of recommended rows.
# NOTE: this name shadows the builtin eval(); it is kept because the
# evaluation cell reads it under this name.
eval = {}

# Iterate over different categories
for cat in category_list:

    # Init
    eval[cat] = {}

    # compute the user profiles from this category's ratings only
    _train_df = complete_train_df[(complete_train_df['categoryId'] == cat)]
    uid_list, profile_mat = _compute_user_profile(_train_df)

    # Flatten the ids in case they arrive as an (n, 1) array
    uid_list = np.asarray(uid_list).ravel()

    # No user has a profile in this category: leave eval[cat] empty
    if len(uid_list) == 0:
        continue

    # Guarantee one row per user even if a single profile came back 1-D
    profile_mat = np.asarray(profile_mat).reshape(len(uid_list), -1)

    # Compute user_similarity
    sim_mat = _compute_user_similarity(profile_mat)

    # Compute top-k friends for each user and create recommended set
    for i, user in enumerate(uid_list):

        # Rank the other users by similarity, best first. NaN scores are
        # pushed to the end, and the user is removed by index rather than by
        # assuming they sort first (ties would break that assumption).
        scores = np.where(np.isnan(sim_mat[i, :]), -np.inf, sim_mat[i, :])
        ranked = scores.argsort()[::-1]
        k_friends_ids = ranked[ranked != i][:k]
        k_friends = uid_list[k_friends_ids]

        # Recommended set: every rating-3 row of those friends, across all
        # categories (the evaluation filters by category afterwards)
        _recomm_df = complete_train_df[(complete_train_df['userID'].isin(k_friends))&(complete_train_df['rating'] == 3)]

        # Add to eval
        eval[cat][user] = _recomm_df

In [78]:
# Computing performance
# Group by the plain column name: a one-element list key makes newer pandas
# yield 1-tuples as group keys, which would never match the keys of eval[cat].
grouped = complete_test_df.groupby('userID')

# metrics: total_accuracy_bycat[i][j] is the mean per-user accuracy when the
# recommendations built from category_list[i] are tested on category_list[j]
total_accuracy_bycat = np.zeros((len(category_list),len(category_list)))

# UID_test
uid_test = list(set(complete_test_df['userID'].values))

# Iterate over different categories
for i,cat in enumerate(category_list):

    for j,test_cat in enumerate(category_list):

        # Init
        accuracy = 0
        user_count = 0

        # Iterate over the users
        for userID, full_group in grouped:

            # Skip users that received no recommendations for this category
            if userID not in eval[cat]:
                continue

            # Test group, by category
            group = full_group[(full_group['categoryId'] == test_cat)]
            if group.shape[0] == 0:
                continue

            # Questions the user marked 'yes' / 'no' (case-insensitive);
            # sets keep these counts consistent with the set intersections below
            verdict = group['recommend'].str.lower()
            liked_ques = set(group.loc[verdict == 'yes', 'questionID'].values)
            not_liked_ques = set(group.loc[verdict == 'no', 'questionID'].values)

            # A user without any yes/no judgement has no defined accuracy
            judged = len(liked_ques) + len(not_liked_ques)
            if judged == 0:
                continue

            user_count += 1

            # get recommendations for test_category
            recomm_df = eval[cat][userID]
            recomm_df = recomm_df[(recomm_df['categoryId'] == test_cat)]
            recomm_ques = set(recomm_df['questionID'].values)

            # Compute TP and FP
            TP = len(recomm_ques & liked_ques)
            FP = len(recomm_ques & not_liked_ques)

            # Compute TN and FN
            FN = len(liked_ques) - TP
            TN = len(not_liked_ques) - FP

            # Accuracy (TP + TN + FN + FP == judged)
            accuracy += ((TP + TN) / judged)

        # Mean accuracy; NaN when no user could be evaluated for this pair
        accuracy = accuracy/user_count if user_count else np.nan
        total_accuracy_bycat[i][j] = accuracy

In [79]:
# Mean accuracy matrix: row i = category the recommendations were built from,
# column j = category they were tested on (both in category_list order)
print(total_accuracy_bycat)

[[0.56886447 0.49583333 0.48611111 0.49593254 0.31210317]
 [0.55363587 0.53571429 0.57673611 0.4978836  0.28148148]
 [0.54846357 0.5875     0.54292929 0.51964286 0.28953373]
 [0.5908682  0.61222222 0.46111111 0.50776942 0.23978697]
 [0.55297203 0.60784314 0.52142857 0.45703463 0.26888528]]
