In [1]:
import os
import csv
import time
import datetime
import urllib
import re
import math
import sys
import io
import random
import pymysql.cursors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

## Activity Similarity Feature: Similarity between query item and user's activities
Given a query item $q$ (i.e., a Stack Overflow question or GitHub repository) and a user $u$, we compute the average similarity between $q$ and all items on which $u$ has performed an activity $a$. For example, given a question $q_1$, we compute the average similarity between $q_1$ and all other questions that user $u_1$ has answered; in this case, the activity $a$ is the *answer* activity. The similarity function is given below:

$Sim(u,q,a) = \frac{|\{i \in I_{(u,a)} \mid i_{tags} \cap q_{tags} \neq \emptyset\}|}{|I_{(u,a)}|}$

where $\langle u,q,a \rangle$ is a triplet of a user $u$, a query item $q$, and a specific activity $a$. We say that a query item $q$ is similar to user $u$'s $a$ activities when many of the items on which $u$ performed $a$ share tags with $q$. The similarity function above captures this intuition: the numerator counts the items on which $u$ performed $a$ that share at least one tag with the query item $q$, and the denominator counts all items on which $u$ performed $a$.

Notations
- $u$: User
- $a$: Activity. Different activities performed by user. E.g. answer, favorite, fork and watch.
- $i$: Item, i.e., Stack Overflow question or GitHub repository
- $q$: Query item
- $I_{(u,a)}$: Items on which user $u$ has performed activity $a$. E.g. questions which are answered by a user.
- $i_{tags}$: Tags for the item. E.g. *Java*, *iOS*, etc. 
- $q_{tags}$: Tags for query item.

For function UserActivitySim():
- q: query item id (str)
- I_ua: ids of items on which the user has performed an activity (list)
- I_tags: key is item id and value is tags (dict)

In [2]:
def UserActivitySim(q, I_ua, I_tags):
    """Fraction of a user's activity items that share at least one tag with ``q``.

    Args:
        q: Id of the query item (str).
        I_ua: Ids of the items on which the user performed the activity (list).
            The list is read only; it is never modified.
        I_tags: Mapping from item id to a whitespace-separated tag string (dict).

    Returns:
        A float in [0, 1]: the number of items in ``I_ua`` (excluding ``q``
        itself) that have at least one tag in common with ``q``, divided by
        the number of such items. Returns 0.0 when no items remain after
        excluding ``q``.

    Raises:
        KeyError: If ``q`` or one of the remaining items has no entry in
            ``I_tags``.
    """
    # Build a filtered copy instead of calling I_ua.remove(q): the caller's
    # list stays intact and every occurrence of q is excluded, not just the
    # first one.
    other_items = [item for item in I_ua if item != q]
    if not other_items:
        return 0.0

    # split() without an argument discards empty tokens, so an empty tag
    # string or a doubled space cannot produce a bogus '' tag that would make
    # two otherwise unrelated items look similar.
    q_tags = set(I_tags[q].split())
    shared = sum(
        1 for item in other_items if q_tags.intersection(I_tags[item].split())
    )
    return shared / len(other_items)

## Evaluation
We use [Mean Average Precision (MAP)](https://medium.com/@jonathan_hui/map-mean-average-precision-for-object-detection-45c121a31173) to evaluate the activity prediction tasks. I.e., we sum the average precision (AP) of each user and divide by the total number of users.

MAP function:
- groundtruth: a dictionary where the key is a user id and the value is the list of test labels for all positive/negative instances of this user
- pred: a dictionary where the key is a user id and the value is the list of predicted probabilities of the label being positive (i.e., '1')

In [3]:
def MAP(groundtruth,pred):
    """Mean Average Precision over users.

    Args:
        groundtruth: dict mapping a user id to that user's true labels.
        pred: dict mapping a user id to that user's predicted scores, in the
            same order as the labels in ``groundtruth``.

    Returns:
        The sum of ``average_precision_score`` over every user in
        ``groundtruth`` divided by the number of users, or 0.0 when
        ``groundtruth`` is empty.

    Raises:
        KeyError: If a user in ``groundtruth`` has no entry in ``pred``.
    """
    # Without this guard an empty evaluation set ends in ZeroDivisionError.
    if not groundtruth:
        return 0.0

    total = 0.0
    for uid, y_truth in groundtruth.items():
        total += average_precision_score(y_truth, pred[uid])
    return total / len(groundtruth)

### Answer Activity Prediction 
#### Setup 1: Using only Answer Similarity Feature
- Compute the similarity scores for answer training set

In [4]:
i_users = 'toy_data/users.csv'
i_training_activities = 'toy_data/new_format/training/user_answer_training.csv'
i_testing_activities = 'toy_data/new_format/test/user_answer_testing.csv'


def _load_answer_activities(path):
    """Read one answer-activity CSV file.

    Each row is read as ``user id, item id, tags, label`` (columns 0-3).

    Returns:
        A tuple ``(u_ids, q_ids, labels, user_items, item_tags)`` where the
        first three are row-aligned lists, ``user_items`` maps a user id to
        the ids of the items labelled 1 for that user, and ``item_tags`` maps
        an item id to its tag string.
    """
    u_ids, q_ids, labels = [], [], []
    user_items = {}
    item_tags = {}
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(path, encoding='utf-8', newline='') as f:
        for row in csv.reader(f, delimiter=','):
            if not row:
                # csv.reader yields [] for a blank line; skip it instead of
                # failing on row[0].
                continue
            uid = int(row[0])
            rid = str(row[1])
            tags = str(row[2])
            label = int(row[3])
            u_ids.append(uid)
            q_ids.append(rid)
            labels.append(label)
            if label == 1:
                user_items.setdefault(uid, []).append(rid)
            item_tags[rid] = tags
    return u_ids, q_ids, labels, user_items, item_tags


def _answer_similarity_scores(u_ids, q_ids, user_items, item_tags):
    """Compute UserActivitySim for every (user, query item) row."""
    scores = []
    for uid, q in zip(u_ids, q_ids):
        # A user whose rows are all labelled 0 has no entry in user_items;
        # .get() gives such users an empty history (score 0) instead of a
        # KeyError. A fresh list is passed so the stored per-user history is
        # never modified by the call.
        answered = list(user_items.get(uid, []))
        scores.append(UserActivitySim(q, answered, item_tags))
    return scores


#Load answer training set
(ans_u_ids_train, ans_q_ids_train, ans_q_label_train,
 ans_user_items_train, ans_item_tags_train) = _load_answer_activities(i_training_activities)

#Compute score for answer training set
ans_scores_train = _answer_similarity_scores(
    ans_u_ids_train, ans_q_ids_train, ans_user_items_train, ans_item_tags_train)

#Load answer test set
(ans_u_ids_test, ans_q_ids_test, ans_q_label_test,
 ans_user_items_test, ans_item_tags_test) = _load_answer_activities(i_testing_activities)

#Compute score for answer test set
# NOTE(review): the per-user history used here is built from the label-1 rows
# of the test file itself, so the test feature depends on the test labels.
# Confirm this is intended rather than using the training history.
ans_scores_test = _answer_similarity_scores(
    ans_u_ids_test, ans_q_ids_test, ans_user_items_test, ans_item_tags_test)

FileNotFoundError: [Errno 2] No such file or directory: 'toy_data/new_format/training/user_answer_training.csv'

In [None]:
#Training
# Each instance is the feature pair [user id, answer-similarity score].
# The score is a fraction in [0, 1], so it is kept as a float: int() would
# truncate every value below 1 to 0 and throw the feature away.
# NOTE(review): the section title says only the similarity feature is used,
# yet the raw user id is also fed to the SVM as a numeric feature; confirm
# this is intended.
x_train_list = [
    [int(uid), float(score)]
    for uid, score in zip(ans_u_ids_train, ans_scores_train)
]
x_train = np.array(x_train_list)
y_train = ans_q_label_train.copy()
# probability=True enables predict_proba; its calibration shuffles the data,
# so random_state is fixed to make reruns reproducible.
clf = svm.SVC(kernel='linear', probability=True, random_state=0)
clf.fit(x_train, y_train)


In [None]:
#Testing
# Build the test features with the layout [user id, answer-similarity score].
# The score is a fraction in [0, 1] and must stay a float; int() would
# truncate every value below 1 to 0.
x_test_list = [
    [int(uid), float(score)]
    for uid, score in zip(ans_u_ids_test, ans_scores_test)
]
x_test = np.array(x_test_list)
y_test = ans_q_label_test.copy()
# predict_proba returns one row per test instance with one column per class.
pred = clf.predict_proba(x_test)

In [None]:
# Show the array returned by clf.predict_proba for the test instances.
print(pred)

In [None]:
#evaluation
# Group the predicted scores and the true labels per user, then compute MAP.
# predict_proba orders its columns like clf.classes_; assuming the labels are
# 0/1, column 1 is the probability of the positive class. The same column has
# to be used for every instance: mixing in column 0 for a user's first
# instance would rank that instance by its probability of being negative.
positive_col = 1
user_level_truth = {}
user_level_pred = {}
for i in range(len(pred)):
    uid = ans_u_ids_test[i]
    user_level_pred.setdefault(uid, []).append(pred[i][positive_col])

for uid, label in zip(ans_u_ids_test, ans_q_label_test):
    user_level_truth.setdefault(uid, []).append(label)

result = MAP(user_level_truth,user_level_pred)
print(result)

$error = \sum_{i,j}(V_{i}^{T}V_{j} - S^{item}_{i,j})^2$