In [1]:
import os
import csv
import time
import datetime
import urllib
import re
import math
import sys
import io
import random
import pymysql.cursors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

# Activity Similarity Feature: Similary between query item and user's activities
Given a query item $q$ (i.e., a Stack Overflow question or GitHub repository) and a user $u$, we compute averange similarity between $q$ and all items where $u$ has perform an activity $a$ on. For example, given an item question $q_1$, we compute the average similarity $q_1$ and all other questions where beween user $u_1$ has answered. I.e., in this case, the activity $a$ will be the *answer* activity. The similarity function is given as below: 

$Sim(u,q,a) = \frac{|\{i\in I_{(u,a)}|i_{tags}\in q_{tags}\}|}{|I_{(u,a)}|}$

Where, $<u,q,a>$ is a triplet of a user $u$, query item $q$ and specific activity $a$. We say that a query item $q$ is similar to a user $u$'s $a$ activities, when many items, which $u$ performed $a$ on, shares similar tags with $q$. The above similarity function captures this intuition. The numerator computes the number of items where $u$ perform $a$ and the item shares at least 1 tag with the query item $q$. The denominator computes the total number of items where $u$ perform $a$

Notations
- $u$: User
- $a$: Activity. Different activities performed by user. E.g. answer, favorite, fork and watch.
- $i$: Item, i.e., Stack Overflow question or GitHub repository
- $q$: Query item
- $I_{(u,a)}$: Items where user $u$ perform activity $a$ on. E.g. questions which are answered by a user. 
- $i_{tags}$: Tags for the item. E.g. *Java*, *iOS*, etc. 
- $q_{tags}$: Tags for query item.

For function UserActivitySim():
- q: query item id (str)
- I_ua: ids of items where user has perform an activity (list)
- I_tags: key is item id and value is tags (dict)

In [15]:
def UserActivitySim(q, I_ua, Q_tags, I_tags):
    if q in I_ua:
        I_ua.remove(q)
    if len(I_ua)==0:
        return 0
    q_tags = set(Q_tags[q].split(' '))
    numerator = 0
    for i in I_ua:
        i_tags = set(I_tags[i].split(' '))
        overlap = i_tags.intersection(q_tags)
        if len(overlap) >0:
            numerator += 1
    return numerator/len(I_ua)

# Evaluation Metric
We use [Mean Average Precision (MAP)](https://medium.com/@jonathan_hui/map-mean-average-precision-for-object-detection-45c121a31173) to evaluate the activity prediciton tasks. I.e., we take total average precision (AP) of each user divided by the total number of users. 

MAP function:
- groundtruth: a dictionary where key is user id and value is the test labels for all the positive/negative instance for this use
- pred: a dictionary where key is the user id and value is the predicted probabilities of the label being posititive (i.e,'1')

In [3]:
def MAP(groundtruth,pred):
    result = 0
    for key, value in groundtruth.items():
        y_truth = value
        y_pred = pred[key]
        score= average_precision_score(y_truth,y_pred)
        result +=score
    return result/len(groundtruth)

# Loading Training and Test Data 
- i_users: user file
- i_ans_training_activities: user's answer activities in Stack Overflow use for training
- i_ans_testing_activities: user's answer activities in Stack Overflow use for test
- i_fav_training_activities: user's favorite activities in Stack Overflow use for training
- i_fav_testing_activities: user's favorite activities in Stack Overflow use for test
- i_watch_training_activities: user's watch activities in GitHub use for training
- i_watch_testing_activities: user's watch activities in GitHub use for test
- i_fork_training_activities: user's fork activities in GitHub use for training
- i_fork_testing_activities: user's fork activities in GitHub use for test

In [10]:
i_users = 'toy_data/users.csv'
i_ans_training_activities = 'toy_data/training/user_answer_training.csv'
i_ans_testing_activities = 'toy_data/test/user_answer_testing.csv'
i_fav_training_activities = 'toy_data/training/user_favorite_training.csv'
i_fav_testing_activities = 'toy_data/test/user_favorite_testing.csv'
i_watch_training_activities = 'toy_data/training/user_watch_training.csv'
i_watch_testing_activities = 'toy_data/test/user_watch_testing.csv'
i_fork_training_activities = 'toy_data/training/user_fork_training.csv'
i_fork_testing_activities = 'toy_data/test/user_fork_testing.csv'

#Load answer training set
ans_u_ids_train = []
ans_q_ids_train = []
ans_q_label_train = []
ans_user_items_train = {} 
ans_item_tags_train = {}
with open(i_ans_training_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        ans_u_ids_train.append(uid)
        ans_q_ids_train.append(rid)
        ans_q_label_train.append(label)
        if label==1:
            if uid in ans_user_items_train:
                ans_user_items_train[uid].append(rid)
            else:
                ans_user_items_train[uid] = [rid]
        ans_item_tags_train[rid] = tags
    
#Load answer test set
ans_u_ids_test = []
ans_q_ids_test = []
ans_q_label_test = []
ans_user_items_test = {} 
ans_item_tags_test = {}
with open(i_ans_testing_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        ans_u_ids_test.append(uid)
        ans_q_ids_test.append(rid)
        ans_q_label_test.append(label)
        if label==1:
            if uid in ans_user_items_test:
                ans_user_items_test[uid].append(rid)
            else:
                ans_user_items_test[uid] = [rid]
        ans_item_tags_test[rid] = tags

#Load favorite training set
fav_u_ids_train = []
fav_q_ids_train = []
fav_q_label_train = []
fav_user_items_train = {} 
fav_item_tags_train = {}
with open(i_fav_training_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        fav_u_ids_train.append(uid)
        fav_q_ids_train.append(rid)
        fav_q_label_train.append(label)
        if label==1:
            if uid in fav_user_items_train:
                fav_user_items_train[uid].append(rid)
            else:
                fav_user_items_train[uid] = [rid]
        fav_item_tags_train[rid] = tags
        
#Load favorite test set
fav_u_ids_test = []
fav_q_ids_test = []
fav_q_label_test = []
fav_user_items_test = {} 
fav_item_tags_test = {}
with open(i_fav_testing_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        fav_u_ids_test.append(uid)
        fav_q_ids_test.append(rid)
        fav_q_label_test.append(label)
        if label==1:
            if uid in fav_user_items_test:
                fav_user_items_test[uid].append(rid)
            else:
                fav_user_items_test[uid] = [rid]
        fav_item_tags_test[rid] = tags

#Load watch training set
watch_u_ids_train = []
watch_q_ids_train = []
watch_q_label_train = []
watch_user_items_train = {} 
watch_item_tags_train = {}
with open(i_watch_training_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        watch_u_ids_train.append(uid)
        watch_q_ids_train.append(rid)
        watch_q_label_train.append(label)
        if label==1:
            if uid in watch_user_items_train:
                watch_user_items_train[uid].append(rid)
            else:
                watch_user_items_train[uid] = [rid]
        watch_item_tags_train[rid] = tags
        
#Load watch test set
watch_u_ids_test = []
watch_q_ids_test = []
watch_q_label_test = []
watch_user_items_test = {} 
watch_item_tags_test = {}
with open(i_watch_testing_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        watch_u_ids_test.append(uid)
        watch_q_ids_test.append(rid)
        watch_q_label_test.append(label)
        if label==1:
            if uid in watch_user_items_test:
                watch_user_items_test[uid].append(rid)
            else:
                watch_user_items_test[uid] = [rid]
        watch_item_tags_test[rid] = tags

#Load fork training set
fork_u_ids_train = []
fork_q_ids_train = []
fork_q_label_train = []
fork_user_items_train = {} 
fork_item_tags_train = {}
with open(i_fork_training_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        fork_u_ids_train.append(uid)
        fork_q_ids_train.append(rid)
        fork_q_label_train.append(label)
        if label==1:
            if uid in fork_user_items_train:
                fork_user_items_train[uid].append(rid)
            else:
                fork_user_items_train[uid] = [rid]
        fork_item_tags_train[rid] = tags
        
#Load fork test set
fork_u_ids_test = []
fork_q_ids_test = []
fork_q_label_test = []
fork_user_items_test = {} 
fork_item_tags_test = {}
with open(i_fork_testing_activities, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        uid = int(row[0])
        rid = str(row[1])
        tags = str(row[2])
        label = int(row[3])
        fork_u_ids_test.append(uid)
        fork_q_ids_test.append(rid)
        fork_q_label_test.append(label)
        if label==1:
            if uid in fork_user_items_test:
                fork_user_items_test[uid].append(rid)
            else:
                fork_user_items_test[uid] = [rid]
        fork_item_tags_test[rid] = tags

# Answer Activity Prediction 

### Compute the Activity Similarity Features for Answer Activity Prediction

In [18]:
#Compute score for answer training set using user's answer activities
ans_scores_train = []
for i in range(len(ans_u_ids_train)):
    q = ans_q_ids_train[i]
    uid = ans_u_ids_train[i]
    I_ua = ans_user_items_train[uid].copy()
    Q_tags = ans_item_tags_train
    I_tags = ans_item_tags_train
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    ans_scores_train.append(score)

#Compute score for answer test set using user's answer activities
ans_scores_test = []
for i in range(len(ans_u_ids_test)):
    q = ans_q_ids_test[i]
    uid = ans_u_ids_test[i]
    I_ua = ans_user_items_test[uid].copy()
    Q_tags = ans_item_tags_test
    I_tags = ans_item_tags_test
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    ans_scores_test.append(score)


In [25]:
#Compute score for answeer training set using user's favorite activities
fav_scores_train = []
for i in range(len(ans_u_ids_train)):
    q = ans_q_ids_train[i]
    uid = ans_u_ids_train[i]
    I_ua = fav_user_items_train[uid].copy()
    Q_tags = ans_item_tags_train
    I_tags = fav_item_tags_train
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    fav_scores_train.append(score)

#Compute score for answer test set using user's favorite activities
fav_scores_test = []
for i in range(len(ans_u_ids_test)):
    q = ans_q_ids_test[i]
    uid = ans_u_ids_test[i]
    I_ua = fav_user_items_test[uid].copy()
    Q_tags = ans_item_tags_test
    I_tags = fav_item_tags_test
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    fav_scores_test.append(score)

In [26]:
#Compute score for answeer training set using user's fork activities
fork_scores_train = []
for i in range(len(ans_u_ids_train)):
    q = ans_q_ids_train[i]
    uid = ans_u_ids_train[i]
    I_ua = fork_user_items_train[uid].copy()
    Q_tags = ans_item_tags_train
    I_tags = fork_item_tags_train
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    fork_scores_train.append(score)

#Compute score for answer test set using user's fork activities
fork_scores_test = []
for i in range(len(ans_u_ids_test)):
    q = ans_q_ids_test[i]
    uid = ans_u_ids_test[i]
    I_ua = fork_user_items_test[uid].copy()
    Q_tags = ans_item_tags_test
    I_tags = fork_item_tags_test
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    fork_scores_test.append(score)

In [27]:
#Compute score for answeer training set using user's watch activities
watch_scores_train = []
for i in range(len(ans_u_ids_train)):
    q = ans_q_ids_train[i]
    uid = ans_u_ids_train[i]
    I_ua = watch_user_items_train[uid].copy()
    Q_tags = ans_item_tags_train
    I_tags = watch_item_tags_train
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    watch_scores_train.append(score)

#Compute score for answer test set using user's watch activities
watch_scores_test = []
for i in range(len(ans_u_ids_test)):
    q = ans_q_ids_test[i]
    uid = ans_u_ids_test[i]
    I_ua = watch_user_items_test[uid].copy()
    Q_tags = ans_item_tags_test
    I_tags = watch_item_tags_test
    score = UserActivitySim(q, I_ua, Q_tags, I_tags)
    watch_scores_test.append(score)

### Experiment Setup 1: Using only Answer Similarity Feature

In [19]:
#Training
x_train_list = []
for i in range(len(ans_scores_train)):
    x_train_list.append([int(ans_u_ids_train[i]),int(ans_scores_train[i])])
x_train = np.array(x_train_list)
y_train = ans_q_label_train.copy()
clf = svm.SVC(kernel='linear',probability=True)
#clf = svm.SVC(kernel='linear')
clf.fit(x_train_list,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
#Testing
x_test_list = []
for i in range(len(ans_scores_test)):
    x_test_list.append([int(ans_u_ids_test[i]),int(ans_scores_test[i])])
x_test = np.array(x_test_list)
y_test = ans_q_label_test.copy()
#pred = clf.predict(x_test)
pred = clf.predict_proba(x_test)

In [21]:
#evaluation
user_level_truth = {}
user_level_pred = {}
for i in range(len(pred)):
    uid = ans_u_ids_test[i]
    if uid not in user_level_pred:
        user_level_pred[uid] = [pred[i][0]]
    else:
        user_level_pred[uid].append(pred[i][1])

for i in range(len(ans_q_label_test)):
    uid = ans_u_ids_test[i]
    if uid not in user_level_truth:
        user_level_truth[uid] = [ans_q_label_test[i]]
    else:
        user_level_truth[uid].append(ans_q_label_test[i])

result = MAP(user_level_truth,user_level_pred)
print(result)

0.507200797032


### Experiment Setup 2: Using Fork, Watch, Answer and Favorite Similarity Feature
- Compute the similarity scores for answer training set

In [28]:
#Training
x_train_list = []
for i in range(len(ans_scores_train)):
    x_train_list.append([int(ans_u_ids_train[i]),
                         int(ans_scores_train[i]),
                         int(fav_scores_train[i]),
                         int(fork_scores_train[i]),
                         int(watch_scores_train[i])])
x_train = np.array(x_train_list)
y_train = ans_q_label_train.copy()
clf = svm.SVC(kernel='linear',probability=True)
#clf = svm.SVC(kernel='linear')
clf.fit(x_train_list,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
#Testing
x_test_list = []
for i in range(len(ans_scores_test)):
    x_test_list.append([int(ans_u_ids_test[i]),
                        int(ans_scores_test[i]),
                        int(fav_scores_test[i]),
                        int(fork_scores_test[i]),
                        int(watch_scores_test[i])])
x_test = np.array(x_test_list)
y_test = ans_q_label_test.copy()
#pred = clf.predict(x_test)
pred = clf.predict_proba(x_test)

In [30]:
#evaluation
user_level_truth = {}
user_level_pred = {}
for i in range(len(pred)):
    uid = ans_u_ids_test[i]
    if uid not in user_level_pred:
        user_level_pred[uid] = [pred[i][0]]
    else:
        user_level_pred[uid].append(pred[i][1])

for i in range(len(ans_q_label_test)):
    uid = ans_u_ids_test[i]
    if uid not in user_level_truth:
        user_level_truth[uid] = [ans_q_label_test[i]]
    else:
        user_level_truth[uid].append(ans_q_label_test[i])

result = MAP(user_level_truth,user_level_pred)
print(result)

0.442047686048
