In [22]:
import json
import pandas as pd
import numpy as np
import tqdm
from collections import OrderedDict

In [23]:
with open('data/input_member_profiles.json','r') as f:
    member_profiles = json.load(f)
    
with open('data/input_challenge_profiles.json','r') as f:
    challenge_profiles = json.load(f)
    
df = pd.read_csv('data/output_for_training.csv')
df['Placed'] = df['Placed'].fillna(-1)
df = df[(df['Date Date'] > '2019-03-03') & (df['Date Date'] < '2019-05-02')] # taking a slice for testing 


df_out = pd.read_csv('data/sorted_labels_test_x.csv')
df_out = df_out[['Challenge ID','Date Date']].copy()


In [24]:
test_labels = ['2016/09/14-2020/10/23', '2016/07/16-2020/08/24', '2016/05/17-2020/06/25', '2016/03/18-2020/04/26', '2016/01/18-2020/02/26', '2015/11/19-2019/12/28', '2015/09/20-2019/10/29', '2015/07/22-2019/08/30', '2015/05/23-2019/07/01', '2015/03/24-2019/05/02']
test_member_profiles = {key:value for key,value in member_profiles.items() if key in test_labels }

In [25]:
challenge_skills = []
for challenges in challenge_profiles:
    skills = [skill['tag'] for skill in challenges['skills']]
    challenge_skills.append({'id':challenges['id'],'skills':skills})

len(challenge_skills)

31033

In [26]:
challenges = {}
for ch in challenge_skills:
    challenges[int(ch['id'])] = ch['skills']

In [27]:
train_data = {}
for ind,row in df.iterrows():
    if row['Challenge ID'] in train_data:
        train_data[row['Challenge ID']]['registrants'].append(row['Registrant Handle'])
        if row['Submit Ind'] == 1:
            train_data[row['Challenge ID']]['submitters'].append(row['Registrant Handle'])
        if row['Placed'] != -1:
            train_data[row['Challenge ID']]['winners'].append(row['Registrant Handle'])
            
        train_data[row['Challenge ID']]['skills'] = challenges.get(row['Challenge ID'],[])
    else:
        train_data[row['Challenge ID']] = {}
        train_data[row['Challenge ID']]['registrants'] =[]
        train_data[row['Challenge ID']]['submitters'] =[]
        train_data[row['Challenge ID']]['winners'] = []
        train_data[row['Challenge ID']]['skills'] = []
        train_data[row['Challenge ID']]['date'] = row['Date Date']

In [28]:
challenges_test_data = {}
count =0
for key,value in train_data.items():
    if value['skills']:
        challenges_test_data[key] = train_data[key].copy()
    else:
        count+=1

print("no of valid chalenges:",len(challenges_test_data),"no of empty skills challenges:",count)

for key, value in challenges_test_data.items():
    for winner in value['winners']:
        value['submitters'] = [submitter for submitter in value['submitters'] if submitter != winner]
        value['registrants'] = [registrant for registrant in value['registrants'] if registrant != winner]
    for submitter in value['submitters']:
        value['registrants'] = [registrant for registrant in value['registrants'] if registrant != submitter]

no of valid chalenges: 262 no of empty skills challenges: 2313


In [29]:
len(challenges_test_data) # no of test challenges

262

In [30]:
def get_skill_vector(challenge_skills,member_skills): 
    val = 0
    count=0 
    
    if challenge_skills and member_skills:
        for skill in challenge_skills:
            if skill in member_skills:
                val+= member_skills[skill]
                count+=1
  
    if val and count: 
        return val/count
    else:
        return 0

In [31]:
def get_member_profile(challenge_date,member,train_member_profiles):
    challenge_date = challenge_date.replace('-','/')
    for key in train_member_profiles.keys():
        if challenge_date >= key.split('-')[0] and challenge_date <= key.split('-')[1]:
            memebers = train_member_profiles[key]
            if member in memebers and members[member]:
                member_skills = {}
                for skills in memebers[member]:
                        member_skills[skills['skill']] = skills['skill_confidence']
                return member_skills

In [32]:
def parse_values(x):
    if x> 0 and x < 0.3:
        return 1
    elif x>= 0.3 and x < 0.6:
        return 2
    elif x>= 0.6 and x < 0.9:
        return 3
    elif x>=0.9 and x < 1.5:
        return 4
    elif x>=1.5:
        return 5
    else:
        return 0

In [33]:
from surprise import Reader, Dataset, SVD, dump
_, model = dump.load('recmdr')

In [34]:
def recommend_members(model,challenge_skills,profiles):
    top_members = {}
    
    for member,skills in profiles.items():
        if skills:
            member_skills = {}
            for skill in skills:
                member_skills[skill['skill']] = skill['skill_confidence']
            similarity = get_skill_vector(challenge_skills['skills'],member_skills)
            prediction = model.predict(challenge_skills['id'],member,parse_values(similarity)).est
            top_members[member] = prediction * similarity
        
    top_members = OrderedDict({k: v for k, v in sorted(top_members.items(), key=lambda item: item[1],reverse=True)})

    return list(top_members.keys())[:60]

In [35]:
def check_participation_score(top_members,challenge):
    final_recmd = []
    for member in top_members:
        if member in challenge['submitters']:
            final_recmd.append([member,1])
        elif member in challenge['winners']:
            final_recmd.append([member,4])
        else:
            final_recmd.append([member,0])
    #  (actual participation score of the top recommended member) + (actual mean participation score of top 10 recommended members) + (mean participation score of top 60 recommended members)
    return final_recmd[0][1] + (sum([final_recmd[m][1] for m in range(10)])/10) + (sum([final_recmd[m][1] for m in range(60)])/60)

In [36]:
output_data = []
for key in tqdm.tqdm(test_labels):
    for challenge in challenges_test_data.keys():
        if challenge in challenges:
            skills = challenges[challenge]
            top_members = recommend_members(model,{'id':str(challenge),'skills':skills},test_member_profiles[key])
            output_data.append([key,challenge,skills,top_members])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:58<00:00, 17.87s/it]


In [37]:
len(output_data)

2620

In [38]:
scores = []
challenge_count =0 
for challenge in tqdm.tqdm(output_data):
    challenge_count+=1
    profile = challenges_test_data[challenge[1]]        
    scores.append(check_participation_score(challenge[-1],profile))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2620/2620 [00:00<00:00, 20523.76it/s]


In [39]:
from collections import Counter

dict(sorted(Counter(scores).items(), key=lambda t: t[0],reverse=True))

{7.6000000000000005: 2,
 7.533333333333333: 1,
 7.466666666666667: 1,
 7.4: 1,
 7.2: 4,
 7.133333333333333: 3,
 7.066666666666666: 4,
 7.0: 1,
 6.933333333333334: 1,
 6.8: 5,
 6.733333333333333: 5,
 6.666666666666667: 2,
 6.6: 2,
 6.5: 7,
 6.483333333333333: 3,
 6.466666666666667: 1,
 6.45: 2,
 6.416666666666666: 2,
 6.4: 1,
 6.3999999999999995: 3,
 6.383333333333334: 3,
 6.366666666666667: 3,
 6.366666666666666: 2,
 6.35: 2,
 6.333333333333333: 11,
 6.266666666666667: 3,
 6.216666666666667: 1,
 6.2: 1,
 6.199999999999999: 5,
 6.066666666666666: 1,
 6.033333333333333: 4,
 5.983333333333333: 2,
 5.95: 1,
 5.883333333333334: 1,
 5.866666666666667: 3,
 5.733333333333333: 7,
 5.666666666666667: 8,
 5.65: 2,
 5.533333333333333: 15,
 5.466666666666667: 15,
 5.283333333333333: 2,
 5.2: 3,
 5.183333333333334: 4,
 5.133333333333333: 5,
 5.066666666666666: 4,
 4.966666666666667: 1,
 4.933333333333334: 16,
 4.733333333333333: 2,
 4.666666666666667: 1,
 4.6000000000000005: 5,
 4.533333333333334: 3

In [40]:
sum(scores)/len(scores)

0.8620737913486013