In [1]:
import json
import pandas as pd
import numpy as np
import tqdm
from collections import OrderedDict

# Load Necessary Data Files

In [2]:
with open('data/input_member_profiles.json','r') as f:
    member_profiles = json.load(f)
    
with open('data/input_challenge_profiles.json','r') as f:
    challenge_profiles = json.load(f)
    
df = pd.read_csv('data/output_for_training.csv')
df['Placed'] = df['Placed'].fillna(-1)

df_out = pd.read_csv('data/sorted_labels_test_x.csv')
df_out = df_out[['Challenge ID','Date Date']].copy()

# Splitting Test and Train Member Profiles

In [3]:
test_labels = ['2016/09/14-2020/10/23', '2016/07/16-2020/08/24', '2016/05/17-2020/06/25', '2016/03/18-2020/04/26', '2016/01/18-2020/02/26', '2015/11/19-2019/12/28', '2015/09/20-2019/10/29', '2015/07/22-2019/08/30', '2015/05/23-2019/07/01', '2015/03/24-2019/05/02']
train_labels = [key for key in member_profiles.keys() if key not in test_labels] 

train_member_profiles = {key:value for key,value in member_profiles.items() if key in train_labels }
test_member_profiles = {key:value for key,value in member_profiles.items() if key in test_labels }

In [4]:
# Create  dict of Challenge_id and skills

In [5]:
challenge_skills = []
for challenges in challenge_profiles:
    skills = [skill['tag'] for skill in challenges['skills']]
    challenge_skills.append({'id':challenges['id'],'skills':skills})

len(challenge_skills)

31033

In [6]:
challenges = {}
for ch in challenge_skills:
    challenges[int(ch['id'])] = ch['skills']

### Create the Ground truth Dict from the csv file taking common challenges of both csv and challenge_profile_json files

In [7]:
train_data = {}
for ind,row in df.iterrows():
    if row['Challenge ID'] in train_data:
        train_data[row['Challenge ID']]['registrants'].append(row['Registrant Handle'])
        if row['Submit Ind'] == 1:
            train_data[row['Challenge ID']]['submitters'].append(row['Registrant Handle'])
        if row['Placed'] != -1:
            train_data[row['Challenge ID']]['winners'].append(row['Registrant Handle'])
            
        train_data[row['Challenge ID']]['skills'] = challenges.get(row['Challenge ID'],[])
    else:
        train_data[row['Challenge ID']] = {}
        train_data[row['Challenge ID']]['registrants'] =[]
        train_data[row['Challenge ID']]['submitters'] =[]
        train_data[row['Challenge ID']]['winners'] = []
        train_data[row['Challenge ID']]['skills'] = []
        train_data[row['Challenge ID']]['date'] = row['Date Date']

# Data Preparation

In [8]:
challenges_train_data = {}
count =0
for key,value in train_data.items():
    if value['skills']:
        challenges_train_data[str(key)] = train_data[key].copy()
    else:
        count+=1

print("no of valid chalenges:",len(challenges_train_data),"no of empty skills challenges:",count)

for key, value in challenges_train_data.items():
    for winner in value['winners']:
        value['submitters'] = [submitter for submitter in value['submitters'] if submitter != winner]
        value['registrants'] = [registrant for registrant in value['registrants'] if registrant != winner]
    for submitter in value['submitters']:
        value['registrants'] = [registrant for registrant in value['registrants'] if registrant != submitter]

no of valid chalenges: 17246 no of empty skills challenges: 25007


In [9]:
def get_skill_vector(challenge_skills,member_skills):
    """Take average score of a challenge skill against a member skills"""
    val = 0
    count=0 
    
    if challenge_skills and member_skills:
        for skill in challenge_skills:
            if skill in member_skills:
                val+= member_skills[skill]
                count+=1                
    if val and count: 
        return val/count
    else:
        return 0

In [10]:
def get_member_profile(challenge_date,member,train_member_profiles):
    """ get a member skills of a given slice"""
    challenge_date = challenge_date.replace('-','/')
    for key in train_member_profiles.keys():
        if challenge_date >= key.split('-')[0] and challenge_date <= key.split('-')[1]:
            memebers = train_member_profiles[key]
            if member in memebers and memebers[member]:
                member_skills = {}
                for skills in memebers[member]:
                        member_skills[skills['skill']] = skills['skill_confidence']
                return member_skills

In [11]:
def create_training_data(train_member_profiles,challenges_data):
    data = []
    for challenge,values in tqdm.tqdm(challenges_data.items()):
        for registrant in values['registrants']:
            member_profile = get_member_profile(values['date'],registrant,train_member_profiles)
            similarity = get_skill_vector(values['skills'],member_profile)
            if similarity:
                data.append([challenge,registrant,similarity,0])
            
        for submit in values['submitters']:
            member_profile = get_member_profile(values['date'],submit,train_member_profiles)
            similarity = get_skill_vector(values['skills'],member_profile)
            if similarity:
                data.append([challenge,registrant,similarity,1])
            
        for winner in values['winners']:
            member_profile = get_member_profile(values['date'],winner,train_member_profiles)
            similarity = get_skill_vector(values['skills'],member_profile)
            if similarity:
                data.append([challenge,registrant,similarity,4])
    return data  

In [12]:
data = create_training_data(train_member_profiles,challenges_train_data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17246/17246 [00:11<00:00, 1489.53it/s]


In [13]:
df = pd.DataFrame(data,columns=['challenge','member','similarity','score'])
df.shape

(183477, 4)

In [32]:
def parse_values(x):
    """parsing the similarity score into ratings for the model"""
    if x> 0 and x < 0.3:
        return 1
    elif x>= 0.3 and x < 0.6:
        return 2
    elif x>= 0.6 and x < 0.9:
        return 3
    elif x>=0.9 and x < 1.5:
        return 4
    elif x>=1.5:
        return 5
    else:
        return 0

df['rating'] = df['similarity'].apply(parse_values)

df.head()

Unnamed: 0,challenge,member,similarity,score,rating
0,30010140,Schpotsky,0.715,0,3
1,30010140,agus.mw,0.15,0,1
2,30010140,fivestarwy,0.244,0,1
3,30010140,snoopybaba,0.407,4,2
4,30010449,bramandia,0.128,0,1


In [33]:
df['rating'].value_counts()

2    54042
1    38458
4    37678
3    32801
5    20498
Name: rating, dtype: int64

# Model Training

In [34]:
from surprise import Reader, Dataset, SVD, dump
from surprise.model_selection.validation import cross_validate

reader = Reader()
data = Dataset.load_from_df(df[['challenge', 'member','rating']], reader)
svd = SVD()

In [35]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9311  0.9341  0.9349  0.9327  0.9273  0.9320  0.0027  
MAE (testset)     0.7064  0.7059  0.7069  0.7049  0.7013  0.7051  0.0020  
Fit time          10.08   9.21    9.41    9.93    9.47    9.62    0.33    
Test time         0.64    0.30    0.29    0.55    0.30    0.42    0.15    


{'test_rmse': array([0.93109953, 0.93410526, 0.93491389, 0.93266065, 0.92726105]),
 'test_mae': array([0.70635319, 0.70594404, 0.70686724, 0.70494893, 0.7013179 ]),
 'fit_time': (10.077051877975464,
  9.211367130279541,
  9.410834550857544,
  9.927453517913818,
  9.46568775177002),
 'test_time': (0.6412844657897949,
  0.30019569396972656,
  0.2932159900665283,
  0.5495283603668213,
  0.3041865825653076)}

In [36]:
trainset = data.build_full_trainset()
model = svd.fit(trainset)

# Save Model

In [37]:
dump.dump('recmdr', algo=model)
_, model = dump.load('recmdr')

In [38]:
def recommend_members(model,challenge_skills,profiles):
    """recommend top 60 members for a challenge given a member slice"""
    top_members = {}
    
    for member,skills in profiles.items():
        member_skills = {}
        if skills:
            for skill in skills:
                member_skills[skill['skill']] = skill['skill_confidence']
            similarity = get_skill_vector(challenge_skills['skills'],member_skills)
            prediction = model.predict(challenge_skills['id'],member,parse_values(similarity)).est
            top_members[member] = prediction * similarity
        
    top_members = OrderedDict({k: v for k, v in sorted(top_members.items(), key=lambda item: item[1],reverse=True)})

    return list(top_members.keys())[:60]

In [39]:
def check_participation_score(top_members,challenge):
    """ given a challenge Truth Values calculate the participation score of the challenge"""
    final_recmd = []
    for member in top_members:
        if member in challenge['submitters']:
            final_recmd.append([member,1])
        elif member in challenge['winners']:
            final_recmd.append([member,4])
        else:
            final_recmd.append([member,0])
    #  (actual participation score of the top recommended member) + (actual mean participation score of top 10 recommended members) + (mean participation score of top 60 recommended members)
    return final_recmd[0][1] + (sum([final_recmd[m][1] for m in range(10)])/10) + (sum([final_recmd[m][1] for m in range(60)])/60)

# Testing Model

In [40]:
profiles = train_member_profiles[ '2010/04/19-2014/05/28']
challenge = challenges_train_data['30053920']
challenge['id'] = '30053920'
print(challenge)

{'registrants': ['codingdrone', 'Standlove', 'c0dezer0', 'karthiks416', 'wcheung', 'ChenXiaoTemp', 'jackchongs', 'XinScirpt', 'gjw99', 'pfilippi', 'e06widu', 'lanchongyizu', 'twds', 'panoptimum', 'sunbinbrother', 'phead', 'DeimonDB', 'Brunall', 'Anthony.Eden', 'Gando19850304', 'peakpado', 'zsudraco', 'binaary01', 'rixiac', 'itkankan', 'vitocorleone201', 'slayerjain', 'tuxing', 'wood387', 'phaniram', 'ecavalier313', 'stevelvovo', 'graphit', 'blu3fox'], 'submitters': [], 'winners': ['cam.ashwini', 'muzehyun', 'ondrejba', 'edisonwu'], 'skills': ['Application Programming Interface (API)', 'Cloudhub', 'Integration', 'Mongodb', 'Node.Js', 'Open Source Softwares', 'Test Runner'], 'date': '2016-04-27', 'id': '30053920'}


In [41]:
top_members = recommend_members(model,challenge,profiles)

In [42]:
top_members

['Sky_',
 'Applications',
 'Ghost_141',
 'hi4sandy',
 'freegod',
 'muzehyun',
 'albertwang',
 'morehappiness',
 'sdgun',
 'zsudraco',
 'dljg718',
 'liuliquan',
 'NightWolf',
 'flytoj2ee',
 'hesibo',
 'Standlove',
 'peakpado',
 'isv',
 'LazyChild',
 'kurtrips',
 'pvmagacho',
 'evilkyro1965',
 'abedavera',
 'selvia_ettine',
 'stevenfrog',
 'salesforcesmarty',
 'gh3ablo',
 'rjnpnigrhi',
 'j3_guile',
 'winterflame',
 'iversonLv',
 'maymay',
 'yoki',
 'snehaheda',
 'picachui',
 'theakhilis',
 'Brightspring',
 'chok68',
 'bugbuka',
 'gjw99',
 'KennyAlive',
 'ahmed.seddiq',
 'panoptimum',
 'sparemax',
 'nithyaasworld',
 'fairy_ley',
 'gvir',
 'faeton',
 'Zulander',
 'thinkcreeper',
 'vvvpig',
 'dileepa',
 'basuki',
 'iamtong',
 'yedtoss',
 'velorien',
 'GreatKevin',
 'babyface168',
 'iSpartan',
 'gangparia']

In [43]:
check_participation_score(top_members,challenge)

0.4666666666666667

# Creating Output CSV

In [44]:
test_data = {}
for ind,row in df_out.iterrows():
    if row['Date Date'] not in test_data:
        if row['Challenge ID'] in challenges:
            test_data[row['Date Date']] = [row['Challenge ID']]
        else:
             test_data[row['Date Date']] =[]
    else:
        if row['Challenge ID'] not in test_data[row['Date Date']] and row['Challenge ID'] in challenges:
            test_data[row['Date Date']].append(row['Challenge ID'])

In [45]:
output_data = []
for key in test_labels:
    end = key.split('-')[-1].replace('/','-')
    chal =[] 
    for k,v in test_data.items():
        if k >= end:
             chal.extend(v)
    test_challenges = list(set(chal))
    for challenge in tqdm.tqdm(test_challenges):
        if challenge in challenges:
            skills = challenges[challenge]
            top_members = recommend_members(model,{'id':str(challenge),'skills':skills},test_member_profiles[key])
            temp = [key,challenge,skills]
            temp.extend(top_members)
            output_data.append(temp)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 15.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:30<00:00,  9.09it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 475/475 [00:39<00:00, 12.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 726/726 [00:46<00:00, 15.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 995/995 [01:00<00:00, 16.54it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1255/1255 [01:10<00:00, 17.

In [46]:
# no of test challenges in each slice
# 2020-10-23 3 
# 2020-08-24 274 
# 2020-06-25 475
# 2020-04-26 726
# 2020-02-26 995 
# 2019-12-28 1255
# 2019-10-29 1480
# 2019-08-30 1754
# 2019-07-01 2120
# 2019-05-02 2489

len(output_data) # 11571

11571

In [47]:
columns = ['time block', 'challenge id', 'challenge skills profile']
for i in range(1,61):
    columns.append(str(i))
output = pd.DataFrame(output_data, columns = columns)
output.shape

(11571, 63)

In [48]:
output.head()

Unnamed: 0,time block,challenge id,challenge skills profile,1,2,3,4,5,6,7,...,51,52,53,54,55,56,57,58,59,60
0,2016/09/14-2020/10/23,30146536,"[Digital Technology, Sendto, Set Theory, Web B...",vvvpig,SATKAN,khanfaraz,vasilica.olariu,kmurti,creeya,Subhu,...,yrtchn,starck181995,wenbin,eng01,oninkxronda,bpalleri,banerjeesourish,ipraznik,wleite,anonymousjaggu
1,2016/09/14-2020/10/23,30146426,"[Digital Technology, Sendto, Set Theory, User ...",vvvpig,vasilica.olariu,SATKAN,nauhil,sdgun,nicokontes,codejam,...,shubhendus,gardn999,khanhlinh,starck181995,sylar,wenbin,veshu,rajeshrathod,ananthhh,namanhams
2,2016/09/14-2020/10/23,30146749,"[Application Programming Interface (API), Crow...",ngoctay,tuxing,kinfkong,jiangliwu,Ghost_141,zsudraco,seriyvolk83,...,mhykol,selvia_ettine,iaminfinite,tk2rush90,freegod,wleite,basuki,khanhlinh,diazz,starck181995
3,2016/07/16-2020/08/24,30141440,"[Bootstrap (FRONT-END FRAMEWORK), Codebase, Fi...",N1k1tung,soso0574,ouyangki,Ghost_141,jiangliwu,nghi85,zsudraco,...,DooMachine,Veve,jiangyue808,mancoolgunda,spanhawk,universo,Ksys,gets0ul,meshde,sr.harrison
4,2016/07/16-2020/08/24,30141441,"[Codebase, Dashboard, Error Messages, Json Web...",universo,iamtong,Ravijune,iaminfinite,ToxicPixel,soso0574,eriantoongko,...,hi4sandy,DaraK,oninkxronda,ArteVisual,thinkcreeper,daga_sumit,ujazz,basuki,ChanKamWo,yiming


In [49]:
output.to_csv("recommendations.csv",index=False)