In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from IPython.display import display, HTML, clear_output

In [2]:
%%time
df_c = pd.read_csv('input/challenges.csv')
df_s = pd.read_csv('input/submissions.csv', parse_dates=['created_at']).sort_values('created_at')

CPU times: user 1min 22s, sys: 520 ms, total: 1min 23s
Wall time: 1min 23s


There are a lot of duplicate challenges which were featured in various other contests. I remove them and update the information about their group and subgroup. I also calculate percentage of solved.

In [3]:
def cleanupChallenges():
    cnt = df_c['challenge_id'].value_counts()
    df_c['cnt'] = df_c['challenge_id'].apply(lambda x: cnt[x])
    
    data = []
    for k, v in df_c[df_c['cnt'] > 1].groupby('challenge_id'):
        contests = v['contest_id'].tolist()
        contest = 'c8ff662c97d345d2' if 'c8ff662c97d345d2' in contests else contests[0]
        data.append([
            k, contest, v['domain'].max(), v['subdomain'].max(), v['difficulty'].max(), 
            v['solved_submission_count'].sum(), v['solved_submission_count'].sum(), 1
        ])
    
    return pd.concat([df_c[df_c['cnt'] == 1], pd.DataFrame(data, columns=df_c.columns)]).drop('cnt', axis=1).reset_index(drop=True)

In [4]:
df_c = cleanupChallenges()
df_c['pct'] = df_c['solved_submission_count'] / df_c['total_submissions_count']

Grouped languages together (didn't use that information) and dropped the `contest_id` (have not seen a reson for using it.

In [5]:
lang_map = {
    'python3': 'python',
    'pypy3': 'python',
    'pypy': 'python',
    'java8': 'java',
    'mysql': 'sql',
    'oracle': 'sql',
    'tsql': 'sql',
    'db2': 'sql',
    'cpp14': 'cpp',
    'text_pseudo': 'text',
    'sbcl': 'lisp',
    'clisp': 'lisp',
    '["html", "js", "css"]': 'javascript',
    'coffeescript': 'javascript'
}
df_s['language'] = df_s['language'].apply(lambda x: lang_map.get(x, x))
df_s.drop('contest_id', axis=1, inplace=True)

**The main idea** behind the model was to construct the graph of people coming from one problem to another problem and use the probability of moving between the nodes for selecting the best problems. So I grouped users together, sorted their usage time and created an array of problems they solved (`[p1, p2, p3, ... pn]`. Now I had tuples of `(p1, p2)` and so on (slightly different because I had the requirement that the second value in the tuple is in the list of accepted challenges) and used these data to calculate statistics.

In [6]:
def getProgress():
    hacker_info = {}
    for hacker_id, tbl in df_s.groupby('hacker_id'):
        order = []
        for _, row in tbl.iterrows():
            chall_id, lang, solved = row['challenge_id'], row['language'], row['solved']
            order.append((chall_id, lang, solved))

        hacker_info[hacker_id] = order
    return hacker_info

def getTuples(arr):
    res = []
    for i in xrange(len(arr) - 1):
        for j in xrange(i + 1, len(arr)):
            if arr[j] in submission_ids_set:
                res.append((arr[i], arr[j]))
                break
    return res

def getStats():
    stats = defaultdict(list)
    for v in getProgress().itervalues():
        for prev_ids, next_id in getTuples([i[0] for i in v]):
            stats[prev_ids].append(next_id)

    return {k: Counter(v) for k, v in stats.iteritems()}

def cnt_to_df(cnt):
    return pd.DataFrame([(k, v) for k, v in cnt.iteritems()], columns=['qID', 'num']).sort_values('num', ascending=False).reset_index(drop=True)

In [7]:
hacker_ids = df_s['hacker_id'].unique().tolist()
submission_ids = df_c[df_c['contest_id'] == 'c8ff662c97d345d2']['challenge_id'].tolist()
submission_ids_set = set(submission_ids)

In [8]:
%%time
problem_dict_stats = getStats()

CPU times: user 45.5 s, sys: 483 ms, total: 46 s
Wall time: 46.4 s


In [9]:
def generateChallengesUserInfo():
    info_solved, info_tried = {}, {}
    for k, v in df_s.groupby('hacker_id'):
        solved = set((v[v['solved'] == 1])['challenge_id'].tolist())
        tried  = set((v[v['solved'] == 0])['challenge_id'].tolist()) - solved
        info_solved[k] = solved
        info_tried[k]  = tried
    return info_solved, info_tried

In [10]:
%%time
user_dict_solved, user_dict_tried = generateChallengesUserInfo()

CPU times: user 16.7 s, sys: 122 ms, total: 16.9 s
Wall time: 16.9 s


--------

In [11]:
df_g = df_s.merge(df_c, how='left', on='challenge_id')[['hacker_id', 'contest_id', 'challenge_id', 'language', 'solved', 'domain', 'subdomain', 'difficulty', 'pct']]
df_g.fillna('unknown', inplace=True)
df_c.fillna('unknown', inplace=True)

def getMostCommon():
    most_common = {}
    for k, v in df_g[(df_g['contest_id'] == 'c8ff662c97d345d2') & (df_g['solved'] == 1)].groupby(['domain', 'subdomain']):
        most_common[k] = [sId for sId, cnt in Counter(v['challenge_id'].tolist()).most_common(30)]
    return most_common

most_common = getMostCommon()

In [12]:
def getDomSubdom(probID):
    tmp = (df_c[df_c['challenge_id'] == probID])[['domain', 'subdomain']].head(1)
    return (tmp['domain'].values[0], tmp['subdomain'].values[0])

most_popular_challenges = df_g[(df_g['contest_id'] == 'c8ff662c97d345d2') & (df_g['solved'] == 1)].groupby('challenge_id').count()['hacker_id'].sort_values(ascending=False).tolist()

In [13]:
last_solved = df_s.groupby('hacker_id').last()['challenge_id']

data = []
for hacker_id in hacker_ids:
    last_problem = last_solved.loc[hacker_id]
    user_solved  = user_dict_solved[hacker_id]
    user_tried   = user_dict_tried[hacker_id]
    if last_problem in problem_dict_stats:
        probs        = problem_dict_stats[last_problem]

        tmp_df = cnt_to_df(probs)
        tmp_df = tmp_df[~tmp_df['qID'].isin(user_solved)]

        best_questions = tmp_df.head(10)['qID'].tolist()
        if len(best_questions) < 10:
            best_questions += [el for el in most_popular_challenges if el not in user_solved][:(10 - len(best_questions))]
            
#             add = [el for el in most_common[getDomSubdom(last_problem)] if el not in user_solved]
#             best_questions += add[:(10 - len(best_questions))]
#             if len(best_questions) < 10:
#                 best_questions += [el for el in submission_ids if el not in user_solved][:(10 - len(best_questions))]
    else:
        #best_questions = [el for el in submission_ids if el not in user_solved][:10]
        best_questions = [el for el in most_popular_challenges if el not in user_solved][:10]
    
    data.append([hacker_id] + best_questions)

In [14]:
pd.DataFrame(data).to_csv("output/task_2_11.csv", index=False, header=False)

- task_2_01 **164.97**
- task_2_02 **169.49**
- task_2_03 **169.49**
- task_2_04 **165.39**
- task_2_05 **166.61**
- task_2_06 **169.98**
- task_2_07 **167.14**
- task_2_08 **170.08**
- task_2_09 **168.54**
- task_2_10 **168.52**
- task_2_11 **163.97**

I actually tried reasonable amount of slight modifications of how I select the courses when my recommendation does not return the 10 expected courses. They all either yielded little result of were negative. What I should have done is to investigate what courses have highere probability of being solved by that particular user and improve their probability of being selected.