In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
train = pd.read_csv("/code/data/raw/train.csv")
train_labels = pd.read_csv("/code/data/raw/train_labels.csv")
test = pd.read_csv("/code/data/raw/test.csv")

In [3]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [4]:
test.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [5]:
train_labels.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


In [6]:
# encode title
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [7]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [8]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [104]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    durations = []
    for i, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        if test_set == True:
            second_condition = True
        else:
            if len(session)>1:
                second_condition = True
            else:
                second_condition= False
            
        if (session_type == 'Assessment') & (second_condition):
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features = user_activities_count.copy()
            features['session_title'] = session['title'].iloc[0] 
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
    
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
        
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
    
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1

            features.update(accuracy_groups)
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            features['accumulated_actions'] = accumulated_actions
            accumulated_accuracy_group += features['accuracy_group']
            accuracy_groups[features['accuracy_group']] += 1
    
            if test_set == True:
                all_assessments.append(features)
            else:
                if true_attempts + false_attempts > 0:
                    all_assessments.append(features)
            counter += 1

        accumulated_actions += len(session)

        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type

    if test_set:
        return all_assessments[-1] 

    return all_assessments

In [105]:
train_group = train.groupby('installation_id', sort=False)

In [110]:
a_list = []
for i, (a, b) in enumerate(train_group):
    data = get_data(user_sample=b)
    a_list += data
    
    if i==3:
        break
a_list

[{'Clip': 11,
  'Activity': 3,
  'Assessment': 0,
  'Game': 4,
  'session_title': 24,
  'accumulated_correct_attempts': 0,
  'accumulated_uncorrect_attempts': 0,
  'duration_mean': 0,
  'accumulated_accuracy': 0,
  'accuracy_group': 3,
  0: 0,
  1: 0,
  2: 0,
  3: 0,
  'accumulated_accuracy_group': 0,
  'accumulated_actions': 647},
 {'Clip': 14,
  'Activity': 4,
  'Assessment': 1,
  'Game': 6,
  'session_title': 9,
  'accumulated_correct_attempts': 1,
  'accumulated_uncorrect_attempts': 0,
  'duration_mean': 39.0,
  'accumulated_accuracy': 1.0,
  'accuracy_group': 0,
  0: 0,
  1: 0,
  2: 0,
  3: 1,
  'accumulated_accuracy_group': 3.0,
  'accumulated_actions': 1143},
 {'Clip': 14,
  'Activity': 4,
  'Assessment': 2,
  'Game': 6,
  'session_title': 24,
  'accumulated_correct_attempts': 1,
  'accumulated_uncorrect_attempts': 11,
  'duration_mean': 65.5,
  'accumulated_accuracy': 0.5,
  'accuracy_group': 3,
  0: 1,
  1: 0,
  2: 0,
  3: 1,
  'accumulated_accuracy_group': 1.5,
  'accumulated

In [111]:
pd.DataFrame(a_list)

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,11,3,0,4,24,0,0,0.0,0.0,3,0,0,0,0,0.0,647
1,14,4,1,6,9,1,0,39.0,1.0,0,0,0,0,1,3.0,1143
2,14,4,2,6,24,1,11,65.5,0.5,3,1,0,0,1,1.5,1230
3,24,9,4,10,24,2,11,41.25,0.5,2,2,0,0,2,1.5,2159
4,28,10,5,13,9,3,12,39.2,0.5,3,2,0,1,2,1.6,2586
5,15,6,0,4,13,0,0,0.0,0.0,3,0,0,0,0,0.0,1542
6,28,8,2,5,24,1,0,7.0,0.5,0,1,0,0,1,1.5,1898
7,30,8,3,6,9,1,4,35.0,0.333333,2,2,0,0,1,1.0,2022


In [86]:
def process(data, i):
    compiled_data = get_data(data)
    return compiled_data, i

compiled_data = Parallel(n_jobs=-1)(
    [delayed(process)(user_sample, i) for i, (_, user_sample) in enumerate(train.groupby('installation_id', sort=False))]
    )


In [87]:
compiled_data.sort(key=lambda x: x[1])
compiled_data = [t[0] for t in compiled_data]

In [88]:
pd.DataFrame(compiled_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,
2,Clip,Activity,Assessment,Game,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0.0,1.0,2.0,3.0,accumulated_accuracy_group,accumulated_actions
3,Clip,Activity,Assessment,Game,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0.0,1.0,2.0,3.0,accumulated_accuracy_group,accumulated_actions
4,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,,,,,,,,,,,,,,,,
16996,,,,,,,,,,,,,,,,
16997,,,,,,,,,,,,,,,,
16998,,,,,,,,,,,,,,,,


In [70]:
a = [1,2,3]
a.append(2)

In [73]:
a += "5"

In [74]:
a

[1, 2, 3, 2, '5']