#### DSB EDA
- refer from https://www.kaggle.com/gpreda/2019-data-science-bowl-eda

In [1]:
%run ../../utils.ipynb

done


In [39]:
import numpy as np
import pandas as pd
import os
import json
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline 
from collections import Counter

import scipy.stats

In [50]:
input_path = '../input/'

specs_df = pd.read_csv(os.path.join(input_path, 'specs.csv'))
train_df = pd.read_csv(os.path.join(input_path, 'train.csv'))
train_labels_df = pd.read_csv(os.path.join(input_path, 'train_labels.csv'))
test_df = pd.read_csv(os.path.join(input_path, 'test.csv'))

specs_df = reduce_mem_usage(specs_df)
train_df = reduce_mem_usage(train_df)
train_labels_df = reduce_mem_usage(train_labels_df)
test_df = reduce_mem_usage(test_df)

Mem. usage decreased to  0.01 Mb (0.0% reduction)
Mem. usage decreased to 778.73 Mb (18.2% reduction)
Mem. usage decreased to  0.49 Mb (48.2% reduction)
Mem. usage decreased to 79.40 Mb (18.2% reduction)


In [4]:
def encode_title(train, test, train_labels):
    # encode title
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels

In [5]:
train_df, test_df, train_labels_df, win_code, list_of_user_activities, list_of_event_code, activities_labels = encode_title(train_df, test_df, train_labels_df)

### get_data

In [6]:
 last_activity = 0
    
user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}

# new features: time spent in each activity
event_code_count = {eve: 0 for eve in list_of_event_code}
last_session_time_sec = 0

accuracy_groups = {0:0, 1:0, 2:0, 3:0}
all_assessments = []
accumulated_accuracy_group = 0
accumulated_accuracy = 0
accumulated_correct_attempts = 0 
accumulated_uncorrect_attempts = 0
accumulated_actions = 0
counter = 0
time_first_activity = float(train_df['timestamp'].values[0])
durations = []

In [49]:
train_df[train_df.event_code==4100]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
2228,25fa8af4,901acc108f55a5a1,2019-08-06 05:22:32.357000+00:00,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,35,Assessment,2
2308,14de4c5d,80d34a30c2998653,2019-08-06 05:24:50.323000+00:00,"{""distance"":10,""target_distances"":[5,6,7,8,9,1...",0006a69f,76,4100,114370,33,Game,2
2335,14de4c5d,80d34a30c2998653,2019-08-06 05:25:11.292000+00:00,"{""distance"":9,""target_distances"":[5,6,7],""corr...",0006a69f,103,4100,135341,33,Game,2
2375,14de4c5d,80d34a30c2998653,2019-08-06 05:25:37.207000+00:00,"{""distance"":3,""target_distances"":[5,6,7],""corr...",0006a69f,143,4100,161258,33,Game,2
2409,14de4c5d,80d34a30c2998653,2019-08-06 05:26:01.055000+00:00,"{""distance"":8,""target_distances"":[5,6,7],""corr...",0006a69f,177,4100,185103,33,Game,2
2797,25fa8af4,6bdf9623adc94d89,2019-08-06 05:38:08.036000+00:00,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,30,4100,18026,35,Assessment,2
3725,25fa8af4,9501794defd84e4d,2019-08-06 20:35:12.290000+00:00,"{""correct"":false,""stumps"":[3,2,5],""event_count...",0006a69f,29,4100,18484,35,Assessment,2
3734,25fa8af4,9501794defd84e4d,2019-08-06 20:35:16.846000+00:00,"{""correct"":true,""stumps"":[2,3,5],""event_count""...",0006a69f,38,4100,23043,35,Assessment,2
3773,14de4c5d,8043825259dc7ddd,2019-08-06 20:36:23.582000+00:00,"{""distance"":10,""target_distances"":[6,7,8,9,10]...",0006a69f,35,4100,46494,33,Game,2
3809,14de4c5d,8043825259dc7ddd,2019-08-06 20:36:54.289000+00:00,"{""distance"":9,""target_distances"":[4,5,6,7,8,9,...",0006a69f,71,4100,77194,33,Game,2


In [48]:
train_df[train_df.type=='Assessment']

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
2185,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,35,Assessment,2
2186,db02c830,901acc108f55a5a1,2019-08-06 05:22:01.400000+00:00,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,35,Assessment,2
2187,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:01.403000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,35,Assessment,2
2188,a52b92d5,901acc108f55a5a1,2019-08-06 05:22:05.242000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,35,Assessment,2
2189,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:05.244000+00:00,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,35,Assessment,2
2190,28ed704e,901acc108f55a5a1,2019-08-06 05:22:07.812000+00:00,"{""height"":4,""coordinates"":{""x"":329,""y"":550,""st...",0006a69f,6,4025,6475,35,Assessment,2
2191,a52b92d5,901acc108f55a5a1,2019-08-06 05:22:07.812000+00:00,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,7,3110,6475,35,Assessment,2
2192,9d29771f,901acc108f55a5a1,2019-08-06 05:22:07.816000+00:00,"{""description"":""That's one!"",""identifier"":""Dot...",0006a69f,8,3021,6475,35,Assessment,2
2193,c74f40cd,901acc108f55a5a1,2019-08-06 05:22:08.427000+00:00,"{""description"":""That's one!"",""identifier"":""Dot...",0006a69f,9,3121,7084,35,Assessment,2
2194,28ed704e,901acc108f55a5a1,2019-08-06 05:22:09.742000+00:00,"{""height"":1,""coordinates"":{""x"":176,""y"":526,""st...",0006a69f,10,4025,8400,35,Assessment,2


In [7]:
session = train_df[train_df.game_session=='901acc108f55a5a1']
i =1

session_type = session['type'].iloc[0]
session_title = session['title'].iloc[0]

all_attempts = session.query(f'event_code == {win_code[session_title]}')
# then, check the numbers of wins and the number of losses
true_attempts = all_attempts['event_data'].str.contains('true').sum()
false_attempts = all_attempts['event_data'].str.contains('false').sum()

all_attempts.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
2228,25fa8af4,901acc108f55a5a1,2019-08-06 05:22:32.357000+00:00,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,35,Assessment,2


In [8]:
features = user_activities_count.copy()
features.update(event_code_count.copy())
features['installation_id'] = session['installation_id'].iloc[-1]
features['session_title'] = session['title'].iloc[0]

features['accumulated_correct_attempts'] = accumulated_correct_attempts
features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
accumulated_correct_attempts += true_attempts 
accumulated_uncorrect_attempts += false_attempts

In [13]:
if durations == []:
    features['duration_mean'] = 0
else:
    features['duration_mean'] = np.mean(durations)
durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)

In [29]:
features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
accumulated_accuracy += accuracy
if accuracy == 0:
    features['accuracy_group'] = 0
elif accuracy == 1:
    features['accuracy_group'] = 3
elif accuracy == 0.5:
    features['accuracy_group'] = 2
else:
    features['accuracy_group'] = 1
features.update(accuracy_groups)
accuracy_groups[features['accuracy_group']] += 1

In [32]:
features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
accumulated_accuracy_group += features['accuracy_group']
features['accumulated_actions'] = accumulated_actions

In [35]:
test_set = False
if test_set:
    all_assessments.append(features)
elif true_attempts+false_attempts > 0:
    all_assessments.append(features)
counter += 1

In [40]:
n_of_event_codes = Counter(session['event_code'])
for key in n_of_event_codes.keys():
    event_code_count[key] += n_of_event_codes[key]

In [46]:
accumulated_actions += len(session)
if last_activity != session_type:
    user_activities_count[session_type] += 1
    last_activitiy = session_type 

In [47]:
all_assessments

[{'Clip': 0,
  'Activity': 0,
  'Assessment': 0,
  'Game': 0,
  2050: 0,
  4100: 0,
  2060: 0,
  4110: 0,
  2070: 0,
  2075: 0,
  2080: 0,
  2081: 0,
  2083: 0,
  3110: 0,
  3120: 0,
  3121: 0,
  4220: 0,
  4230: 0,
  5000: 0,
  4235: 0,
  5010: 0,
  4010: 0,
  4020: 0,
  4021: 0,
  4022: 0,
  4025: 0,
  4030: 0,
  4031: 0,
  3010: 0,
  4035: 0,
  4040: 0,
  3020: 0,
  3021: 0,
  4045: 0,
  2000: 0,
  4050: 0,
  2010: 0,
  2020: 0,
  4070: 0,
  2025: 0,
  2030: 0,
  4080: 0,
  2035: 0,
  2040: 0,
  4090: 0,
  4095: 0,
  'installation_id': '0006a69f',
  'session_title': 35,
  'accumulated_correct_attempts': 0,
  'accumulated_uncorrect_attempts': 0,
  'duration_mean': 0,
  'accumulated_accuracy': 0,
  'accuracy_group': 3,
  0: 0,
  1: 0,
  2: 0,
  3: 0,
  'accumulated_accuracy_group': 0,
  'accumulated_actions': 0}]

In [51]:
train_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK
5,1325467d,0848ef14a8dc6892,2019-09-06T17:55:06.279Z,"{""coordinates"":{""x"":583,""y"":605,""stage_width"":...",0001e90f,4,4070,9991,Sandcastle Builder (Activity),Activity,MAGMAPEAK
6,1325467d,0848ef14a8dc6892,2019-09-06T17:55:06.913Z,"{""coordinates"":{""x"":601,""y"":570,""stage_width"":...",0001e90f,5,4070,10622,Sandcastle Builder (Activity),Activity,MAGMAPEAK
7,1325467d,0848ef14a8dc6892,2019-09-06T17:55:07.546Z,"{""coordinates"":{""x"":250,""y"":665,""stage_width"":...",0001e90f,6,4070,11255,Sandcastle Builder (Activity),Activity,MAGMAPEAK
8,1325467d,0848ef14a8dc6892,2019-09-06T17:55:07.979Z,"{""coordinates"":{""x"":279,""y"":629,""stage_width"":...",0001e90f,7,4070,11689,Sandcastle Builder (Activity),Activity,MAGMAPEAK
9,1325467d,0848ef14a8dc6892,2019-09-06T17:55:08.566Z,"{""coordinates"":{""x"":839,""y"":654,""stage_width"":...",0001e90f,8,4070,12272,Sandcastle Builder (Activity),Activity,MAGMAPEAK
