# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json 

# Raise pandas' display limits (rows, columns, line width) so that wide or
# long frames are rendered without truncation while exploring.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Load Data (a sample)

In [2]:
# Location of the training events CSV, relative to the notebook's directory.
path = './data_science_bowl_2019/train.csv'

In [19]:
# Load only the first 10,000 rows of the CSV and convert the `timestamp`
# column to datetime while reading.
sample = pd.read_csv(
    path,
    parse_dates=['timestamp'],
    nrows=10000,
)

In [20]:
# Preview the first rows of the loaded sample to see every column at a glance.
sample.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [5]:
# Summarise the sample: per-column non-null counts, dtypes and memory usage.
# TODO(review): re-run this cell -- its saved output appears to predate the
# `parse_dates` load and still reports `timestamp` as object.
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
event_id           10000 non-null object
game_session       10000 non-null object
timestamp          10000 non-null object
event_data         10000 non-null object
installation_id    10000 non-null object
event_count        10000 non-null int64
event_code         10000 non-null int64
game_time          10000 non-null int64
title              10000 non-null object
type               10000 non-null object
world              10000 non-null object
dtypes: int64(3), object(8)
memory usage: 859.5+ KB


# Explore Columns

In [10]:
# Keep the column labels as a plain Python list and echo it as the cell output.
all_cols = list(sample.columns)
all_cols

['event_id',
 'game_session',
 'timestamp',
 'event_data',
 'installation_id',
 'event_count',
 'event_code',
 'game_time',
 'title',
 'type',
 'world']

In [7]:
# Decode the first row's `event_data` JSON string into a dict.
# `.iloc[0]` selects by position, so this keeps working even if `sample`
# is later filtered or re-indexed and no longer has a row labelled 0.
json.loads(sample['event_data'].iloc[0])

{'event_code': 2000, 'event_count': 1}

## 0. event_id

In [8]:
# Ten most frequent event ids in the sample, with their row counts.
sample['event_id'].value_counts().head(10)

cf82af56    438
5e812b27    428
1325467d    405
b2dba42b    327
1bb5fbdb    326
56bcd38d    284
0a08139c    269
71fe8f75    268
76babcde    223
9ee1c98c    219
Name: event_id, dtype: int64

Comments: 
- values = strings 
- Randomly generated unique identifier for the event type. Maps to event_id column in specs table.

## 1. game_session

In [15]:
# Ten game sessions with the most events in the sample.
sample['game_session'].value_counts().head(10)

f11eb823348bfa23    741
bd0036e83bb48f02    291
0848ef14a8dc6892    267
b6df5bbfc96a77fe    254
6e123d4e3fab4871    250
ff48f8654fa01dae    243
1c691f67ef07d047    221
585ea3c14d101571    215
28b2cfc5461686e6    212
2ecc82dcd2847233    204
Name: game_session, dtype: int64

Comments: 
- values = strings 
- Randomly generated unique identifier grouping events within a single game or video play session.

## 2. timestamp

In [23]:
# First ten timestamps (parsed to datetime when the CSV was loaded).
sample['timestamp'].head(10)

0   2019-09-06 17:53:46.937
1   2019-09-06 17:54:17.519
2   2019-09-06 17:54:56.302
3   2019-09-06 17:54:56.387
4   2019-09-06 17:55:03.253
5   2019-09-06 17:55:06.279
6   2019-09-06 17:55:06.913
7   2019-09-06 17:55:07.546
8   2019-09-06 17:55:07.979
9   2019-09-06 17:55:08.566
Name: timestamp, dtype: datetime64[ns]

Comments: 
- values = timestamps 
- Client-generated datetime

## 3. event_data

In [31]:
# Decode the JSON payload of the first ten events into dicts.
# Slice *before* applying `json.loads` so only the displayed rows are parsed
# rather than the whole column; the resulting Series is identical.
sample['event_data'].head(10).apply(json.loads)

0               {'event_code': 2000, 'event_count': 1}
1               {'event_code': 2000, 'event_count': 1}
2    {'version': '1.0', 'event_count': 1, 'game_tim...
3    {'description': 'Let's build a sandcastle! Fir...
4    {'description': 'Let's build a sandcastle! Fir...
5    {'coordinates': {'x': 583, 'y': 605, 'stage_wi...
6    {'coordinates': {'x': 601, 'y': 570, 'stage_wi...
7    {'coordinates': {'x': 250, 'y': 665, 'stage_wi...
8    {'coordinates': {'x': 279, 'y': 629, 'stage_wi...
9    {'coordinates': {'x': 839, 'y': 654, 'stage_wi...
Name: event_data, dtype: object

Comments: 
- values = strings representing dicts
- apply `json.loads` to convert values to dicts
- Semi-structured JSON formatted string containing the events parameters. Default fields are: event_count, event_code, and game_time; otherwise fields are determined by the event type.


## 4. installation_id

In [35]:
# Number of events recorded per installation in the sample.
sample['installation_id'].value_counts()

0006a69f    3801
0006c192    2224
0001e90f    1357
0016b7cc     947
00129856     830
0009a5a9     412
00195df7     185
000447c4     181
0011edc8      63
Name: installation_id, dtype: int64

Comments: 
- values = strings 
- Randomly generated unique identifier grouping game sessions within a single installed application instance.


## 5. event_count

In [39]:
# Ten most common `event_count` values.
# The value_counts() result is indexed by the integer counter values, so an
# integer slice such as `[:10]` is ambiguous between labels and positions;
# `.head(10)` is always positional and returns the top ten rows.
sample['event_count'].value_counts().head(10)

1     201
3      85
2      85
4      83
5      82
6      82
10     81
15     81
7      81
9      81
Name: event_count, dtype: int64

Comments: 
- values = int
- Incremental counter of events within a game session (offset at 1). Extracted from event_data.
- Could help us track the start and end of a session (TODO: verify).

## 6. event_code

In [45]:
# First ten event codes, in file order.
sample['event_code'].head(10)

0    2000
1    2000
2    2000
3    3010
4    3110
5    4070
6    4070
7    4070
8    4070
9    4070
Name: event_code, dtype: int64

Comments: 
- values = int
- Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from event_data.

## 7. game_time

In [48]:
# First ten `game_time` values, in file order.
sample['game_time'].head(10)

0        0
1        0
2        0
3       53
4     6972
5     9991
6    10622
7    11255
8    11689
9    12272
Name: game_time, dtype: int64

Comments: 
- values = int
- Time in milliseconds since the start of the game session. Extracted from event_data.

## 8. title

In [53]:
# First ten distinct titles, in order of first appearance.
sample['title'].unique()[:10].tolist()

['Welcome to Lost Lagoon!',
 'Magma Peak - Level 1',
 'Sandcastle Builder (Activity)',
 'Scrub-A-Dub',
 'Magma Peak - Level 2',
 'Dino Drink',
 'Tree Top City - Level 1',
 'Ordering Spheres',
 'Watering Hole (Activity)',
 'Slop Problem']

Comments: 
- values = strings
- Title of the game or video.

## 9. type

In [59]:
# Number of events per media type.
sample['type'].value_counts()

Activity      5332
Game          3906
Assessment     647
Clip           115
Name: type, dtype: int64

Comments: 
- values = strings
- Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'.

## 10. world

In [62]:
# Number of events per world (section of the application).
sample['world'].value_counts()

MAGMAPEAK       5847
TREETOPCITY     2876
CRYSTALCAVES    1260
NONE              17
Name: world, dtype: int64

Comments: 
- values = strings
- The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), 'TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight).