### Purpose of this kernel
* Clarify the structure of `event_data`, which is stored in JSON format.
* The structure is determined by the combination of `title` and `event_code`, so each pattern of the JSON data is shown as a pandas DataFrame.

### Finding
* I found that the JSON structure is the same for all rows that share the same `title` and `event_code`.

In [None]:
import json
import numpy as np
import pandas as pd
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
from collections import Counter
from pathlib import Path
import pickle

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score

# pandas display options: cap cell text at 100 characters, never elide rows
# or columns, and print floats with thousands separators and 5 decimals.
for option_name, option_value in (
    ("display.max_colwidth", 100),
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.float_format", '{:,.5f}'.format),
):
    pd.set_option(option_name, option_value)

def unpickle(filename):
    """Load and return the object stored in the pickle file ``filename``."""
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

def to_pickle(filename, obj):
    """Write ``obj`` to ``filename`` using the highest available pickle protocol."""
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
# Plotting / graph drawing
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
# from matplotlib_venn import venn2, venn2_circles
from matplotlib import animation as ani
from IPython.display import Image

# Force an edge colour on patch artists (matplotlib rcParam "patch.force_edgecolor").
plt.rcParams["patch.force_edgecolor"] = True
#rc('text', usetex=True)
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
# Seaborn theme: "whitegrid" style, "muted" palette, colour shorthand codes on;
# the second call then overrides the grid line style with dashes.
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
# Named colours looked up in seaborn's xkcd colour table.
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

# IPython magics: draw figures inline and use the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Loading data

In [None]:
# Read the training event log; nrows=None places no limit on the rows read.
train_csv_path = "/kaggle/input/data-science-bowl-2019/train.csv"
train = pd.read_csv(train_csv_path, nrows=None)

# Title list
Display all titles in the training data, ordered by row count. In this kernel, we focus on `Game` titles (followed by `Activity` titles) to show the structure of `event_data`.

In [None]:
# Number of rows per title.
cnt = train.title.value_counts()
# Name the two columns explicitly rather than relying on the labels that
# reset_index() derives from the Series: those labels differ between pandas
# versions ("index" before 2.0, the source column name from 2.0 on), and the
# old `right_on="index"` merge raises KeyError on pandas >= 2.0.
title_counts = cnt.rename_axis("title").reset_index(name="count")
# One row per distinct (title, type) pair with that title's row count attached,
# most frequent title first.
title_type = train[["title", "type"]].drop_duplicates()
title_type = title_type.merge(title_counts, on="title", how="left")
title_type.sort_values("count", ascending=False, inplace=True)
title_type

# Function for display

In [None]:
def display_json_structure(title_list):
    """Show the ``event_data`` JSON layout of each title, split by ``event_code``.

    For every title in ``title_list`` this prints a banner and then, for each
    event code found under that title, parses every ``event_data`` string as
    JSON and displays the first three parsed rows. It finishes each title
    with one summary table: a row per event code, a boolean column per JSON
    key (True when the key occurs for that code) and ``num_item``, the number
    of those keys that are present.

    Reads the module-level ``train`` DataFrame (columns ``title``,
    ``event_code`` and ``event_data``) and writes through ``print`` and
    ``display``; nothing is returned.

    Args:
        title_list: Iterable of title values to look up in ``train.title``.
    """
    for title in title_list:
        print("="*80,)
        print("="*30, f" title: {title} ", "="*30,)
        print("="*80,)
        train_focused_event = train[train.title == title]

        # Take the distinct codes straight from the column. Going through
        # value_counts().reset_index() depends on the column label pandas
        # picks for the old index, which changed in pandas 2.0.
        event_codes = np.sort(train_focused_event.event_code.dropna().unique())

        keys_by_code = {}
        for event_code in event_codes:
            df_s = train_focused_event[train_focused_event.event_code == event_code]
            # Parse every row, not only the three that are displayed, so the
            # key set below covers the whole group.
            event_data_df = pd.DataFrame([json.loads(d) for d in df_s.event_data.values])
            print(f"event_code: {event_code}")
            display(event_data_df.head(3))
            keys_by_code[event_code] = set(event_data_df.columns)

        if not keys_by_code:
            # No rows for this title in `train`: there is nothing to summarise.
            continue

        # "event_code" is reserved for the code column of the summary, so a
        # JSON key with that same name gets no presence column of its own
        # (the earlier implementation overwrote it with the code as well).
        json_keys = set().union(*keys_by_code.values()) - {"event_code"}
        code_count = {
            key: sum(key in keys for keys in keys_by_code.values())
            for key in json_keys
        }
        # Keys shared by the most event codes come first; ties are broken by
        # name so the column order is reproducible between runs.
        ordered_keys = sorted(json_keys, key=lambda k: (-code_count[k], str(k)))

        # event_code is placed first explicitly instead of relying on its
        # column sum happening to be the largest.
        summary = {"event_code": list(keys_by_code)}
        for key in ordered_keys:
            summary[key] = [key in keys for keys in keys_by_code.values()]
        item_name_df = pd.DataFrame(summary)
        item_name_df["num_item"] = item_name_df[ordered_keys].sum(axis=1)
        display(item_name_df)

# Displaying JSON data of `Game` titles, grouped by "title" and "event_code"

In [None]:
# Keep only the titles whose type is "Game".
is_game = title_type["type"] == "Game"
game_title = title_type.loc[is_game]
game_title

In [None]:
# Show the event_data layout for every Game title.
display_json_structure(game_title["title"].values)

# Displaying JSON data of "Activity" titles, grouped by "title" and "event_code"

In [None]:
# Keep only the titles whose type is "Activity".
is_activity = title_type["type"] == "Activity"
activity_title = title_type.loc[is_activity]
activity_title

In [None]:
# Show the event_data layout for every Activity title.
display_json_structure(activity_title["title"].values)