# Flatten Equivacard event data

In [None]:
import plotnine
import pandas as pd
import numpy as np
import json
from datetime import datetime
import os

In [None]:
# Location of the raw event export, relative to the notebook's working directory.
json_file_path = "./data/middie_full.json"

In [None]:
# Load the raw export into plain Python objects for inspection.
# The encoding is pinned because JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open(json_file_path, encoding="utf-8") as f:
    event_json = json.load(f)

## Preview event shape

In [None]:
# First three elements of the raw export.
event_json[:3]

In [None]:
# Last three elements of the raw export.
event_json[-3:]

In [None]:
# Read the same export as a DataFrame, requesting an integer `timestamp` column.
json_df = pd.read_json(
    json_file_path,
    dtype={'timestamp': int},
)

## Align timestamps

In [None]:
# Raw value of the first `timestamp` entry.
json_df['timestamp'].values[0]

In [None]:
# Summary statistics of the `server_timestamp` column.
json_df.server_timestamp.describe()

In [None]:
# Summary statistics of the `timestamp` column.
json_df.timestamp.describe()

In [None]:
# Same summary, restricted to strictly positive `timestamp` values.
json_df.loc[json_df['timestamp'] > 0, 'timestamp'].describe()

In [None]:
# Same summary, restricted to `timestamp` values above 10^16.
json_df.loc[json_df['timestamp'] > 1e16, 'timestamp'].describe()

In [None]:
# Sanity check: read 1.662995e15 as microseconds since the epoch and print it as a UTC date.
# pd.to_datetime(..., unit='s') is used because datetime.utcfromtimestamp is deprecated
# since Python 3.12; both give a naive timestamp in UTC.
print(pd.to_datetime(1.662995e+15 / 1000000, unit='s').strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# Scratch calculation: the sample timestamp divided by 10^12.
1.662995e15 / 1e12

In [None]:
# Largest raw `timestamp`, divided by 10^9 (i.e. read as nanoseconds) and printed as a UTC date.
# pd.to_datetime replaces datetime.utcfromtimestamp, which is deprecated since Python 3.12.
print(pd.to_datetime(json_df.timestamp.values.max() / 1000000000, unit='s').strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# Earliest `server_timestamp`, divided by 1000 (i.e. read as milliseconds) and printed as a UTC date.
# pd.to_datetime replaces datetime.utcfromtimestamp, which is deprecated since Python 3.12.
print(pd.to_datetime(json_df.server_timestamp.min() / 1000, unit='s').strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# Latest `server_timestamp`, divided by 1000 (i.e. read as milliseconds) and printed as a UTC date.
# pd.to_datetime replaces datetime.utcfromtimestamp, which is deprecated since Python 3.12.
print(pd.to_datetime(json_df.server_timestamp.max() / 1000, unit='s').strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
def align_unix_convention(x):
    """Return one event's time as Unix seconds.

    Parameters
    ----------
    x : row-like object exposing ``server_timestamp`` and ``timestamp``.

    Returns
    -------
    float
        ``server_timestamp / 1000`` when ``server_timestamp`` is present.
        Otherwise the row's own ``timestamp`` rescaled by a power of ten so that
        it has ten integer digits, whatever unit it was recorded in.
    """
    if not np.isnan(x.server_timestamp):
        return x.server_timestamp / 1000

    # Use this row's own timestamp; the dataset-wide maximum would give every
    # row without a server timestamp the very same instant.
    ts = float(x.timestamp)
    if not np.isfinite(ts) or ts <= 0:
        # No usable value on the row (log10 is undefined for it): fall back to
        # the dataset maximum so that a finite value is still returned.
        ts = float(json_df.timestamp.values.max())

    # Number of decimal digits beyond a 13-digit (millisecond-sized) value;
    # dividing by 10**(above_12 + 3) therefore brings the value down to seconds.
    above_12 = np.floor(np.log10(ts) - 12)
    return ts / (10 ** (above_12 + 3))
    

In [None]:
# One Unix-seconds value per event, computed row by row.
json_df['unix_timestamp_combined'] = json_df.apply(align_unix_convention, axis=1)

In [None]:
# Convert the Unix seconds into naive UTC datetimes in a single vectorised call.
# This replaces a row-wise datetime.utcfromtimestamp (deprecated since Python 3.12)
# and also works when the frame is empty or holds missing values.
json_df['timestamp_combined'] = pd.to_datetime(json_df['unix_timestamp_combined'], unit='s')

In [None]:
# Summary statistics of the combined Unix-seconds column.
json_df.unix_timestamp_combined.describe()

In [None]:
# Earliest combined timestamp.
print(json_df['timestamp_combined'].min())

In [None]:
# Latest combined timestamp.
print(json_df['timestamp_combined'].max())

In [None]:
# Five randomly chosen events.
json_df.sample(n=5)

## Review event distribution across field categories

In [None]:
# Non-null count of every column, per `activity` value.
json_df.groupby(by='activity').count()

In [None]:
# Non-null count of every column, per `event_name` value.
json_df.groupby(by='event_name').count()

In [None]:
# Non-null count of every column, per `bucket` value.
json_df.groupby(by='bucket').count()

In [None]:
# Number of distinct users seen for each `activity` value.
json_df.groupby(by='activity').user_id.nunique()

In [None]:
# Keep only the events whose recorded `activity` is EQUIVACARDS.
equiv_events_df = json_df.loc[json_df['activity'].isin(['EQUIVACARDS'])]

In [None]:
# (rows, columns) of the EQUIVACARDS subset.
equiv_events_df.shape

## Flatten object columns

In [None]:
# Expand each event's `payload` object into one column per key.
payload_df = pd.json_normalize(equiv_events_df['payload'])

In [None]:
# Five randomly chosen flattened payloads.
payload_df.sample(n=5)

`best_play`, `board` and `p1_hand` hold nested objects, so they cannot be analysed as-is and need to be flattened first.

In [None]:
# Raw `best_play` values of five random rows that have one.
payload_df.loc[payload_df['best_play'].notna()].sample(n=5)['best_play'].values

`best_play` is a complex object; extracting value from it would require targeted processing.

The length of the best play and its first move seem the most likely to be useful.

In [None]:
# Spread each `best_play` value across positional columns (0, 1, ...).
best_play_df = payload_df['best_play'].apply(pd.Series)

In [None]:
# Prefix the positional columns so they stay identifiable once the frames are combined.
# Labels that already carry the prefix are left alone, so re-running this cell
# does not stack the prefix a second time.
best_play_df.columns = [
    item if str(item).startswith("best_play_turn_") else f"best_play_turn_{item}"
    for item in best_play_df.columns
]

In [None]:
# First rows that actually have a first best-play turn.
best_play_df.loc[best_play_df['best_play_turn_0'].notna()].head()

In [None]:
# Disabled: would store, per row, the number of non-null best-play turn columns.
# best_play_df["best_play_length"] = best_play_df.count(axis=1)

In [None]:
# First rows that actually have a first best-play turn.
best_play_df.loc[best_play_df['best_play_turn_0'].notna()].head()

In [None]:
# Expand the first best-play turn into one column per key.
best_play_0_df = pd.json_normalize(
    best_play_df['best_play_turn_0'],
    errors='ignore',
)

In [None]:
# Preview the first rows only rather than rendering the whole frame.
best_play_0_df.head()

In [None]:
# Prefix the first-turn columns so they stay identifiable once the frames are combined.
# Labels that already carry the prefix are left alone, so re-running this cell
# does not stack the prefix a second time.
best_play_0_df.columns = [
    item if str(item).startswith("best_play_turn_0_") else f"best_play_turn_0_{item}"
    for item in best_play_0_df.columns
]

In [None]:
# Raw `board` values of five random rows that have one.
payload_df.loc[payload_df['board'].notna()].sample(n=5)['board'].values

In [None]:
# Spread each `board` value across positional columns.
board_df = payload_df['board'].apply(pd.Series)

In [None]:
# Name the two board columns; this raises if the frame does not have exactly two.
board_df = board_df.set_axis(["board_left_card", "board_right_card"], axis=1)

In [None]:
# Five random rows that have a right-hand board card.
board_df.loc[board_df['board_right_card'].notna()].sample(n=5)

In [None]:
# Raw `p1_hand` values of five random rows that have one.
payload_df.loc[payload_df['p1_hand'].notna()].sample(n=5)['p1_hand'].values

In [None]:
# Spread each `p1_hand` value across positional columns (0, 1, ...).
p1_hand_df = payload_df['p1_hand'].apply(pd.Series)

In [None]:
# Prefix the positional card columns so they stay identifiable once the frames are combined.
# Labels that already start with `p1_hand_` (prefixed cards, or the derived
# `p1_hand_size`) are left alone, so re-running this cell does not rename them again.
p1_hand_df.columns = [
    item if str(item).startswith("p1_hand_") else f"p1_hand_card_{item}"
    for item in p1_hand_df.columns
]

In [None]:
# Hand size = number of non-null card slots on the row.
# Only the `p1_hand_card_*` columns are counted, so re-running this cell does not
# include the `p1_hand_size` column in its own count.
p1_hand_df['p1_hand_size'] = p1_hand_df.filter(regex=r'^p1_hand_card_').count(axis=1)

In [None]:
# Five random rows that hold at least a first card.
p1_hand_df.loc[p1_hand_df['p1_hand_card_0'].notna()].sample(n=5)

In [None]:
# Place the original events next to every flattened frame, column-wise.
# Each frame gets a fresh 0..n-1 index first so that rows line up by position.
equiv_flat_df = pd.concat(
    [
        frame.reset_index(drop=True)
        for frame in (
            equiv_events_df,
            payload_df,
            best_play_df,
            best_play_0_df,
            board_df,
            p1_hand_df,
        )
    ],
    axis=1,
)

In [None]:
# Five randomly chosen flattened events.
equiv_flat_df.sample(n=5)

## Correct Connect the Drops labels

In [None]:
# Launch and win/loss events only, in chronological order.
# NOTE(review): the next cell rebinds `game_temp_df` to the unfiltered, sorted frame,
# so this filtered version is never used downstream.
game_temp_df = (
    equiv_flat_df
    .loc[equiv_flat_df['event_name'].isin(["launched_connect_the_drops", "launched_equivacards", "user_won", "user_lost"])]
    .sort_values(by='timestamp_combined')
)

In [None]:
# Order every event chronologically before deriving the per-user running counters.
# A stable sort keeps events that share a timestamp in their existing relative order,
# which matters because the counters below depend on row order.
game_temp_df = equiv_flat_df.sort_values(by='timestamp_combined', kind='mergesort')

In [None]:
# Flag the rows that are a launch event of either game.
game_temp_df['game_start'] = game_temp_df['event_name'].isin(['launched_connect_the_drops', 'launched_equivacards'])

In [None]:
def correct_launch_activity(x):
    """Map a launch event name to the activity it starts.

    Returns "CONNECT_THE_DROPS" or "EQUIVACARDS" for the two launch event names
    and None for anything else. `correct_activity`, defined further down in this
    notebook, implements the same mapping.
    """
    if x == 'launched_equivacards':
        return "EQUIVACARDS"
    if x == 'launched_connect_the_drops':
        return "CONNECT_THE_DROPS"
    return None

In [None]:
# Activity implied by each launch row (None on every other row).
# A later cell recomputes this same column with `correct_activity`.
game_temp_df['corrected_activity'] = game_temp_df['event_name'].apply(correct_launch_activity)

In [None]:
# Flag the rows that close a game (win or loss).
game_temp_df['game_end'] = game_temp_df['event_name'].isin(['user_won', 'user_lost'])

In [None]:
# Per-user running counts of launches and of finished games, in current row order.
# NOTE(review): a cumulative sum includes the flagged row itself, so a win/loss row
# already carries the incremented `user_game_index` - confirm that is intended.
game_temp_df['user_launch_index'] = game_temp_df.groupby('user_id')['game_start'].cumsum()
game_temp_df['user_game_index'] = game_temp_df.groupby('user_id')['game_end'].cumsum()

In [None]:
def correct_activity(x):
    """Return the activity started by launch event `x`, or None for any other value.

    Implements the same mapping as `correct_launch_activity`, defined earlier in
    this notebook.
    """
    launch_to_activity = (
        ('launched_connect_the_drops', "CONNECT_THE_DROPS"),
        ('launched_equivacards', "EQUIVACARDS"),
    )
    for launch_event, activity in launch_to_activity:
        if x == launch_event:
            return activity
    return None

In [None]:
# Activity implied by each launch row (None on every other row).
game_temp_df['corrected_activity'] = game_temp_df['event_name'].apply(correct_activity)

In [None]:
# Lookup table: one row per launch, giving the activity for (user, launch index).
game_by_launch_df = game_temp_df.loc[
    game_temp_df['corrected_activity'].notna(),
    ['user_id', "user_launch_index", "corrected_activity"],
]

In [None]:
# Five randomly chosen launches.
game_by_launch_df.sample(n=5)

In [None]:
# Give every event the activity of the launch it follows, matched on (user, launch index).
# The join is the default inner join, so events with no matching launch row are dropped.
corrected_activity_events_df = game_temp_df.drop(columns='corrected_activity').merge(
    game_by_launch_df,
    on=["user_id", "user_launch_index"],
)

In [None]:
# Number of distinct users per corrected activity.
corrected_activity_events_df.groupby(by='corrected_activity').user_id.nunique()

In [None]:
# Keep only the events that follow an Equivacards launch.
# The explicit copy makes this an independent frame, so later cells can add columns
# to it without triggering pandas' chained-assignment (SettingWithCopy) warning.
corrected_equivacards_events = corrected_activity_events_df[
    corrected_activity_events_df.corrected_activity == "EQUIVACARDS"
].copy()

In [None]:
# Summary statistics of the corrected Equivacards events.
corrected_equivacards_events.describe()

In [None]:
# Non-null count of every column, per `event_name` value.
corrected_equivacards_events.groupby(by='event_name').count()

In [None]:
# Number of distinct users per `event_name` value.
corrected_equivacards_events.groupby(by='event_name').user_id.nunique()

In [None]:
# Column dtypes of the corrected Equivacards events.
corrected_equivacards_events.dtypes

In [None]:
# Column labels in sorted order.
corrected_equivacards_events.columns.sort_values()

In [None]:
# Persist the corrected Equivacards events as a JSON array with one object per row.
# The first positional argument of `to_json` is the output target, so the path goes
# there and the row-wise layout is requested through `orient`.
os.makedirs('./data', exist_ok=True)
corrected_equivacards_events.reset_index().to_json('./data/results.json', orient='records')

## Review of game time and distribution of events per game 

In [None]:
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap, scale_x_date, geom_line,facet_grid, theme, element_text, labs, element_blank, ggtitle, geom_bar

In [None]:
# One point per event, joined by a line per user, plotted against the event time.
(
    ggplot(
        corrected_equivacards_events,
        aes('timestamp_combined', 'factor(user_id)', color='factor(user_id)'),
    )
    + geom_point(show_legend=False)
    + geom_line(show_legend=False)
    + labs(x="date", y="User Id")
    + ggtitle("Game by user_id vs date")
    + theme(figure_size=(6, 4), axis_text_x=element_text(rotation=90, hjust=1))
)

In [None]:
# Stacked event counts per game index, one facet row per user.
(
    ggplot(
        corrected_equivacards_events,
        aes('user_game_index', fill='factor(event_name)'),
    )
    + geom_bar(stat='count', position='stack')
    + facet_grid('user_id ~', scales='free')
    + theme(
        figure_size=(4, 10),
        axis_text_x=element_text(rotation=90, hjust=1),
        strip_text_y=element_text(angle=0),
    )
    + labs(x="User Game Index", y="Event count")
    + ggtitle("Game events by user_id")
)

## Cursory comparison of best move 0 to taken move 0

In [None]:
# Compare the card on each row with the first best-play label seen earlier for the same user.
#  - previous_best_move_0: the `best_play_turn_0_label` found two rows earlier within
#    the same user, with ',' replaced by '.'; None when that value is not a string.
#    NOTE(review): the offset of 2 presumably reflects how many events separate the
#    best-play record from the turn it refers to - confirm against the event stream.
#  - made_best_move_0: True where that label equals the row's `card`.
# `assign` builds a new frame instead of writing into a filtered slice, which avoids
# pandas' chained-assignment warning; the equality is vectorised so it also works
# when the frame is empty.
corrected_equivacards_events = corrected_equivacards_events.assign(
    previous_best_move_0=lambda events: (
        events.groupby('user_id')['best_play_turn_0_label']
        .shift(2)
        .apply(lambda label: label.replace(',', '.') if isinstance(label, str) else None)
    ),
    made_best_move_0=lambda events: events['previous_best_move_0'] == events['card'],
)

In [None]:
# First 15 rows of the columns involved in the best-move comparison.
corrected_equivacards_events.loc[
    :,
    ['event_name', 'card', 'best_play_turn_0_label', 'previous_best_move_0', 'made_best_move_0'],
].head(15)

In [None]:
# Per user: how many `user_turn` events did / did not match the best first move.
(
    corrected_equivacards_events
    .loc[corrected_equivacards_events['event_name'] == 'user_turn']
    .groupby(['user_id', 'made_best_move_0'])['_id']
    .count()
)

In [None]:
# Preview the first rows only rather than rendering the whole frame.
corrected_equivacards_events.head()