In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import gc

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Show full column contents instead of truncating them.
# `None` is the supported "no limit" value; the legacy -1 sentinel has been
# deprecated since pandas 1.0 and is no longer accepted by current pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
# Directory holding the competition CSV files. The trailing slash matters:
# file names are appended to this string by plain concatenation.
path = '/kaggle/input/data-science-bowl-2019/'

In [None]:
# Read each competition table from the input directory into its own DataFrame.
train = pd.read_csv(f'{path}train.csv')
train_labels = pd.read_csv(f'{path}train_labels.csv')
test = pd.read_csv(f'{path}test.csv')
spcs = pd.read_csv(f'{path}specs.csv')
sub = pd.read_csv(f'{path}sample_submission.csv')

In [None]:
# Display the first row of `train` to see which columns are available.
train.head(1)

Let's get a sense of what the features are.

First we look at unique values for the features. We see that each installation_id has on average 18 game_sessions. There are 384 event_ids but only 42 event_codes, so on average around 9 event_ids per event_code.

In [None]:
# Report how many distinct values each identifier column holds, then (after a
# blank separator) the same for the descriptive columns.
for column in ('installation_id', 'game_session', 'event_id'):
    print(f'# unique {column}:', len(train[column].unique()))
print('\n')
for column in ('event_code', 'title'):
    print(f'# unique {column}:', len(train[column].unique()))

In [None]:
# Display the distinct values of the `title` column.
train.title.unique()

Let's look at the event types more closely. We see that the majority of events are games or activities, followed by assessments and clips. This is likely because activities and games are very likely interactive, triggering many events.

In [None]:
# Bar chart of the number of events per value of the `type` column.
# The column is passed as `x=`: seaborn >= 0.12 binds the first positional
# argument to `data`, so a positional Series is no longer read as the x variable.
plt.figure(figsize=[5,4])
plt.title('TYPE')
sns.countplot(x=train['type'])
_=plt.xticks(rotation=90)

Let's produce bar plots of the event titles and event codes by event type (Clip, Activity, Game, or Assessment)

We see that the assessments are pretty balanced. Activities are also pretty balanced outside of Sandcastle Builder and Bottle Filler. Games are likewise pretty balanced outside of Scrub-A-Dub and Chow Time. There seems to be a larger variance in the number of events associated with different clips.

Because we do not have a good sense of what the events are, this analysis needs to be treated with care. For example, Sandcastle Builder and Bottle Filler may have more events because they have more interactive options.

Clips only have event code 2000. This makes sense; the description says that 2000 always means start. Since clips are not interactive, there are no other events.

In [None]:
# One panel per event type, each counting the events recorded for every title
# of that type.
# The grid is sized from the number of types actually present rather than a
# hard-coded 4, so an extra or missing type cannot overflow or underfill it.
event_types = train['type'].unique()
plt.figure(figsize=[5*len(event_types),6])
for i,event_type in enumerate(event_types,1):
    plt.subplot(1,len(event_types),i)
    plt.title(event_type)
    # Keyword `x=` is required: seaborn >= 0.12 treats the first positional
    # argument as `data`, not as the variable to count.
    sns.countplot(x=train.loc[train['type']==event_type,'title'])
    _=plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
# One panel per event type, each counting events per event_code (codes sorted
# ascending on the x axis).
# The grid follows the number of types present instead of assuming 4; the
# per-panel width of 6.25 reproduces the original 25-inch figure for 4 types.
event_types = train['type'].unique()
plt.figure(figsize=[6.25*len(event_types),5])
for i,event_type in enumerate(event_types,1):
    plt.subplot(1,len(event_types),i)
    plt.title(event_type)
    # Only the event_code column is needed, so avoid copying the whole sub-frame.
    type_codes = train.loc[train['type']==event_type,'event_code']
    # Keyword `x=` is required: seaborn >= 0.12 treats the first positional
    # argument as `data`, not as the variable to count.
    sns.countplot(x=type_codes,order=sorted(type_codes.unique()))
    _=plt.xticks(rotation=90)
plt.tight_layout()
# Release the last per-type slice; it is not needed once the figure is drawn.
del type_codes

We can get an even more granular view by looking at the unique event_codes for each event title.

In [None]:
# For every event type except Clip, draw one figure with a panel per title
# showing the event_code counts for that title (at most 5 panels per row).
for event_type in train['type'].unique():
    # Clips are skipped: the notebook notes they only ever carry event_code 2000.
    if event_type=='Clip':
        continue
    df = train[train['type']==event_type]
    titles = df['title'].unique()
    n_cols = min(5,len(titles))
    # Ceiling division. The previous `N//5+1` added an empty row (and wasted
    # figure height) whenever the number of titles was a multiple of 5.
    n_rows = (len(titles)+n_cols-1)//n_cols
    plt.figure(figsize=[5*n_cols,4*n_rows])
    for i,title in enumerate(titles,1):
        plt.subplot(n_rows,n_cols,i)
        plt.title(title)
        # Filter once and reuse the result for both the data and the bar order.
        title_codes = df.loc[df['title']==title,'event_code']
        # Keyword `x=` is required: seaborn >= 0.12 treats the first positional
        # argument as `data`, not as the variable to count.
        sns.countplot(x=title_codes,order=sorted(title_codes.unique()))
        _=plt.xticks(rotation=90)
    plt.tight_layout()
# Release the last per-type sub-frame once all figures are drawn.
del df

A submission corresponds to event_code 4100 for all assessments except Bird Measurer, where the event_code is 4110. An exception needs to be written for the title 'Air Show', which includes event_code 4100 but is not an Assessment.

The event_data will include a property "correct", which can be either true or false.

We can group the event titles by world.

In [None]:
# For each world, print its name followed by the distinct titles found in it.
for world in train['world'].unique():
    world_titles = train.loc[train['world']==world,'title'].unique()
    print('WORLD:',world)
    print(world_titles)

Let's try to get to know our users better.

The number of game sessions is not very high and starts to decline after 4 game sessions.

The number of events per game session also decreases but has a more complicated shape between 2 and 100. We ignore game sessions with only one event, since those are likely to be Clips.

In [None]:
# Histogram of the number of distinct game sessions per installation_id:
# 20 equal-width bins over 0-1300, with counts on a log scale.
# `sns.distplot` is deprecated, so the plot uses `sns.histplot`; `shrink`
# takes over the bar-width role that `rwidth` had.
sessions_per_install = train.groupby('installation_id')['game_session'].nunique()
ax = sns.histplot(x=sessions_per_install,bins=np.linspace(0,1300,21),shrink=0.9)
ax.set(xlabel='game sessions per installation_id',ylabel='number of installation_ids')
ax.set_yscale('log')

In [None]:
# Zoomed-in histogram of distinct game sessions per installation_id:
# unit-width bins over 0-20, counts on a linear scale.
# `sns.distplot` is deprecated, so the plot uses `sns.histplot`; `shrink`
# takes over the bar-width role that `rwidth` had.
sessions_per_install = train.groupby('installation_id')['game_session'].nunique()
ax = sns.histplot(x=sessions_per_install,bins=np.linspace(0,20,21),shrink=0.9)
ax.set(xlabel='game sessions per installation_id',ylabel='number of installation_ids');

In [None]:
# Histogram of the number of events per (installation_id, game_session) pair:
# 35 equal-width bins over 2-3502, with counts on a log scale. Sessions with a
# single event fall below the first bin edge and are therefore left out.
# `sns.distplot` is deprecated, so the plot uses `sns.histplot`; `shrink`
# takes over the bar-width role that `rwidth` had.
events_per_session = train.groupby(['installation_id','game_session'])['event_id'].count()
ax = sns.histplot(x=events_per_session,bins=np.linspace(2,3502,36),shrink=0.9)
ax.set(xlabel='events per game session',ylabel='number of game sessions')
ax.set_yscale('log')

In [None]:
# Zoomed-in histogram of events per (installation_id, game_session) pair:
# unit-width bins over 2-102, counts on a linear scale. Sessions with a single
# event fall below the first bin edge and are therefore left out.
# `sns.distplot` is deprecated, so the plot uses `sns.histplot`; `shrink`
# takes over the bar-width role that `rwidth` had.
events_per_session = train.groupby(['installation_id','game_session'])['event_id'].count()
ax = sns.histplot(x=events_per_session,bins=np.linspace(2,102,101),shrink=0.9)
ax.set(xlabel='events per game session',ylabel='number of game sessions');

Finally, let us take a look at the train labels.

We see that the distribution of incorrect answers roughly follows a decreasing exponential trend. That is, the number of game sessions with n incorrect answers decreases exponentially with n. Indeed, most game sessions fall in accuracy group 3, as expected from this trend. By looking at the actual accuracy, we see that it is very bimodal; most users either get the answer the first time or not at all. The number of people who continue to attempt the problem decreases with the number of incorrect answers.

In [None]:
# Bar chart of how many labelled sessions had each num_incorrect value from
# 0 to 89, with counts on a log scale. Values outside that range are not drawn
# because they are absent from `order`.
plt.figure(figsize=[12,6])
plt.title('NUMBER INCORRECT')
# Keyword `x=` is required: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=train_labels['num_incorrect'],order=list(range(90)))
plt.yscale('log')
# Bars sit at positions 0..89 in `order`, so label every fifth position with its value.
_,_=plt.xticks(ticks = list(range(0,90,5)), labels = list(range(0,90,5)))

In [None]:
# Two side-by-side panels: counts per accuracy_group (0-3) on the left, and a
# histogram of accuracy in 20 equal-width bins over [0, 1] on the right.
plt.figure(figsize=[5*2,4])
plt.subplot(1,2,1)
plt.title('ACCURACY GROUP')
# Keyword `x=` is required: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
_=sns.countplot(x=train_labels['accuracy_group'],order=list(range(4)))
plt.subplot(1,2,2)
plt.title('ACCURACY')
# `sns.distplot` is deprecated, so the plot uses `sns.histplot`; `shrink`
# takes over the bar-width role that `rwidth` had.
_=sns.histplot(x=train_labels['accuracy'],bins=np.linspace(0,1,21),shrink=0.8)