In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import datetime
from tqdm.auto import tqdm
from collections import Counter
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
# %load_ext line_profiler

This is my submission to the Kaggle Data Science Bowl 2019. All code here is released under the open-source license referred to in the Kaggle competition rules.


## Inspect the data

In [None]:
train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv') # raw event log for the training users

In [None]:
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv') # raw event log for the test users

In [None]:
# report the size of the training event log (shape unpacks to rows, columns)
print('Training.csv file has {} rows and {} columns'.format(*train.shape))

In [None]:
# report the size of the test event log (shape unpacks to rows, columns)
print('Test.csv file has {} rows and {} columns'.format(*test.shape))

In [None]:
# event specification table
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
print('specs.csv file has {} rows and {} columns'.format(*specs.shape))

In [None]:
# ground-truth accuracy groups supplied by the competition
train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
print('train_labels.csv file has {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

In [None]:
# template showing the expected submission layout
sample = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
print('sample submission file has {} rows and {} columns'.format(*sample.shape))

## EDA

let's see how many installation_ids there are

In [None]:
# every distinct installation_id found in the training events
users = train['installation_id'].unique()
len(users)

There are 17000 unique installation_id's in the training set. Let's examine the breakdown of event types for an example `installation_id` (from the first row in the dataset)

In [None]:
# Event counts per type for one example user.
# groupby(...).size() is taken WITHOUT as_index=False: since pandas 1.1 the
# as_index=False variant returns a DataFrame, on which reset_index(name=...) fails.
types = (
    train.loc[train['installation_id'] == '0001e90f']
    .groupby('type')
    .size()
    .reset_index(name='counts')
)
types

Below I calculate the total time, in minutes, spent on each type of event for each installation_id. When I first tried to do this, I called `train.loc` on the full training set for every lookup instead of first narrowing to the user's own rows, which was a serious mistake from a code-design (and performance) point of view.

In [None]:
def minutes_per_event_type(user_events):
    """Total minutes one user spent in each event type.

    A session's duration is the gap between its earliest and latest
    timestamp; durations are summed per event ``type``.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Events already narrowed to a single installation_id, with the
        columns ``type``, ``game_session`` and ``timestamp``.

    Returns
    -------
    pandas.Series
        Minutes indexed by event type; types the user never touched are absent.
        Summing the Series gives the user's total time.
    """
    stamped = user_events.assign(timestamp=pd.to_datetime(user_events['timestamp']))
    # first and last event of every (type, session) pair
    spans = stamped.groupby(['type', 'game_session'])['timestamp'].agg(['min', 'max'])
    minutes = (spans['max'] - spans['min']).dt.total_seconds() / 60.0
    return minutes.groupby(level='type').sum()

# usage: minutes_per_event_type(train.loc[train['installation_id'] == users[0]])

## to do list

1. calculate ground-truth

2. feature definitions - wrangling

3. Models and prediction

4. Calculate quadratic weighted kappa (qwk)

5. Submissions, debugging, optimization

6. Post this to your github at some point as an example of your work

## Ground truth

Note that the Bird Measurer assessment has two parts; however, we consider it to be passed if just the first part is passed, which is signalled by the 4110 event code. Here, I reproduce the provided train_labels by parsing the train data set and calculating the accuracy myself.

In [None]:
def _attempt_rows(events):
    """Rows that are assessment attempts: code 4100 (except Bird Measurer) or code 4110."""
    is_assessment = events['type'] == 'Assessment'
    is_bird = events['title'] == 'Bird Measurer (Assessment)'
    is_attempt = ((events['event_code'] == 4100) & ~is_bird) | (events['event_code'] == 4110)
    return events.loc[is_assessment & is_attempt].copy()

assessments = _attempt_rows(train)
assessments['num_correct'] = True

test_assessments = _attempt_rows(test)
# snapshot taken before the helper columns are added; used later for the union
test_assessments_for_union = test_assessments.copy()
test_assessments['num_correct'] = True

In [None]:
# (rows, columns) of the test-set assessment attempts
test_assessments.shape

In [None]:
# An attempt starts out as correct and is flipped to incorrect when its
# event_data payload contains ":false,"; num_incorrect is the 0/1 complement.
for attempts in (assessments, test_assessments):
    failed = attempts['event_data'].str.contains(":false,")
    attempts.loc[failed, 'num_correct'] = False
    attempts['num_incorrect'] = np.where(attempts.num_correct > 0, 0, 1)

I calculate the number of incorrect attempts with a groupby(). However, we need both the number of incorrect attempts and whether there was a correct answer, in separate columns. I chose to create these two columns separately and then merge them using an inner join; this is probably not best practice, since there must be a way to create two aggregate columns in a single step without creating two dataframes. **Possible solution:** agg() allows multiple aggregations in one call and might do the trick.

In [None]:
session_keys = ['game_session','installation_id','title']

def _sum_per_session(attempts, column):
    """Sum `column` over every (session, user, title) group, ordered by user."""
    totals = attempts.groupby(session_keys, as_index=False)[column].sum()
    return totals.sort_values(by=['installation_id'])

# train
g = _sum_per_session(assessments, 'num_correct')
h = _sum_per_session(assessments, 'num_incorrect')

# test
ggg = _sum_per_session(test_assessments, 'num_correct')
hhh = _sum_per_session(test_assessments, 'num_incorrect')

In [None]:
# make the summed counts plain integers
g = g.astype({'num_correct': int})
h = h.astype({'num_incorrect': int})

ggg = ggg.astype({'num_correct': int})
hhh = hhh.astype({'num_incorrect': int})

In [None]:
# inner join of correct / incorrect counts on the session id; the other shared
# columns come back twice with the default _x / _y suffixes
merged_inner = g.merge(h, on='game_session')

test_merged_inner = ggg.merge(hhh, on='game_session')

In [None]:
label_columns = ['game_session','installation_id_x','title_x','num_correct','num_incorrect']
plain_names = {'installation_id_x':'installation_id', 'title_x':'title'}

# silences pandas' chained-assignment warning for the rest of the notebook
pd.options.mode.chained_assignment = None

# keep the left-hand copies of the duplicated columns and strip the _x suffix
df1 = merged_inner[label_columns].rename(columns=plain_names)
df2 = test_merged_inner[label_columns].rename(columns=plain_names)

In [None]:
# accuracy = correct attempts / all attempts, computed column-wise
# (vectorised division instead of a Python-level apply over every row)
df1['accuracy'] = df1['num_correct'] / (df1['num_correct'] + df1['num_incorrect'])
df2['accuracy'] = df2['num_correct'] / (df2['num_correct'] + df2['num_incorrect'])

In [None]:
# define accuracy groups

def groupacc(row):
    """Map a session's attempt counts to its accuracy group.

    3: solved with no failed attempt, 2: solved after one failed attempt,
    1: solved after more than one failed attempt, 0: never solved.
    Any other combination yields the string 'Other'.
    """
    solved = row['num_correct']
    missed = row['num_incorrect']
    if solved == 0:
        return 0
    if solved == 1:
        if missed == 0:
            return 3
        if missed == 1:
            return 2
        if missed > 1:
            return 1
    return 'Other'

# label every session of both frames with its accuracy group
for labelled in (df1, df2):
    labelled['accuracy_group'] = labelled.apply(groupacc, axis=1)

At this point I want to check whether my ground truth matches the one provided. I compare the values below, and they are essentially the same apart from a few unimportant trailing floating-point differences.

In [None]:
# (rows, columns) of the labels rebuilt for the test set
df2.shape

In [None]:
# put both label tables in the same session order so they can be compared row by row
c = df1.sort_values('game_session')
k = train_labels.sort_values('game_session')


In [None]:
# element-wise equality between the rebuilt labels and the provided ones
comparison_array = (c.values == k.values)

if not comparison_array.all():
    print ("Not the same")

In [None]:
int(np.count_nonzero(~comparison_array))  # 46 differences due to floating point stuff

## Features 

$Y$ is the accuracy group for an assessment, e.g. this is what you are trying to predict.

$X$ will be a feature matrix for each such session collected by looking up data for that user, you only want to consider data 
up to the timestamp of the assessment you are training or predicting on.


There will also be features like "title" which tells you which one it is, this is not historical.

Let's pare down the train set to only the ids that actually took an assessment (i.e. started one, event code 2000) and also made an attempt (4100/4110), and do some more inspection, with comments detailing the steps below.

One issue is features that are NA for some rows (e.g. the accumulated previous accuracy group when there is no previous assessment); our approach will be to substitute the mean value for that assessment type.

**NOTE: YOU HAVE TO BUILD FEATURES FOR THE TEST SET AS WELL** 

below i do some encodings of some text titles to integers

In [None]:
# some encodings
list_of_assessment_titles = list(set(assessments['title'].unique()))
list_of_assessment_titles.sort()

In [None]:
# title -> integer code, numbered in alphabetical order of the titles
title_codes = np.arange(len(list_of_assessment_titles))
assessment_titles_map = {title: code for title, code in zip(list_of_assessment_titles, title_codes)}
assessment_titles_map

In [None]:
# Replace title strings with their integer codes.
# The isin() guard makes the cell safe to re-run: once a frame is encoded its
# titles are no longer keys of the map, so mapping again (which would turn
# every title into NaN) is skipped.
for frame in (assessments, train_labels):
    if frame['title'].isin(list(assessment_titles_map)).all():
        frame['title'] = frame['title'].map(assessment_titles_map)

In [None]:
# unique sessions of type assessment in the train set
attempted_sessions = assessments['game_session'].unique()
print(len(attempted_sessions))

# unique users who made at least one attempt on an assessment in train
train_reduced_users = assessments['installation_id'].unique()
print(len(train_reduced_users))

In [None]:
# keep only the events of users who attempted at least one assessment
has_attempt = train['installation_id'].isin(train_reduced_users)
train = train.loc[has_attempt]
train.shape

In [None]:
# installation_ids in the test set
test_users = test['installation_id'].unique()
print(len(test_users))
# ids shared between train and test (there are none)
len(set(test_users) & set(users))

At this point after inspection, it becomes clear that we need to predict the score for the last assessment in the test set. Also, despite some unclear language in the instructions, after some effort it became clear to me from inspection that **the last assessment (by timestamp) has been truncated for each user in the test set**. Some of the test set users have no previous assessments to learn from. 

To build features let's build some **accumulations**. We want accumulated accuracy groups on previous assessments. Possibly broken down for each assessment. Let's start there. Later we also want counts of time spent previously on various stuff and counts of events for previous stuff.

In [None]:
# Per-title means of the numeric label columns.
# numeric_only=True keeps the string columns (game_session, installation_id)
# out of the aggregation, which otherwise raises on pandas >= 2.0.
title_means = c.groupby('title', as_index=False).mean(numeric_only=True)
mean_group_accs = title_means['accuracy_group']
title_means

**remember to improve these means by adding rows from the test set or external data would help also!!**

In [None]:
# mean accuracy group per assessment title, in the order produced by the groupby above
list(mean_group_accs)

You need to combine the train data with the test data (except for last line)

In [None]:
# code to combine train and test datas

test_predict = test.loc[test['type'] == 'Assessment'].groupby('installation_id',as_index=False).last()

# you need to combine test_assessments so that it has those last rows basically do a union at this stage

test_final_assessments = pd.concat([test_assessments_for_union,test_predict],sort=False)
test_final_assessments = test_final_assessments.sort_values(by=['installation_id','timestamp'])
#test_final_assessments

In [None]:
def build_features(num,test_or_train):
    """Build feature rows for assessment sessions from each user's earlier history.

    Reads the notebook globals ``train``, ``test``, ``assessments``,
    ``test_final_assessments``, ``train_reduced_users``, ``test_users``,
    ``train_labels``, ``df2``, ``mean_group_accs`` and ``assessment_titles_map``.

    Parameters
    ----------
    num : int
        Number of users to process, taken from the front of
        ``train_reduced_users`` (train) or ``test_users`` (test).
    test_or_train : bool
        False builds training rows, one per assessment session of every user.
        True builds test rows, keeping only each user's final session.

    Returns
    -------
    list of list
        One row per kept session, laid out as: clip, activity, assessment and
        game session counts; title code; running mean accuracy group for each
        title; overall running mean; score; MAGMAPEAK, TREETOPCITY and
        CRYSTALCAVES event counts; events in any other world; total events.
        All counts cover only events before the session's first timestamp.
    """
    building_train = (test_or_train == False)   # same truthiness test as before

    prior_means = np.array(list(mean_group_accs))   # per-title starting values
    n_titles = len(prior_means)

    features = []   # X and Y together, one list per session

    for j in tqdm(range(num)):

        if building_train:
            user = train_reduced_users[j]
            events, session_source = train, assessments
        else:
            user = test_users[j]
            events, session_source = test, test_final_assessments

        # History narrowed to this user. Copy it so the timestamp conversion
        # never writes into a slice of the global frame, and convert once per
        # user instead of once per session.
        prev_hist = events.loc[events['installation_id'] == user].copy()
        prev_hist['timestamp'] = pd.to_datetime(prev_hist['timestamp'])

        # relies on session_source already being in time order
        user_sessions = session_source.loc[session_source['installation_id'] == user, 'game_session'].unique()

        counters = np.zeros(n_titles)                 # assessments seen so far, per title
        cumalitive_acc_groups = prior_means.copy()    # accuracy-group sums, seeded with the means

        # A count keeps its previous value when its key is missing from the
        # current history window, exactly as the original loops did.
        type_totals = {'Clip': 0, 'Activity': 0, 'Assessment': 0, 'Game': 0}
        world_totals = {'MAGMAPEAK': 0, 'TREETOPCITY': 0, 'CRYSTALCAVES': 0}

        for position, session in enumerate(user_sessions, start=1):
            is_last = position == len(user_sessions)

            session_rows = prev_hist.loc[prev_hist['game_session'] == session]
            the_time = session_rows['timestamp'].iloc[0]

            # everything strictly before the session started (may be empty)
            the_past = prev_hist.loc[prev_hist['timestamp'] < the_time]

            # Plain groupby(...).size() returns a Series on every pandas version;
            # the as_index=False variant returns a DataFrame since pandas 1.1 and
            # then rejects reset_index(name=...).
            events_by_world = the_past.groupby('world').size()
            sessions_by_type = the_past.groupby('game_session')['type'].last().value_counts()

            for kind in type_totals:
                type_totals[kind] = int(sessions_by_type.get(kind, type_totals[kind]))
            for world in world_totals:
                world_totals[world] = int(events_by_world.get(world, world_totals[world]))
            total_events = int(events_by_world.sum())

            if building_train:
                score = train_labels.loc[train_labels['game_session'] == session, 'accuracy_group'].iloc[0]
            elif not is_last:
                score = df2.loc[df2['game_session'] == session, 'accuracy_group'].iloc[0]
            else:
                score = 0   # placeholder: the final test session is the one to predict

            title = assessment_titles_map[session_rows['title'].iloc[0]]

            user_features = [
                type_totals['Clip'],
                type_totals['Activity'],
                type_totals['Assessment'],
                type_totals['Game'],
                title,
            ]
            # running means are taken BEFORE this session's score is added
            user_features.extend(cumalitive_acc_groups / (counters + 1.0))
            user_features.append(np.sum(cumalitive_acc_groups) / (np.sum(counters) + n_titles))
            user_features.append(score)
            user_features.extend([
                world_totals['MAGMAPEAK'],
                world_totals['TREETOPCITY'],
                world_totals['CRYSTALCAVES'],
                total_events - sum(world_totals.values()),
                total_events,
            ])

            # fold this session into the running totals for later sessions
            counters[title] += 1
            cumalitive_acc_groups[title] += score

            if building_train or is_last:
                features.append(user_features)

    return features

In [None]:
# one feature row per test user, rounded to two decimals
# (this rebinds `test`, replacing the raw event log)
test = pd.DataFrame(build_features(len(test_users),True)).round(2)

In [None]:
# Feature rows for every training user who attempted an assessment.
# The user count comes from the data instead of a hard-coded 3614, so no user
# is silently dropped and the loop cannot index past the end of the list.
# (this rebinds `train`, replacing the raw event log)
train = pd.DataFrame(build_features(len(train_reduced_users),False))
train = train.round(2)

At this point there is a lot of **optimization and removal of inefficiencies** (using Counter() etc.) that needs to be done to speed up the `build_features` function; for example, you can search through fewer events.

Also, a lot more features could be included, such as some **durations**. Much more could be done here, but for now we just want to get it working; we can come back to feature engineering later if we have time.

**another idea is to include the historical test data into train**


## Approach to modelling


Our approach to modelling is to work from simple models to complex models and to learn how to implement each model correctly, while building cross-validation into the pipeline as well. The idea for this competition is to learn how to build models, not necessarily to win it, though that would be nice.

1. Simple regression (or logistic regression)

2. Single decision tree

3. KNN classifier 

4. Naive Bayes

5. Neural Network (probably will not score great but you should still implement)

6. 

The idea is to start with a **simple model** and **submit** and see how you improve as you choose different models!!
For this purpose a common framework, wrapper, for models does make sense.
**categoricals** the title - column 4 is a categorical, as is the target variable column 11



## Model eval - quadratic weighted kappa (qwk)

In [None]:
def qwk(a1,a2,n_classes=4):
    """Quadratic weighted kappa between two integer label vectors.

    The confusion matrix is always n_classes x n_classes, so the score is
    defined even when some class is missing from both inputs. The previous
    version let the matrix shrink in that case and then failed against the
    4x4 weight matrix. The weights are exact: they are no longer rounded to
    three decimals.

    Parameters
    ----------
    a1, a2 : array-like of int
        Labels in ``range(n_classes)``, both of the same length.
    n_classes : int, default 4
        Number of ordinal classes.

    Returns
    -------
    float
        1 for perfect agreement and 0 for chance-level agreement; nan when
        the expected disagreement is zero (e.g. only one class is present).

    Raises
    ------
    ValueError
        If the inputs differ in length or hold labels outside ``range(n_classes)``.
    """
    first = np.asarray(a1).astype(int).ravel()
    second = np.asarray(a2).astype(int).ravel()
    if first.shape != second.shape:
        raise ValueError("a1 and a2 must have the same length")
    if first.size and (min(first.min(), second.min()) < 0
                       or max(first.max(), second.max()) >= n_classes):
        raise ValueError("labels must lie in range(n_classes)")

    # observed counts: rows follow a1, columns follow a2
    o = np.zeros((n_classes, n_classes))
    np.add.at(o, (first, second), 1)

    # quadratic penalty, scaled so the largest disagreement weighs 1
    grades = np.arange(n_classes)
    w = (grades[:, None] - grades[None, :]) ** 2 / float((n_classes - 1) ** 2)

    # expected counts if the two label sets were independent
    e = np.outer(o.sum(axis=1), o.sum(axis=0))

    e = e/np.sum(e)
    o = o/np.sum(o)

    return 1 - np.sum(o*w)/np.sum(e*w)


In [None]:
# preprocessing 

y_train = train[11]   # column 11 holds the accuracy group (target)
y_test = test[11]

categorical = 4   # column 4 holds the assessment title code

# One shared category list for both frames: train and test then always get the
# same dummy columns in the same order, even if a title is missing from one of them.
title_dtype = pd.CategoricalDtype(categories=sorted(train[categorical].unique()))

def _one_hot_title(features):
    """Replace the title column by dummy columns (first category dropped)."""
    dummies = pd.get_dummies(features[categorical].astype(title_dtype), drop_first=True, prefix='g')
    return pd.concat([features.drop(columns=[categorical]), dummies], axis=1)

X_train = _one_hot_title(train.drop(11, axis=1))
X_test = _one_hot_title(test.drop(11, axis=1))


## KNN model

This is probably the simplest model to implement, unfortunately, it maxes out at a qwk of about 0.5 which is not competitive, but it is a good starting point. It's also very important to do **feature transformation (scaling, demean etc.)** for this type of model

In [None]:

# with feature scaling

# feature scaling is VERY important for KNN

# standardise the features (KNN is distance based, so scale matters)
X_scaled = preprocessing.scale(X_train)

# one entry per candidate k
nns, mean_scores, stds, qwks_mean = [], [], [], []

for nn in range(2,50,5):
    cv = KFold(n_splits=5)
    neigh = KNeighborsClassifier(n_neighbors = nn)

    scores, qwks = [], []

    # X_scaled is a numpy array and is indexed directly; y_train is a Series and needs .iloc
    for train_index, test_index in cv.split(X_scaled,y_train):
        X_train_cv = X_scaled[train_index]
        X_test_cv = X_scaled[test_index]
        y_train_cv = y_train.iloc[train_index]
        y_test_cv = y_train.iloc[test_index]

        neigh.fit(X_train_cv, y_train_cv)
        scores.append(neigh.score(X_test_cv, y_test_cv))   # fold accuracy
        y_pred = neigh.predict(X_test_cv)
        qwks.append(qwk(y_pred,y_test_cv))                 # fold kappa

    nns.append(nn)
    qwks_mean.append(np.mean(qwks))
    stds.append(np.std(scores))
    mean_scores.append(np.mean(scores))

In [None]:
# cross-validated accuracy against k
fig0, ax0 = plt.subplots();
ax0.plot(nns,mean_scores);
ax0.set_xlabel("k");ax0.set_ylabel("cv-mean-accuracy");

# cross-validated quadratic weighted kappa against k (axes were unlabeled before)
fig1, ax1 = plt.subplots();
ax1.plot(nns,qwks_mean);
ax1.set_xlabel("k");ax1.set_ylabel("cv-mean-qwk");

It's interesting to note that qwk actually reaches a maximum early and then decreases, whereas accuracy keeps increasing with $k$.
This suggests that we can expect a qwk of about 0.48 with this solution, so let's just submit and see if that's the case. Keep working; we want to do a submission just to get the practice.

In [None]:
neigh = KNeighborsClassifier(n_neighbors = 20)

neigh.fit(X_scaled, y_train)

# The test features must be standardised with the TRAINING mean and standard
# deviation. Scaling the test set by its own statistics puts it in a different
# space from the one the neighbours were fitted in.
# Plain arrays are passed so the mixed int/str column labels are not checked.
scaler = preprocessing.StandardScaler().fit(X_train.to_numpy(dtype=float))

X_test_scaled = scaler.transform(X_test.to_numpy(dtype=float))

y_pred = neigh.predict(X_test_scaled)

In [None]:
#y_pred
# one predicted accuracy group per test user, in the order the features were built
sample_submission = pd.DataFrame({'installation_id': test_users, 'accuracy_group': y_pred})
sample_submission.to_csv('submission.csv', index=False)


In [None]:
# number of test users, i.e. the number of rows expected in the submission
len(test_users)