In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the raw event logs. Both files are read from the same absolute input
# directory (the one used for the sample submission as well) instead of mixing
# a relative and an absolute path.
INPUT_DIR = "/kaggle/input/data-science-bowl-2019"
train = pd.read_csv(INPUT_DIR + "/train.csv")
test = pd.read_csv(INPUT_DIR + "/test.csv")

In [None]:
train.shape

In [None]:
# If quick set to true, only uses 5% of training data. 
quick = False

In [None]:
# When `quick` is enabled, keep only a random ~5% of the installation_ids so the
# rest of the notebook runs faster while iterating.

import random

if quick:
    # All distinct installation_ids present in the training events
    all_ids = train['installation_id'].unique()

    # Sample size: ~5% of the installation_ids (one in twenty)
    sample_size = all_ids.size // 20

    # Draw the ids to KEEP from a dedicated, seeded generator so the same subset
    # is selected on every Restart & Run All.
    kept_ids = random.Random(0).sample(list(all_ids), sample_size)
    print(len(kept_ids))

    # Keep only the training events that belong to the sampled installation_ids
    train = train[train['installation_id'].isin(kept_ids)]

In [None]:
train.shape

In [None]:
# Parse the timestamp column of both frames so events can be ordered chronologically.
for events in (train, test):
    events['timestamp'] = pd.to_datetime(events['timestamp'])

In [None]:
# Keep the installation_id column of each frame on its own, so train and test rows
# can be told apart once their features live in a single DataFrame.
train_ids = train['installation_id']
test_ids = test['installation_id']

## Data Cleanup

### Target Variable
For this competition, we are asked to predict each child's performance on the last assessment they take. It appears we are not given this information directly in the training data, so we'll need to extract the target variable ourselves. We do this by extracting all the assessment completion events (event code 4100, or 4110 for Bird Measurer). We sort these events by timestamp to grab the last one for each user. We then pull all completion attempts in that game session to get the full picture of how the child performed on their last assessment. With this, we can assign each installation_id the target variable that we'll need to train against.

In [None]:
# Bird Measurer reports its assessment attempts under event code 4110 (the other
# assessments use 4100), so its attempt events are collected separately.
is_bird_measurer = train['title'] == "Bird Measurer (Assessment)"
is_bird_attempt = train['event_code'] == 4110
bird_measure_assess = train[is_bird_measurer & is_bird_attempt]

In [None]:
# Assessment attempts for every title other than Bird Measurer: event code 4100,
# restricted to type 'Assessment' because some non-assessment activities also emit 4100.
other_attempts = train[
    (train['title'] != "Bird Measurer (Assessment)")
    & (train['event_code'] == 4100)
    & (train['type'] == 'Assessment')
]

# Combine with the Bird Measurer attempts to get a log of all assessment attempts.
# pd.concat is used because DataFrame.append is deprecated and removed in pandas 2.0.
train_assess = pd.concat([other_attempts, bird_measure_assess])

In [None]:
# Chronologically last assessment attempt of every installation_id
last_assessments = (
    train_assess
    .sort_values(by="timestamp")
    .drop_duplicates(subset=["installation_id"], keep="last")
)

# Game sessions in which those last attempts happened
last_game_sessions = last_assessments['game_session'].unique()

# Every attempt event recorded inside one of those final game sessions
in_last_session = train_assess['game_session'].isin(last_game_sessions)
last_assessment_sessions = train_assess[in_last_session]

In [None]:
# For the test data, locate the chronologically last Assessment event of each
# installation_id; its game session identifies the assessment to be predicted.
test_assess = test[test['type'] == 'Assessment']

last_test_assess = (
    test_assess
    .sort_values(by='timestamp')
    .drop_duplicates(subset=['installation_id'], keep='last')
)

In [None]:
last_test_assess.head()

In [None]:
# Flag every attempt as a Pass or a Fail by looking for the literal JSON fragment in
# event_data. A vectorised, non-regex substring test replaces the row-wise apply, and
# .assign builds a new frame instead of writing into a filtered slice
# (which triggers SettingWithCopyWarning).
event_data = last_assessment_sessions['event_data']
last_assessment_sessions = last_assessment_sessions.assign(
    Pass=event_data.str.contains('"correct":true', regex=False),
    Fail=event_data.str.contains('"correct":false', regex=False),
)

# Count passes and fails per installation_id over the last assessment session
pass_fail_log = last_assessment_sessions.pivot_table(['Pass', 'Fail'], index='installation_id', aggfunc='sum')

In [None]:
# Accuracy-group rules as described in the Data tab of the competition.
def accuracy_group_calculator(row):
    """Map a row's pass/fail counts to its accuracy group.

    Parameters
    ----------
    row : object exposing ``Pass`` and ``Fail`` attributes
        ``Pass`` is the number of correct attempts and ``Fail`` the number of
        incorrect attempts.

    Returns
    -------
    float
        0.0 when there is no pass, 3.0 when there is a pass and no fail,
        2.0 when there is exactly one fail, and 1.0 otherwise.
    """
    # Never solved
    if row.Pass == 0:
        return 0.0

    failed_attempts = row.Fail

    # Solved on the first attempt
    if failed_attempts == 0:
        return 3.0

    # Solved on the second attempt
    if failed_attempts == 1:
        return 2.0

    # Solved after three or more attempts
    return 1.0

In [None]:
# Derive the target variable: one accuracy group per installation_id
pass_fail_log['Accuracy_Group'] = pass_fail_log.apply(accuracy_group_calculator, axis=1)

# Only the target is needed from here on; the raw counts are dropped
train_targets = pass_fail_log.drop(columns=['Fail', 'Pass'])

train_targets.head()

## Statistical EDA
Now that we have our target variable extracted from the training data, we can start exploring the data for features that might help us predict our target variable. Because the last assessment taken by each user is not always the same, let's see if there is a difference in performance based on which assessment was taken last. 

In [None]:
# Combine the training and test events so the same features can be built on both.
# pd.concat is used because DataFrame.append is deprecated and removed in pandas 2.0.
data = pd.concat([train, test])

# Seed the features frame with one row per installation_id: the ids that have a
# training target followed by every id present in the test data. The frame
# temporarily carries the ids both as its index and as a column labelled 0.
feature_ids = np.concatenate([train_targets.index.values, test['installation_id'].unique()])
features = pd.DataFrame(feature_ids, index=feature_ids)
features.index.name = 'installation_id'

In [None]:
# Drop the helper Pass/Fail columns without mutating in place; errors='ignore'
# keeps the cell re-runnable once the columns are already gone.
last_assessment_sessions = last_assessment_sessions.drop(columns=['Pass', 'Fail'], errors='ignore')

# Stack the train last-session events with the last test assessment events.
# pd.concat is used because DataFrame.append is deprecated and removed in pandas 2.0.
last_assessments = pd.concat([last_assessment_sessions, last_test_assess])

In [None]:
# Distinct game sessions of the last assessments, now covering both train and test rows.
# These sessions are excluded from the prior-assessment history further down so the
# features do not leak the target.
last_game_sessions = last_assessments.game_session.unique()

In [None]:
# Attach the game_session, title and world of each installation's last assessment.
# A final session can contain several attempt rows, so only the first merged row
# per installation_id is kept.
last_assess_info = last_assessments[['installation_id', 'game_session', 'title', 'world']]
features = (
    pd.merge(features, last_assess_info, on='installation_id', how='left')
    .drop_duplicates(subset=['installation_id'], keep="first")
)

In [None]:
train_targets['Accuracy_Group'].value_counts()

In [None]:
train_targets['Accuracy_Group'].value_counts().sort_index()

In [None]:
train_targets.head()

In [None]:
train_targets.shape

In [None]:
features.head()

In [None]:
# Prefix the merged columns so it is clear they describe the last assessment
features = features.rename(columns={
    'game_session': 'last_assess_game_session',
    'title': 'last_assess_title',
    'world': 'last_assess_world',
})

In [None]:
# Remove the placeholder column labelled 0 that was created when the frame was seeded
# with the ids. errors='ignore' keeps the cell re-runnable once the column is gone.
features = features.drop(columns=[0], errors='ignore')

In [None]:
# Attach the last assessment's game_session, title and world to the training targets.
last_assess_columns = features[
    ['installation_id', 'last_assess_game_session', 'last_assess_title', 'last_assess_world']
]
train_targets = pd.merge(train_targets, last_assess_columns, on='installation_id', how='left')

In [None]:
# Distinct last-assessment titles and worlds found among the training targets
assessments = train_targets['last_assess_title'].unique()
worlds = train_targets['last_assess_world'].unique()

In [None]:
worlds

In [None]:
assessments

In [None]:
import matplotlib.pyplot as plt

# One panel per entry: (panel title, world to filter on). None means "no filter".
# A data-driven loop replaces the four copy-pasted `if i == ...` branches.
world_panels = [
    ("All Worlds", None),
    ("MAGMAPEAK", 'MAGMAPEAK'),
    ("TREETOPCITY", 'TREETOPCITY'),
    ("CRYSTALCAVES", 'CRYSTALCAVES'),
]

for position, (panel_title, world_name) in enumerate(world_panels, start=1):
    if world_name is None:
        subset = train_targets
    else:
        subset = train_targets[train_targets['last_assess_world'] == world_name]

    # Number of installations in each accuracy group, ordered by group
    counts = subset['Accuracy_Group'].value_counts().sort_index()

    plt.subplot(2, 2, position).set_title(panel_title)
    plt.bar(counts.index, counts)

# Layout only needs to be adjusted once, after all panels exist
plt.tight_layout()

We see immediately that the performance on the final assessment varies somewhat depending on the world of the assessment taken. Regardless of the world of the last assessment, the children were likely to pass on the first attempt. We do see a difference in the share of children who never passed the last assessment. MAGMAPEAK has very few children who were unable to pass the last assessment, whereas a significant chunk of the CRYSTALCAVES group did not pass. 

There are multiple hypotheses that could support this data. Perhaps the MAGMAPEAK assessments are the easiest and the CRYSTALCAVES assessments are the most difficult. One thing to keep in mind is that there are 5 assessments and only 3 worlds. This means that some worlds have multiple assessments. Let's see how the assessments are split among the worlds. 

In [None]:
train_targets.pivot_table(['last_assess_game_session'], index='last_assess_title', columns = ['last_assess_world'], aggfunc = 'count')

A few facts are apparent based on this breakdown. First, we see that the assessments are split among the worlds as follows:

CRYSTALCAVES : Cart Balancer, Chest Sorter

MAGMAPEAK : Cauldron Filler

TREETOPCITY : Bird Measurer, Mushroom Sorter

We see that within each of the worlds, there's a fairly equal split between which assessment was the last taken. 

In [None]:
# First panel shows all installations; each remaining panel is restricted to the
# installations whose last assessment has that title. A data-driven loop replaces
# the six copy-pasted `if i == ...` branches.
panel_titles = [
    "All Assessments",
    'Mushroom Sorter (Assessment)',
    'Bird Measurer (Assessment)',
    'Cauldron Filler (Assessment)',
    'Cart Balancer (Assessment)',
    'Chest Sorter (Assessment)',
]

for position, panel_title in enumerate(panel_titles, start=1):
    if position == 1:
        subset = train_targets
    else:
        subset = train_targets[train_targets['last_assess_title'] == panel_title]

    # Number of installations in each accuracy group, ordered by group
    counts = subset['Accuracy_Group'].value_counts().sort_index()

    plt.subplot(3, 2, position).set_title(panel_title)
    plt.bar(counts.index, counts)

# Layout only needs to be adjusted once, after all panels exist
plt.tight_layout()

Here we see the performance on the assessments. These are shown in the order that the game was designed to be followed. The first world is TREETOPCITY, then MAGMAPEAK, and then CRYSTALCAVES. However, the player is not locked into any particular track. We do see, however, that the performance on the first assessment in a world is better than the performance on the last assessment in that world. For example, in TREETOPCITY, the largest Accuracy Group for Mushroom Sorter is 3, whereas the largest Accuracy Groups for Bird Measurer are 1 and 0. 

### Complete Assessment History
As we're seeing that not every assessment has the same performance outcome, it seems worthwhile to note the Accuracy Group for every installation_id on every assessment attempted prior to the last assessment. That way we can see whether past performance is an indicator of performance on the last assessment attempted. 

In [None]:
# Bird Measurer reports its assessment attempts under event code 4110 (the other
# assessments use 4100), so its attempt events are collected separately. This time
# the combined train + test events are used.
is_bird_measurer = data['title'] == "Bird Measurer (Assessment)"
is_bird_attempt = data['event_code'] == 4110
bird_measure_assess = data[is_bird_measurer & is_bird_attempt]

In [None]:
# Assessment attempts for every title other than Bird Measurer: event code 4100,
# restricted to type 'Assessment' because some non-assessment activities also emit 4100.
other_attempts = data[
    (data['title'] != "Bird Measurer (Assessment)")
    & (data['event_code'] == 4100)
    & (data['type'] == 'Assessment')
]

# Combine with the Bird Measurer attempts to get a log of all assessment attempts.
# pd.concat is used because DataFrame.append is deprecated and removed in pandas 2.0.
prior_assessments = pd.concat([other_attempts, bird_measure_assess])

In [None]:
# Exclude every attempt that belongs to a child's last assessment session, so the
# history features cannot leak information about the target variable.
in_last_session = prior_assessments['game_session'].isin(last_game_sessions)
prior_assessments = prior_assessments[~in_last_session]

In [None]:
prior_assessments.shape

In [None]:
# Flag every prior attempt as a Pass or a Fail so each user's performance history
# can be counted. A vectorised, non-regex substring test replaces the row-wise apply,
# and .assign builds a new frame instead of writing into a filtered slice
# (which triggers SettingWithCopyWarning).
event_data = prior_assessments['event_data']
prior_assessments = prior_assessments.assign(
    Pass=event_data.str.contains('"correct":true', regex=False),
    Fail=event_data.str.contains('"correct":false', regex=False),
)

In [None]:
prior_assessments.head()

In [None]:
# Create a pivot table that sums the Pass and Fail flags of the prior assessment attempts
# for each (installation_id, title) pair
prior_pass_fail_log = prior_assessments.pivot_table(['Pass', 'Fail'], index=['installation_id','title',], aggfunc = 'sum')

In [None]:
prior_pass_fail_log.head(10)

We now have a history of how each user in the data set performed historically on each of the assessments they completed. Another potentially useful feature is to look at how many assessments a user started but did not complete. I will do that after this section. 

In [None]:
titles = prior_assessments.title.unique()

In [None]:
# For each assessment title, pull every user's Pass/Fail history for that title into
# the features frame as <title>_Fail / <title>_Pass columns.
# Rows are selected by an exact match on the 'title' index level; filter(like=...)
# does substring matching against the string form of each (installation_id, title) tuple.
title_level = prior_pass_fail_log.index.get_level_values('title')

for title in titles:
    title_history = (
        prior_pass_fail_log[title_level == title]
        .reset_index(level='title', drop=True)   # leave installation_id as the only index level
        .rename(columns={'Fail': title + "_Fail", 'Pass': title + "_Pass"})
    )
    features = pd.merge(features, title_history, on="installation_id", how='left')

In [None]:
# Users without any attempt history for an assessment get 0 for both pass and fail
features = features.fillna(0)

## Assessments Started Not Completed
It might be interesting to know if there is a significant number of assessments that users started but did not complete. These might suggest that the assessment was too difficult and the player simply gave up. 

In [None]:
assessments = data[data['type'] == 'Assessment']

In [None]:
assessments.title.unique()

In [None]:
# Bird Measurer uses a distinct attempt event code (4110 rather than 4100), so it is
# separated from the other assessments.
bird_measure_assess = data[data['title'] == "Bird Measurer (Assessment)"]

measure_assess = data[data['title'] != "Bird Measurer (Assessment)"]
measure_assess = measure_assess[measure_assess.type == 'Assessment']

# Game sessions in which an assessment was started (event code 2000)
game_sessions_start = np.concatenate([
    bird_measure_assess[bird_measure_assess['event_code'] == 2000].game_session.unique(),
    measure_assess[measure_assess['event_code'] == 2000].game_session.unique(),
])

# Game sessions that contain an attempt event (4110 for Bird Measurer, 4100 otherwise);
# these are treated as completed
game_sessions_complete = np.concatenate([
    bird_measure_assess[bird_measure_assess.event_code == 4110].game_session.unique(),
    measure_assess[measure_assess['event_code'] == 4100].game_session.unique(),
])

# Game sessions where an assessment was started but never completed.
# A set gives constant-time membership tests; `gs not in <ndarray>` rescans the
# whole array for every started session.
completed_sessions = set(game_sessions_complete)
incomplete_gs = [gs for gs in game_sessions_start if gs not in completed_sessions]

In [None]:
# Start events (code 2000) of the game sessions whose assessment was never completed
in_incomplete_session = data['game_session'].isin(incomplete_gs)
is_start_event = data['event_code'] == 2000
incomplete_sessions = data[in_incomplete_session & is_start_event]

In [None]:
incomplete_sessions.head()

In [None]:
incomplete_sessions_table = incomplete_sessions.pivot_table(['event_count'],columns=incomplete_sessions.title, index='installation_id', aggfunc = 'count', fill_value = 0)

In [None]:
# Name each column "<title>_incomplete_attempts". When the pivot produced tuple
# column labels, only the last element is used; str() on the whole tuple would put
# parentheses and quotes into the column name.
incomplete_sessions_table.columns = [
    (col[-1] if isinstance(col, tuple) else str(col)) + '_incomplete_attempts'
    for col in incomplete_sessions_table.columns
]

In [None]:
incomplete_sessions_table.tail()

In [None]:
# Add the per-title incomplete-attempt counts to the features
features = features.merge(incomplete_sessions_table, on='installation_id', how='left')
features.head()

### Results
Upon adding the features created in this section, our performance on the test data actually worsened. We dropped from a score of .399 to .361. For now I will keep these features. 

## Feature Idea: Time Spent in Each Title
Now I will check to see if there is a correlation between the time spent on each activity and the performance of the child on their last assessment. To do this, I will take the game time of the last event in each game session as a proxy for approximately how much time was spent on the activity during that session. These are then summed up for each installation_id; note that the code below groups them by world rather than by individual title.

In [None]:
# Keep only the chronologically last event of every game_session
# (sort by timestamp, then keep the last row per game_session)
last_event_per_game_session = data.sort_values(by="timestamp").drop_duplicates(subset=["game_session"], keep="last")

In [None]:
last_event_per_game_session1 = last_event_per_game_session.copy()

In [None]:
# Exclude assessment sessions so that only non-assessment sessions contribute to the time totals
last_event_per_game_session = last_event_per_game_session[last_event_per_game_session.type != "Assessment"]

In [None]:
# Sum the final game_time of the remaining sessions per (installation_id, world)
time_per_activity = last_event_per_game_session.groupby(['installation_id','world'])[['game_time']].sum()

In [None]:
time_per_activity

In [None]:
worlds = data.world.unique()

In [None]:
worlds.shape

In [None]:
features1 = features.copy()

In [None]:
# For each world, pull every user's summed game time in that world into the features
# frame as a <world>_game_time column.
# Rows are selected by an exact match on the 'world' index level; filter(like=...)
# does substring matching against the string form of each (installation_id, world) tuple.
world_level = time_per_activity.index.get_level_values('world')

for world in worlds:
    world_time = (
        time_per_activity[world_level == world]
        .reset_index(level='world', drop=True)   # leave installation_id as the only index level
        .rename(columns={'game_time': world + "_game_time"})
    )
    features1 = pd.merge(features1, world_time, on="installation_id", how='left')

In [None]:
features1

In [None]:
features1.fillna(value = 0, inplace=True)

In [None]:
# Columns holding a single distinct value carry no information for the model
columns_to_drop = [
    column for column in features1.columns
    if features1[column].unique().size == 1
]

In [None]:
columns_to_drop

In [None]:
# Remove the constant columns identified above
features1 = features1.drop(columns=columns_to_drop)

In [None]:
features = features1.copy()

## Preparing and Training Model

In [None]:
features.head()

In [None]:
# One-hot encode the title of the last assessment (first level dropped) and append the indicator columns
last_title_dummies = pd.get_dummies(features['last_assess_title'], prefix='last_title', drop_first=True)
features = pd.concat([features, last_title_dummies], axis=1)

In [None]:
# The descriptive last-assessment columns are not fed to the model
features = features.drop(columns=["last_assess_game_session", "last_assess_title", "last_assess_world"])

In [None]:
train_targets.head()

In [None]:
# Keep only the id and the target. Selecting by name does not depend on the column
# order that the earlier merge happened to produce.
train_targets = train_targets[['installation_id', 'Accuracy_Group']]

In [None]:
# Attach the engineered feature columns to the training targets, matching rows on installation_id.
train_targets = pd.merge(train_targets, features, on='installation_id', how='left')

In [None]:
sample_sub = pd.read_csv("/kaggle/input/data-science-bowl-2019/sample_submission.csv")

In [None]:
# Attach the engineered feature columns to every installation_id listed in the sample submission.
test_X = pd.merge(sample_sub, features, on='installation_id', how='left')

In [None]:
test_X

In [None]:
# Model inputs: everything except the id and the target, selected by name rather than by position
train_X = train_targets.drop(columns=['installation_id', 'Accuracy_Group'])

In [None]:
train_X.head()

In [None]:
# Model target, selected by name rather than by position
train_y = train_targets['Accuracy_Group']

In [None]:
train_y.head()

In [None]:
# Test inputs: everything except the id and the placeholder accuracy_group column
# that came from the sample submission, selected by name rather than by position
test_X_input = test_X.drop(columns=['installation_id', 'accuracy_group'])

In [None]:
test_X_input.head()

In [None]:
# Replace any remaining missing values with 0. Rebinding avoids an in-place fill on
# frames derived from other frames, and the axis argument is unnecessary for a scalar fill.
train_X = train_X.fillna(0)
test_X_input = test_X_input.fillna(0)

In [None]:
from sklearn import linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Feature matrix (X) and response vector (y)
X = train_X
y = train_y

# Hold out 20% of the training installations for validation; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)

# Fit a multinomial Naive Bayes classifier on the training split
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict the held-out split
y_pred = nb.predict(X_test)

# Compare actual (y_test) with predicted (y_pred) values.
# The label names the model that is actually fitted above.
print("Multinomial Naive Bayes model accuracy(in %):",
      metrics.accuracy_score(y_test, y_pred)*100)

In [None]:
# Remove the placeholder predictions from the sample submission. errors='ignore'
# keeps the cell re-runnable once the column is already gone.
sample_sub = sample_sub.drop(columns=['accuracy_group'], errors='ignore')

In [None]:
# Predict the accuracy group for every row of the test inputs with the fitted Naive Bayes model
test_y = nb.predict(test_X_input)

In [None]:
test_y_df = pd.DataFrame(test_y)

In [None]:
# Give the single prediction column the name expected in the submission file
test_y_df = test_y_df.rename(columns={0: 'accuracy_group'})

In [None]:
test_y_df.head()

In [None]:
sample_sub

In [None]:
# Pair each installation_id with its prediction by joining on the row index of both frames.
# NOTE(review): assumes sample_sub and test_y_df have matching indexes in the same row order — confirm.
output = pd.merge(sample_sub, test_y_df, left_index=True, right_index=True)

In [None]:
output

In [None]:
output['accuracy_group'] = output['accuracy_group'].astype('int32')

In [None]:
output.to_csv("submission.csv", index=False)