## What this notebook does:
- Map `installation_id` in `sample_submission.csv` to the corresponding assessment.
- Compute the average accuracy of each assessment using `train_labels.csv`.
- Estimate `accuracy_group` from the average accuracy.

In [None]:
import pandas as pd

In [None]:
# IPython magic: long-format listing (human-readable sizes) of the input directory.
%ls -lh ../input/data-science-bowl-2019/

In [None]:
# Load the test-set table; later cells rely on its installation_id,
# timestamp and title columns.
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
# Preview the first rows (notebook displays the last expression of a cell).
test.head()

In [None]:
# Keep the chronologically last event ROW of every installation.
#
# GroupBy.last() is deliberately not used: it returns the last non-null value
# of each column independently, so a missing value in an installation's final
# row would be silently replaced by a value from an earlier event.
# tail(1) keeps the final row exactly as it is.
last_event = (
    test.sort_values(['installation_id', 'timestamp'])
    .groupby('installation_id')
    .tail(1)
    # Move installation_id to the first column and rebuild a 0..n-1 index,
    # which is the layout groupby(...).last().reset_index() used to produce.
    .set_index('installation_id')
    .reset_index()
)
last_event.head()

In [None]:
# Boolean mask: True where the title of an installation's last event
# contains the substring 'Assessment'.
ends_with_assessment = last_event['title'].str.contains('Assessment')
# Show the installations (if any) whose last event is not an assessment.
last_event.loc[~ends_with_assessment]

In [None]:
# Read the sample submission, then attach to each installation the title of
# its last event (default inner join on installation_id).
sbm_sample = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')
sbm_sample = pd.merge(
    left=sbm_sample,
    right=last_event[['installation_id', 'title']],
    on='installation_id',
)
sbm_sample

In [None]:
# Load the training labels; the per-title accuracy computed further down is
# derived from this table's title, num_correct and num_incorrect columns.
labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
# Preview the first rows.
labels.head()

In [None]:
def predict(accuracy):
    """Convert an accuracy value into an accuracy group in {0, 1, 2, 3}.

    The group is chosen by strict lower bounds, checked from the highest
    group down:

    * ``accuracy > 0.5``  -> 3
    * ``accuracy > 0.4``  -> 2
    * ``accuracy > 0.13`` -> 1
    * anything else (including NaN, for which every comparison is
      False) -> 0

    Args:
        accuracy: Numeric accuracy value to classify.

    Returns:
        The integer accuracy group.
    """
    # (exclusive lower bound, group) pairs, ordered from highest to lowest
    # so the first bound that is exceeded determines the group.
    cutoffs = ((0.5, 3), (0.4, 2), (0.13, 1))
    for lower_bound, group in cutoffs:
        if accuracy > lower_bound:
            return group
    return 0

In [None]:
# Total correct / incorrect attempts per assessment title.
# The two count columns are selected BEFORE summing so that no other column
# of `labels` is aggregated; summing everything and discarding the rest does
# unnecessary work and depends on how non-numeric columns happen to be
# handled by the installed pandas version.
agg = labels.groupby('title')[['num_correct', 'num_incorrect']].sum().reset_index()
# Pooled accuracy of each title: correct attempts over all attempts.
agg['accuracy'] = agg['num_correct'] / (agg['num_incorrect'] + agg['num_correct'])
# Bucket the pooled accuracy into an accuracy group with predict().
agg['accuracy_group'] = agg['accuracy'].map(predict)
agg

In [None]:
# Discard the accuracy_group column that came with the sample submission and
# take accuracy_group from the per-title table instead (inner join on title).
sbm = pd.merge(
    left=sbm_sample.drop(columns='accuracy_group'),
    right=agg,
    on='title',
)
sbm

In [None]:
# Write only the two submission columns, without the DataFrame index.
sbm.to_csv(
    'submission.csv',
    columns=['installation_id', 'accuracy_group'],
    index=False,
)