# Baseline and Oracle Models

In [None]:
# Import cell, add all imports here

import pandas as pd
import numpy as np
import boto3

In [None]:
# Load the click-metric CSV from S3 into a DataFrame
s3client = boto3.client('s3')
s3Location = {'Bucket': 'click-metric-analytics', 'Key': 'unlabeled.csv'}
response = s3client.get_object(**s3Location)

# pandas parses the CSV directly from the body of the S3 response
csvBody = response['Body']
metricdf = pd.read_csv(csvBody)

## Baseline

For the baseline model we implement a simple rule-based approach. Users who meet one or more of the following rules are classified as frustrated:

1. User performs a 'cancel' action more than once in a given session.
2. User attempts to perform the same action multiple times consecutively. 

In [None]:
# Baseline
#
# Labels every session of the known (frustrated or content) accounts with a
# boolean 'frustrated' flag derived from two rules:
#   1. a 'cancel' metric appears at least twice in the session, or
#   2. the same metric appears on two consecutive rows of the session.

# Start by assuming no users are frustrated
metricdf['frustrated'] = 0

frustratedUsers = ['account_1', 'account_2 >'] # removed to maintain anonymity
contentUsers = ['account_1', 'account_2 >'] # removed to maintain anonymity

# Take an explicit copy so the labels are written to an independent frame
# instead of a slice of metricdf (avoids pandas' SettingWithCopyWarning).
baselinedf = metricdf[metricdf.account_id.isin(frustratedUsers) | metricdf.account_id.isin(contentUsers)].copy()

# The rules produce booleans, so keep the label column boolean throughout.
baselinedf['frustrated'] = False
uniqueSessions = baselinedf.session_id.unique()

for session in uniqueSessions:
    sessionMask = baselinedf.session_id == session
    sessiondf = baselinedf[sessionMask]

    # Rule 1: perform a cancel action more than once in a given session.
    # case=False gives a case-insensitive match without needing the `re`
    # module; na=False counts missing metrics as non-matches.
    cancelCount = sessiondf.metric.str.contains(r'cancel', case=False, na=False).sum()

    # Rule 2: attempt to perform the same action multiple times consecutively.
    # Only this session's metrics are compared, in the order the rows appear.
    # NOTE(review): assumes rows within a session are in chronological order - confirm.
    metrics = sessiondf['metric'].tolist()
    repeatedAction = any(current == following for current, following in zip(metrics, metrics[1:]))

    baselinedf.loc[sessionMask, 'frustrated'] = bool(cancelCount >= 2 or repeatedAction)

## Oracle

To define the oracle, we make use of information that is not generally available. We have access to customer feedback for a small subset of customers. We can match the customers who provided positive or negative feedback to their respective clickstreams, and label their sessions according to that feedback.

In [None]:
# Oracle
#
# Labels each row of the known (frustrated or content) accounts according to
# which feedback list the account belongs to.

# Start by assuming no users are frustrated
metricdf['frustrated'] = 0

frustratedUsers = ['account_1', 'account_2 >'] # removed to maintain anonymity
contentUsers = ['account_1', 'account_2 >'] # removed to maintain anonymity

# Take an explicit copy so the label column is assigned on an independent
# frame instead of a slice of metricdf (avoids pandas' SettingWithCopyWarning).
oracledf = metricdf[metricdf.account_id.isin(frustratedUsers) | metricdf.account_id.isin(contentUsers)].copy()

# True for accounts in frustratedUsers, False otherwise; an account present in
# both lists is therefore labelled frustrated.
oracledf['frustrated'] = oracledf.account_id.isin(frustratedUsers)