# Data Extraction
Correlate the labels from customer feedback with sessions in the unlabeled clickstream dataset.

In [None]:
# Import cell, add all imports here

import pandas as pd
import numpy as np
import boto3
import os

In [None]:
# Load the customer feedback CSV from S3 into a dataframe.
# The S3 client created here is reused by the later clickstream download.

s3client = boto3.client('s3')
feedback_location = {'Bucket': 'click-metric-analytics', 'Key': 'feedback.csv'}
response = s3client.get_object(**feedback_location)
feedbackdf = pd.read_csv(response['Body'])

In [None]:
# Load the unlabeled clickstream CSV from the same S3 bucket into a dataframe

response = s3client.get_object(
    Bucket='click-metric-analytics',
    Key='unlabeled.csv',
)
clickstream_body = response['Body']
metricsdf = pd.read_csv(clickstream_body)

In [None]:
# Translate the customer rating held in the 'frustrated' column into boolean
# frustrated / not frustrated labels, overwriting the column in place.
# Customer rating: neutral -> not frustrated (False)
# Customer rating: positive -> not frustrated (False)
# Customer rating: negative -> frustrated (True)
# Values that match none of the patterns (including missing values) are left unchanged.
# NOTE(review): each pattern requires a newline right after the rating word —
# presumably the raw feedback values carry one; confirm against feedback.csv.

# Compute every match mask from the original string column BEFORE assigning anything:
# once strings have been overwritten with booleans, the `.str` accessor can raise
# AttributeError if no string values remain (e.g. when every rating is neutral).
ratings = feedbackdf['frustrated']
rating_masks = [
    (ratings.str.match('neutral\n', na=False), False),
    (ratings.str.match('positive\n', na=False), False),
    (ratings.str.match('negative\n', na=False), True),
]

for rating_mask, is_frustrated in rating_masks:
    feedbackdf.loc[rating_mask, 'frustrated'] = is_frustrated

In [None]:
# Collect the distinct account IDs found in the feedback data, as strings
# left-padded with zeros to at least 12 characters so they share a common width

unique_account_ids = feedbackdf.account_id.unique().tolist()
accounts = [str(account).zfill(12) for account in unique_account_ids]

In [None]:
# Output the list of accounts to a .txt file (/tmp/accounts.txt).
# The IDs are written on one line, separated by "', '"; no opening quote before
# the first ID or closing quote after the last one is written.

# Use a context manager so the file is closed even if the write raises
with open(os.path.join("/tmp", "accounts.txt"), "w") as accounts_file:
    accounts_file.write('\', \''.join(accounts))

In [None]:
# Initialize the list of session_ids and corresponding labels.
# The two lists are kept in lockstep: labels[i] is the label for session_ids[i].
session_ids = []
labels = []

# Normalize the clickstream timestamps to midnight once, up front; this does not
# depend on the feedback row, so there is no need to recompute it per iteration
metric_days = pd.to_datetime(metricsdf.timestamp).dt.normalize()

# Iterate through every row in the feedback (sentiment) data frame.
# (The loop previously iterated over `df`, a name that is never defined in this notebook.)
for index, row in feedbackdf.iterrows():
    # Get the account ID, the calendar day of the feedback (YYYY-MM-DD), and the label
    account_id = row.account_id
    timestamp = str(pd.to_datetime(row.timestamp))[:10]
    label = row.frustrated

    # Correlate the account id and day with the sessions from the click metrics dataframe
    same_account = metricsdf.account_id == account_id
    same_day = metric_days == timestamp
    sessions = metricsdf[same_account & same_day].session_id.unique()

    # Record each matching session together with this row's label
    # (extending with an empty result is a no-op, so no emptiness check is needed)
    session_ids.extend(sessions)
    labels.extend([label] * len(sessions))

In [None]:
# Initialize a new results dataframe
labeleddf = pd.DataFrame(columns=['timestamp','account_id','session_id','metric','frustrated'])

# Collect the clickstream rows of every correlated session and concatenate them
# in a single call; calling pd.concat inside the loop re-copies the accumulated
# frame on every iteration.
session_frames = [labeleddf]
session_frames.extend(metricsdf[metricsdf.session_id == sid] for sid in session_ids)
labeleddf = pd.concat(session_frames, axis=0, ignore_index=True)

# Apply the labels in the original order, so that if a session id occurs more
# than once in session_ids, the last label still wins.
# NOTE(review): a repeated session id also contributes its rows more than once —
# confirm whether duplicates in session_ids are expected.
for sid, session_label in zip(session_ids, labels):
    labeleddf.loc[labeleddf.session_id == sid, 'frustrated'] = session_label

In [None]:
# Write the results back to S3
# This cell serializes the labeled dataframe (without its index) to a local CSV file.
# NOTE(review): no S3 upload call is visible in this cell — confirm it happens elsewhere.

local_csv = '/tmp/results.csv'
with open(local_csv, mode="wb") as results_file:
    csv_text = labeleddf.to_csv(None, index=False)
    results_file.write(csv_text.encode())