# SBJ Processing

## Set Notebook Parameters

These parameters are global

Make sure to run the code block below before both pre and post processing

In [28]:
# set working directory -- keep the trailing "/": later cells append sub-folders (e.g. "raw_media/") directly
p_directory = "/Users/smarshall/Desktop/projects/media/sbj/"

# set filename -- base name only, WITHOUT a file extension (later cells append suffixes such as "__labeled.csv")
p_filename = "SBJ_TEST"

## Pre - Processing
- Load raw CSV from media 
- Set load location based on known / unknown asset types
- Attach pipeline label data
- Add `callback_metadata` if required
- Split full csv into sub-csvs by loading location 

In [2]:
import pandas as pd 

# Load the raw media CSV from the "raw_media/" folder under the working directory.
# NOTE(review): the file stem is hard-coded to "sbj__0516" rather than taken from
# `p_filename` -- confirm this is intentional before re-using the notebook for another file.
raw_media = pd.read_csv(
    p_directory + "raw_media/" + "sbj__0516" + ".csv"
)

In [3]:
# asset_type values that count as "unknown"
unknown_asset_types = ["not_available", "unidentified", "na"]

# Rows with an unknown asset type are routed to "multiclass"; everything else goes to "pipeline".
# NOTE: the column name "load_loaction" is misspelled, but other cells read it under this
# exact name, so it is kept as-is here.
raw_media["load_loaction"] = (
    raw_media["asset_type"]
    .isin(unknown_asset_types)
    .map({True: "multiclass", False: "pipeline"})
)

In [4]:
# load the label-data crosswalk CSV from the working directory
label_data_xwalk = pd.read_csv(p_directory + "sbj__pipeline_label_data_xwalk.csv")

# left join on asset_type: every raw media row is kept, matched or not
merged_with_label_data = pd.merge(raw_media, label_data_xwalk, how="left", on="asset_type")

In [None]:
# Optional: attach callback metadata (uncomment the line below if `callback_metadata` needs to be added)
# merged_with_label_data["callback_metadata"] = range(len(merged_with_label_data))

In [5]:
# split the merged frame into one frame per load location (multiclass first, then pipeline)
multiclass_upload, pipeline_upload = (
    merged_with_label_data[merged_with_label_data["load_loaction"].eq(location)]
    for location in ("multiclass", "pipeline")
)

In [6]:
# row count per load location (sanity check on the split)
merged_with_label_data["load_loaction"].value_counts()

load_loaction
pipeline      9587
multiclass    3590
Name: count, dtype: int64

In [7]:
# Write each upload subset to the "loaded/" folder under the working directory, in shuffled row order.
# `rstrip("/")` removes the doubled slash that `p_directory + "/loaded/"` produced when
# `p_directory` already ends in "/", and still works if it does not.
loaded_dir = p_directory.rstrip("/") + "/loaded/"

# A fixed seed keeps the shuffle (and therefore the written files) reproducible across re-runs.
shuffle_seed = 42

for upload_frame, suffix in (
    (multiclass_upload, "__multiclass.csv"),
    (pipeline_upload, "__pipeline.csv"),
):
    shuffled = upload_frame.sample(frac=1, random_state=shuffle_seed)
    shuffled.to_csv(loaded_dir + p_filename + suffix, index=False)

## Post - Processing
- Load relevant CSVs
    - Raw CSV from Media, Hive pipeline export, Hive multiclass export
- Format Hive pipeline export
    - Assign status conditionally based on load location, and Y/N cat job result
    - Subset to relevant rows
- Format Hive multiclass export
    - Tidy Hive output status
    - Subset to relevant rows
- Generate deliverable table
     - Merge Hive results back into original Media CSV to preserve Media columns
     - De-dupe on `callback_metadata` (duplicates are introduced by merging)
     - Remove rows with `court_reflection`
- Export

### Load Relevant CSVs

In [16]:
import pandas as pd

# Load the three inputs for post-processing, in order:
# the raw media CSV, the pipeline export, and the multiclass export.
# NOTE(review): these are absolute local paths and do not use `p_directory` -- confirm before sharing.
raw_media, pipeline_export, multiclass_export = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/smarshall/Desktop/projects/media/sbj/raw_media/sbj__0516.csv',
        '/Users/smarshall/Desktop/sbj__pipeline_export.csv',
        '/Users/smarshall/Desktop/sbj__multiclass_export.csv',
    )
)

### Format Pipeline Export

In [17]:
import numpy as np 

# binary status: the piece at index 3 of `status` when split on double quotes
pipeline_export['binary_status'] = pipeline_export['status'].str.split('"').str.get(3)

# multiclass status: text before the first "}", then the piece at index 2 when split on ":",
# with double quotes stripped
pipeline_export['multiclass_status'] = (
    pipeline_export['status']
    .str.split('}').str.get(0)
    .str.split(':').str.get(2)
    .str.replace('"', '')
)

# label-data status: text after the first "</b>" up to the next "</p>",
# with spaces turned into underscores and lower-cased
pipeline_export['label_data_status'] = (
    pipeline_export['label_data']
    .str.split('</b>').str.get(1)
    .str.split('</p>').str.get(0)
    .str.replace(' ', '_')
    .str.lower()
)

# final status: use label_data_status where binary_status is "yes", otherwise multiclass_status
pipeline_export['delivery_status'] = np.where(
    pipeline_export['binary_status'].eq('yes'),
    pipeline_export['label_data_status'],
    pipeline_export['multiclass_status'],
)

# keep only the columns needed for stacking with the multiclass results
pipeline_for_stack = pipeline_export[['image_url', 'delivery_status']]

### Format Multiclass Export

In [18]:
# format status: strip every non-word character (regex \W) from the raw status string.
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the status unchanged. The raw string (r'...') avoids the
# invalid-escape-sequence warning that a plain '\W' literal triggers.
multiclass_export['delivery_status'] = multiclass_export['status'].str.replace(r'\W', '', regex=True)

# subset data for stack
multiclass_for_stack = multiclass_export[['image_url', 'delivery_status']]

  multiclass_export['delivery_status'] = multiclass_export['status'].str.replace('\W', '')


### Generate Deliverable CSV

In [19]:
# stack the pipeline and multiclass results, then rename the two columns for the merge
stacked_results = (
    pd.concat([pipeline_for_stack, multiclass_for_stack], axis=0, ignore_index=True)
    .rename(columns={'image_url': 'stable_url', 'delivery_status': 'hive_result'})
)

# merge back into the original media csv using `stable_url` as the key;
# the right join keeps every stacked result row
merged_results = pd.merge(raw_media, stacked_results, how='right', on='stable_url')


In [20]:
# keep the first row for each callback_metadata value
# NOTE(review): rows whose callback_metadata is missing would all collapse into one here -- confirm none exist
de_duped_merged_results = merged_results.drop_duplicates(subset=['callback_metadata'])

# drop rows whose hive result is "court_reflection"
no_reflections = de_duped_merged_results[
    de_duped_merged_results['hive_result'].ne('court_reflection')
]

In [21]:
# show the hive result and callback_metadata of every de-duped row labelled "court_reflection"
display(
    de_duped_merged_results.loc[
        de_duped_merged_results['hive_result'].eq('court_reflection'),
        ['hive_result', 'callback_metadata'],
    ]
)

Unnamed: 0,hive_result,callback_metadata
1831,court_reflection,6666
4757,court_reflection,12733
7748,court_reflection,8933
9313,court_reflection,10047
9687,court_reflection,11961
9869,court_reflection,6472
12270,court_reflection,10130
13207,court_reflection,3947


In [22]:
# subset to delivery cols: drop internal helper columns if they are present.
# Both "load_location" and the misspelled "load_loaction" (the name the pre-processing cell
# actually creates) are listed, so the helper column is removed whichever spelling appears.
# `errors='ignore'` skips any listed column that is absent from the frame.
sbj_delivery = no_reflections.drop(
    columns=['callback_metadata', 'rand', 'load_location', 'load_loaction', 'pipeline_label_data'],
    errors='ignore',
)

### Export

In [32]:
# write the labeled delivery table to the "delivered/" folder under the working directory
sbj_delivery.to_csv(
    "".join([p_directory, "delivered/", p_filename, "__labeled.csv"]),
    index=False,
)