### Data loading and analysis notebook for Aillio Bullet R1 V2 roast data https://aillio.com/
**by Ryan @f.w.Bennies https://www.instagram.com/f.w.bennies/**

<img src="allRoastsPlt.png" alt="friends" width="270"/> <img src="bulletRoastingEDA.png" alt="friends" width="210"/><img src="friendshipsign.png" alt="friends" width="210"/>


#### Objectives
 Automatically load, serialize, and combine data from .json files. Then clean up unwanted data (non-standard batches)
 Split into curve and point data, create a few new features
 Summarize and display data (EDA) to enable data driven decisions in planning and real-time roasting

#### Nice to haves:
 impute data (with confidence) rather than remove missing data
 generate additional new features
 incorporate bean density

#### Excluding
 association with brewing techniques or results such as taste and aroma
 quantified color changes

*Note: If your RoastTime isn't installed in the default MacOS location, edit 'base_path' (lines ~18-19)*

*Built for my Bullet (hardware version) purchased in July 2020 - noteworthy because the data structure of the .json files has changed over time, which requires some merges or gap fixes.*

In [None]:
#################################################
## open each .json in folder and append to df  ##
#################################################
import os
from pathlib import Path
import json
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
import scipy.stats as stats
pd.set_option('display.max_columns', 55)

# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in matplotlib 3.6
# and the old names were removed afterwards, so pick whichever name this install provides.
_colorblind_style = next(
    (style for style in ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
     if style in plt.style.available),
    None,
)
if _colorblind_style is not None:
    plt.style.use(_colorblind_style)

# Load from the RoasTime repository on macOS
home = os.path.expanduser('~')
base_path = os.path.join(home, 'Library/Application Support/roast-time/roasts')
##  IF you have a specific set of roast profiles in another folder, uncomment the below
# base_path = Path('/data')

# Parse every roast file into its own frame and concatenate ONCE at the end;
# growing a DataFrame with concat inside the loop re-copies all rows on every iteration.
roast_frames = []
for full_path in sorted(Path(base_path).iterdir()):   # sorted -> deterministic load order
    # Skip sub-folders and hidden files (e.g. macOS '.DS_Store'): they are not roast profiles
    if not full_path.is_file() or full_path.name.startswith('.'):
        continue
    try:
        with full_path.open(encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        # Report and move on instead of aborting the whole load on one stray file
        print(f"Skipping {full_path.name}: not a readable JSON roast file ({err})")
        continue
    roast_frames.append(pd.json_normalize(data))

# An empty folder yields an empty frame (same as before) rather than a concat error
df = pd.concat(roast_frames, ignore_index=True) if roast_frames else pd.DataFrame()

# Export the raw DataFrame to a .csv file just for the record
# create subfolder (exist_ok avoids the check-then-create race)
subfolder = 'csvExports/'
os.makedirs(subfolder, exist_ok=True)

df.to_csv(subfolder + "raw_bullet-roasting_df.csv", index=False)

#############################
##  First basic clean up   ##
#############################

# Sort chronologically and remove rows that have no roast name
df = df.sort_values(by='dateTime').dropna(subset=['roastName'])

# Filter dataframe to only your User ID, excluding forked profiles

choiceUserID = '73009f59-2d2e-4215-b6ff-961946ee0b80' ## enter specific userID (extracted from .json roast file, or RoastWorld web address)
# .copy() makes the filtered result an independent frame, so the column edits made by
# later cells do not raise SettingWithCopyWarning or silently write to a temporary.
df = df.loc[(df['userId'] == choiceUserID) & (df['isFork'] != 1.0)].copy()

# Define list of unused data
other_meta = ['userId', 'isFork', 'serialNumber', 'IRSensor', 'inventory.nextGreenWeight',
              'inventory.previousGreenWeight', 'inventory.changeInGreenWeight', 'isPrivate',
              'slug', 'updated_at', 'updatedAt', 'hardware']

# Drop the extra data; errors='ignore' because not every export contains every one of
# these columns, and it also makes the cell safe to re-run.
df = df.drop(columns=other_meta, errors='ignore')
display(df)

In [None]:
## This is where I may put a LOAD from the outside .py file

In [None]:
#############################
# Clean up and Enrich data  #
#############################
# Every step checks for the columns it needs, so the cell can be re-run without the
# previous try/except KeyError guard (which also hid genuine schema problems).
df = df.copy()

# change dtypes (only for the columns present in this data set)
dtype_cols = ['weightGreen','weightRoasted','ambient', 'humidity', 'ambientTemp','roomHumidity']
present_cols = [col for col in dtype_cols if col in df.columns]
if present_cols:
    df[present_cols] = df[present_cols].apply(pd.to_numeric, errors='coerce')

# combine RT2 ambientTemp and roomHumidity with RT3 ambient and humidity (note I am using ambient F)
ambientMeasurement = 'F'  #or C

for current_col, legacy_col in (('ambient', 'ambientTemp'), ('humidity', 'roomHumidity')):
    if legacy_col not in df.columns:
        continue  # already merged on an earlier run, or never present
    if current_col in df.columns:
        # keep the newer value, fall back to the legacy one where it is missing
        df[current_col] = df[current_col].fillna(df[legacy_col].astype(float))
    else:
        df[current_col] = df[legacy_col].astype(float)

# dropping old environmental data and exitTemp
df = df.drop(columns=['ambientTemp', 'roomHumidity', 'exitTemperature'], errors='ignore')

###OPPORTUNITY to impute ambient temp where missing##
# some case of temp actually being 0.0 C or F will be excluded, but needed to remove outliers
# (assigned back explicitly: an in-place replace on `df.ambient` is not guaranteed to reach df)
if 'ambient' in df.columns:
    df['ambient'] = df['ambient'].replace(to_replace=0.0, value=np.nan)

# convert dateTime (epoch milliseconds) -- previously this line sat inside the
# `except KeyError` branch and therefore never ran on a clean first pass.
# The dtype check keeps the conversion from being applied twice.
if not pd.api.types.is_datetime64_any_dtype(df['dateTime']):
    df['dateTime'] = pd.to_datetime(df['dateTime'], unit='ms')

# Calculate -> weight lost percent = 100 * (green - roasted)/ green #   PLUS OTHERS
# Values above the threshold are treated as data-entry errors.
# The cell used to apply a second "> 50" filter further down; it could never match
# after this one, so a single threshold is kept.
maxWeightLostPercent = 17  ## Future change to 50, 17 is too low
df['weightLostPercent'] = 100 * (df['weightGreen'] - df['weightRoasted']) / df['weightGreen']
df['weightLostPercent'] = df['weightLostPercent'].mask(df['weightLostPercent'] > maxWeightLostPercent)

# Fix low and high pre-heat errors (replace drumChargeTemperature w/ PH temp when z value > 3)
# May want to edit raw json file if you still have bad preheat temps
#df['drumChargeTemperature'].where(abs(stats.zscore(df.drumChargeTemperature-df.preheatTemperature)) < 3,
#                                  df['preheatTemperature'], inplace = True)

# calculate difference of beanDropTemp and beanChargeTemp (not IBTS, this should be relative)   # maybe
df['Drop-ChargeDeltaTemp'] = df['beanDropTemperature'] - df['beanChargeTemperature']

# remove instances where FC was not picked or picked late
## Would be better to impute FC values in the future
# .mask() upcasts integer columns cleanly when NaN is introduced
fc_start = df['indexFirstCrackStart']
df['indexFirstCrackStart'] = fc_start.mask((fc_start > 2400) | (fc_start == 0))
df['weightRoasted'] = df['weightRoasted'].mask(df['weightRoasted'] < 10)

display(df.head(3))
display(df.tail(3))

In [None]:
######################################################
# deconstruct temp curves from lists to new curve_df #
######################################################
## data for the temp and derivative curve of each roast is in a single cell as a list ###
# For each roast (row) turn every curve list into a column of a per-roast frame,
# add indexTime unique to each roast, tag it with the roast name / software version,
# then concatenate to bundle all roasts' curves.

curve_cols = ['beanTemperature', 'drumTemperature', 'beanDerivative', 'ibtsDerivative']

temp_curve_df = pd.DataFrame()
curve_frames = []   # collected and concatenated once; concat inside the loop is quadratic
for index, row in df.iterrows():
    # Curves may differ in length; aligning on the positional index pads the shorter ones with NaN
    temp_curve_df = pd.concat(
        [pd.Series(row[col], name=col, dtype='float64') for col in curve_cols],
        axis=1,
    )
    temp_curve_df['indexTime'] = temp_curve_df.index
    temp_curve_df['roastName'] = row['roastName']
    temp_curve_df['softwareVersion'] = row['softwareVersion']
    curve_frames.append(temp_curve_df)

if curve_frames:
    curve_df = pd.concat(curve_frames, ignore_index=True)
else:
    # No roasts: keep the expected columns so the steps below still work
    curve_df = pd.DataFrame(columns=curve_cols + ['indexTime', 'roastName', 'softwareVersion'])

# Calculate second derivative
# first pass at 2nd Derivative, review and see if it should be smoothed
# groupby().diff() keeps curve_df's own index. The earlier .apply(lambda x: x.diff())
# returns a result indexed by (roastName, row) on pandas 2.x, which cannot be assigned back.
curve_df['ibts2ndDerivative'] = curve_df.groupby('roastName')['ibtsDerivative'].diff()

#TO DO - Create 1stDerivative for roasts (.groupby('roastName')) with NaN ibtsDerivative before Aillio started adding it

curve_df = curve_df.fillna(value=np.nan)
display(curve_df.head(3))
display(curve_df.tail(3))


In [None]:
########################################################
## Create df of point sets (single entry per profile) ##
########################################################
pd.set_option('display.max_columns', 500)

# Per-roast scalar fields to carry over from df. A name that is not a column of df
# comes through as an all-NaN column rather than raising.
point_list = [
    'beanChargeTemperature', 'beanDropTemperature', 'drumChargeTemperature',
    'drumDropTemperature', 'preheatTemperature', 'roastStartIndex', 'roastEndIndex',
    'totalRoastTime', 'indexFirstCrackStart', 'indexFirstCrackEnd', 'indexYellowingStart',
    'weightGreen', 'weightRoasted', 'weightLostPercent', 'deltaTemp',
    'roastNumber', 'sampleRate', 'firmware', 'missingSeconds',
    'dateTime', 'roastName', 'comments', 'updatedAt',
    'ambient', 'humidity', 'rating', 'beanId',
]

# Select the columns, then renumber the rows 0..n-1 and discard the old row labels
point_df = (
    df.reindex(columns=point_list)
      .reset_index()
      .drop(columns='index')
)

point_df['indexYellowingStart'] = point_df['indexYellowingStart'].fillna(value=np.nan)
# apparently totalRoastTime is counted in seconds not index steps
point_df['totalRoastTime'] = point_df['totalRoastTime'] / 60
display(point_df.tail(3))

In [None]:
#######################
### Enrich Point_DF ###
#######################

# Goals:
# first(MAX ROR) [Done = TP, Yellow, FC, weight lost]
# next (drying-maillard-dev times, and mean ROR between points & phases)
# then (ROR and 2nd Derivative average values in each phase or between key points (185, 190))

# Curve samples per second, used to turn a sample index into minutes (index / 60 / sampleRate).
# This name was used below without ever being defined (NameError); 2 is the value the
# first-crack conversion already hard-coded.
# NOTE(review): point_df also has a per-roast 'sampleRate' column, presumably holding
# the same value -- confirm before switching to it.
sampleRate = 2

minIndex165 = 120   # ignore samples at or before this index when looking for the 165 crossing
autoYPTemp = 165    # drumTemperature threshold used as the automatic yellow point

# Create the result columns up front so they exist even if no roast produces a match
for new_col in ('indexTurningPoint', 'ibtsTurningPointTemp', 'index165PT'):
    point_df[new_col] = np.nan

#####################################################################################################
##  Find turning point index and index at 165 deg drumTemperature (alt to inconsistently picked YP) ##
#####################################################################################################
# Group by the plain column name: with a one-element list pandas 2.x yields 1-tuples
# as group keys, and `point_df.roastName == name` would then never match.
roastName_df = curve_df.groupby('roastName')
for name, group in roastName_df:
    roast_rows = point_df['roastName'] == name

    # Turning point: first sample sitting at the minimum bean temperature whose ROR is
    # no longer negative (several samples can share the minimum).
    minBT = group['beanTemperature'].min()
    turning = group.loc[(group['beanTemperature'] == minBT) & (group['beanDerivative'] >= 0)]
    if not turning.empty:
        point_df.loc[roast_rows, 'indexTurningPoint'] = turning['indexTime'].iloc[0]
        point_df.loc[roast_rows, 'ibtsTurningPointTemp'] = turning['drumTemperature'].iloc[0]

    # Automatic yellow point: first sample after minIndex165 at or above autoYPTemp
    crossing = group.loc[(group['indexTime'] > minIndex165) & (group['drumTemperature'] >= autoYPTemp),
                         'indexTime']
    if not crossing.empty:
        point_df.loc[roast_rows, 'index165PT'] = crossing.iloc[0]

point_df['turningPointTime'] = point_df['indexTurningPoint'] / 60 / sampleRate

# replace missing or bad YP pick with autoYP165   ### Probably should just switch all YP to autoYP165
yp = point_df['indexYellowingStart']
point_df['indexYellowingStart'] = yp.mask(yp.isna() | (yp < 1), point_df['index165PT'])
point_df['yellowPointTime'] = point_df['indexYellowingStart'] / 60 / sampleRate

# replace bad FC points (never picked, or absurdly late) with np.nan
fc = point_df['indexFirstCrackStart']
point_df['indexFirstCrackStart'] = fc.mask((fc == 0) | (fc > 10000))
point_df['firstCrackTime'] = point_df['indexFirstCrackStart'] / 60 / sampleRate

# time/temp
point_df['time/temp'] = point_df['totalRoastTime'] / point_df['beanDropTemperature']

# IBTS - bean probe difference for change over time plot
point_df['deltaIBTS-BT'] = point_df['drumDropTemperature'] - point_df['beanDropTemperature']

    

In [None]:
###roast phases###
## In my opinion the usual phase names (Drying, Browning, Development) are not ideal,
## as the bean is still drying after yellowing, still browning after FC, and certainly developing before FC,
## thus the phases here are named Pre-YP, Pre-FC, Post-FC

#point_df['pre-YellowPointPhase'] = 
#point_df['pre-FirstCrackPhase'] = 
#point_df['post-FirstCrackPhase'] = 
# Development Time Ratio (DTR)
#point_df['DTR'] = 

#display(point_df)


In [None]:
#####################################
##  Save transformed data to .csv  ##
#####################################

# create subfolder if needed (exist_ok avoids the check-then-create race)
subfolder = 'csvExports/'
os.makedirs(subfolder, exist_ok=True)

# Timestamp used in the file names. The previous "%H%-M" relied on the platform-specific
# "%-M" directive (ValueError on Windows) and ran the hour into an unpadded minute;
# "%H-%M" is portable and unambiguous.
now = datetime.now()
currentDateTime = now.strftime("%Y-%m-%d_%H-%M")

df.to_csv(subfolder + r'df_bulkData_' + currentDateTime + '.csv')   # may need   , index=False)
curve_df.to_csv(subfolder + r'curve_df_' + currentDateTime + '.csv')
point_df.to_csv(subfolder + r'point_df_' + currentDateTime + '.csv')

display('Data frames saved in ' + subfolder + ' folder with current Date_Time ' + currentDateTime)

### ----------------
### BREAK HERE: LOAD AND TRANSFORM ABOVE - VIEW AND ANALYZE BELOW
### ----------------

In [None]:
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# display() only adds a stray "None" to the cell output.
df.info()