### Data loading and analysis notebook for Allio Bullet R1 V2 roast data https://aillio.com/
###### by Ryan @f.w.Bennies https://www.instagram.com/f.w.bennies/

#### Objectives
###### Automaticaly load, serialize, and combine data from .json files. Then clean up unwanted data (non-standard batches)
###### Split into curve and point data, create a few new features
###### Summarize and display data (EDA) to enable data driven decisions in planning and real-time roasting

#### Nice to haves:
###### impute data (with confidence) rather than remove missing data
###### generate additional new features
###### incorporate bean density

#### Excluding
###### assocation for brewing techniques or results such as taste and aroma
###### quantified color changes

##### Note: If your RoastTime isn't installed in the default MacOS location, edit 'base_path' (lines ~18-19)

In [2]:
#################################################
## open each .json in folder and append to df  ##
#################################################
from pathlib import Path
import os
from datetime import datetime
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
import scipy.stats as stats
pd.set_option('display.max_columns', 55)
plt.style.use('seaborn-colorblind')


# Load from roasTime repository on macOS
home = os.path.expanduser('~')
base_path = os.path.join(home, 'Library/Application Support/roast-time/roasts')
##  IF you have a specific set of roast profiles in another folder, uncomment the below
# base_path = Path('/data')
df = pd.DataFrame()

for entry in os.listdir(base_path):
    full_path =  Path('%s/%s' % (base_path, entry))
    with full_path.open() as f:
        data = json.loads(f.read())
        df_load = pd.json_normalize(data)
    df = pd.concat([df,df_load], ignore_index=True)

# Export the raw DataFrame to a .csv file with no timestamp just for the record
#create subfolder
subfolder = 'csvExports/'
if not os.path.exists(subfolder):
    os.makedirs(subfolder)

df.to_csv(subfolder + f"raw_bullet-roasting_df.csv", index=False)

#############################
##  First basic clean up   ##
#############################

df.sort_values(by='dateTime', inplace = True)
df = df.reset_index()
df = df.drop(columns = ['index'])

# Removing unknown or unnamed roasts as they will probably give later errors due to lack of data
df = df[df['roastName'].notna()]

## Filter out roasts of others that you have saved
## enter specific userID (extracted from .json roast file, or in web address when viewing on RoastWorld)
choiceUserID = '73009f59-2d2e-4215-b6ff-961946ee0b80' 
df = df[df.userId == choiceUserID]
# simpler filter is using isFork as identifyer of roasts saved (not your roasts)
df = df[df.isFork != 1.0]  # use '!= 1.0' for your own roasts and '== 1.0' for other users saved roast. comment out for all roasts


# define list of unused meta data and create meta_df
other_meta = ['userId', 'isFork', 'serialNumber', 'IRSensor', 'inventory.nextGreenWeight',
              'inventory.previousGreenWeight','inventory.changeInGreenWeight', 'isPrivate']

# Drop the extra data
df.drop(other_meta, axis=1, inplace = True)

In [3]:
#############################
# Clean up and Enrich data  #
#############################

# change dtypes
dtypeList = ['weightGreen','weightRoasted','ambient', 'humidity', 'ambientTemp','roomHumidity']
for d in dtypeList:
    df[d] = pd.to_numeric(df[d], errors='coerce')
    
# combine RT2 ambientTemp and roomHumidity with RT3 ambient and humidity (note I am using ambient F)
ambientMeasurement = 'F'  #or C
df['ambient'].where(df['ambient'].isna() == False, df['ambientTemp'].astype(float), inplace = True)
df['humidity'].where(df['humidity'].isna()== False, df['roomHumidity'].astype(float), inplace = True)
df.drop(columns=['ambientTemp', 'roomHumidity','exitTemperature'], inplace =  True)  #dropping old environmental data and exitTemp
df.ambient.replace(to_replace = 0.0, value = np.nan, inplace = True) # some case of temp actually being 0.0 C or F will be excluded, but needed to remove outliers

# convert dateTime
from datetime import datetime
if df['dateTime'].dtype != 'datetime64[ns]':
    pd.to_numeric(df['dateTime'],errors='coerce')
    for dt in df['dateTime']:
        df['dateTime'] = datetime.fromtimestamp(int(dt/1000))

# Calculate -> weight lost percent = 100 * (green - roasted)/ green #   PLUS OTHERS
df['weightLostPercent'] = 100 * (df['weightGreen'] - df['weightRoasted']) / df['weightGreen']
df['weightLostPercent'].where(df['weightLostPercent'] < 17, np.nan, inplace = True)  ## Future change to 50, 17 is too low

# Fix low and high pre-heat errors (replace drumChargeTemperature w/ PH temp when z value > 3)
# May want to edit raw json file if you still have bad preheat temps
#df['drumChargeTemperature'].where(abs(stats.zscore(df.drumChargeTemperature-df.preheatTemperature)) < 3,
#                                  df['preheatTemperature'], inplace = True)

# calulate difference of beanDropTemp and beanChargeTemp (not ITBS, this should relative)   # maybe
df['Drop-ChargeDeltaTemp'] = df['beanDropTemperature'] - df['beanChargeTemperature']

# remove instances where FC was not picked or picked late #  ## Would be better to impute FC values in the future
for i, row in df.iterrows():
    if (df.loc[i,'indexFirstCrackStart'] > 2400 or  # if greater then 20 mins (2400 index)
       df.loc[i,'indexFirstCrackStart'] == 0):
            df.loc[i,'indexFirstCrackStart'] == np.nan
    if df.loc[i,'weightRoasted'] < 10:
        df.loc[i,'weightRoasted'] == np.nan
    if df.loc[i,'weightLostPercent'] > 50:
        df.loc[i,'weightLostPercent'] == np.nan
sampleRate = 2
display (df.head(5))

Unnamed: 0,beanChargeTemperature,beanDropTemperature,drumChargeTemperature,drumDropTemperature,beanTemperature,drumTemperature,beanDerivative,ibtsDerivative,preheatTemperature,roastStartIndex,roastEndIndex,totalRoastTime,indexFirstCrackStart,indexFirstCrackEnd,indexSecondCrackStart,indexSecondCrackEnd,indexYellowingStart,weightGreen,weightRoasted,roastNumber,sampleRate,hardware,firmware,missingSeconds,rorPreheat,uid,dateTime,roastName,ambient,humidity,beanId,softwareVersion,firmwareVersion,updatedAt,actions.actionTempList,actions.actionTimeList,guid,slug,number,rating,comments,updated_at,annotationComments,roastDegree,recipeID,weightLostPercent,Drop-ChargeDeltaTemp
4,125.9,187.6,237.7,204.6,"[125.9, 126.1, 126.2, 126.3, 126.4, 126.5, 126...","[237.7, 237.6, 237.9, 238, 237.3, 25, 25, 25, ...","[18.7, 17.6, 16.1, 14.3, 12.8, 11, 7, -0.9, -1...",[],240,0,1099,547,880,0,0,0,756,370,0.0,8,2,67240704,562,[],,56c8ea68-dca0-4ec6-8934-e469744b425d,2023-01-16 18:34:34,First Post-seasoning,78.0,90.0,3105b557-916f-40b6-bec1-59f17c2a8fd7,,,1608086908491,[],"[{'index': 6, 'value': 7, 'ctrlType': 0}, {'in...",56c8ea68-dca0-4ec6-8934-e469744b425d,qAgMK,983.0,2,First real attempt after seasoning. Several sm...,1595645000000.0,,,,,61.7
5,150.1,189.8,244.9,207.6,"[150.1, 150.1, 150.1, 150.1, 150.2, 150.2, 150...","[244.9, 245.1, 244.8, 244.6, 243.9, 25, 25, 25...","[2.6, 2.8, 2.6, 2.2, 1.2, -0.4, -4.4, -11.5, -...",[],245,0,971,483,760,0,0,0,406,380,336.0,9,2,67240704,562,[],,a455d0a0-23a7-4425-9ce5-950fd97e238e,2023-01-16 18:34:34,2nd Ethopia Extended Dev,,50.0,3105b557-916f-40b6-bec1-59f17c2a8fd7,,,1610852527878,[],"[{'index': 6, 'value': 7, 'ctrlType': 0}, {'in...",a455d0a0-23a7-4425-9ce5-950fd97e238e,KVrvK,984.0,2,Better results to drop temp by increasing fan ...,1595986000000.0,,,,11.578947,39.7
6,147.8,202.6,245.1,216.1,"[147.8, 148, 148.1, 148.3, 148.4, 148.5, 148.6...","[245.1, 245.7, 245.6, 245.1, 245, 25, 25, 25, ...","[19.7, 19, 18.7, 18.2, 16.6, 14.4, 11.3, 5.9, ...",[],245,0,1031,513,838,0,0,0,422,490,425.0,10,2,67240704,562,[],,f9561e80-21de-4d2c-9258-eff0071b8c05,2023-01-16 18:34:34,First BBP attempt - CItyish,,,3105b557-916f-40b6-bec1-59f17c2a8fd7,,,1612586069268,[],"[{'index': 6, 'value': 7, 'ctrlType': 0}, {'in...",f9561e80-21de-4d2c-9258-eff0071b8c05,qBmdp,986.0,2,First BBP PH try. PH to 260 9 mins + 2 min hol...,,,,,13.265306,54.8
7,150.0,199.8,245.3,208.6,"[150, 150.1, 150.2, 150.3, 150.5, 150.6, 150.6...","[245.3, 244.9, 244.9, 243.7, 243.5, 25, 25, 25...","[16.6, 15.8, 15.8, 15.4, 14.5, 6.9, 6.9, -0.8,...",[],245,0,1125,560,872,0,0,0,448,490,0.0,12,2,67240704,562,[],,965db6e7-37ff-4504-a616-1bd64f50723f,2023-01-16 18:34:34,Ethiopia #4 w/ marcel,71.0,75.0,3105b557-916f-40b6-bec1-59f17c2a8fd7,,,1612586098283,[],"[{'index': 6, 'value': 7, 'ctrlType': 0}, {'in...",965db6e7-37ff-4504-a616-1bd64f50723f,4QR6K,988.0,2,Tried to use an increased fan speed to mitgate...,1596505000000.0,,,,,49.8
8,158.7,198.9,247.3,211.0,"[158.7, 158.8, 159, 159.2, 159.3, 159.5, 159.5...","[247.3, 247.2, 247, 246.5, 245.3, 25, 25, 25, ...","[21.8, 21.5, 20.7, 19.9, 18.7, 14.8, 7.4, -3.6...",[],245,0,1037,516,780,0,0,0,402,490,427.0,13,2,67240704,562,[],,a013107d-595b-495b-bc93-17310915df38,2023-01-16 18:34:34,Ethiopian 5th - Hot Start,80.0,52.0,3105b557-916f-40b6-bec1-59f17c2a8fd7,,,1612586118186,[],"[{'index': 6, 'value': 7, 'ctrlType': 0}, {'in...",a013107d-595b-495b-bc93-17310915df38,qb334,990.0,2,"Started with a long 35mins PH to 260, then dro...",1596505000000.0,,,,12.857143,40.2


<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 4 to 111
Data columns (total 47 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   beanChargeTemperature   107 non-null    float64       
 1   beanDropTemperature     107 non-null    float64       
 2   drumChargeTemperature   107 non-null    float64       
 3   drumDropTemperature     107 non-null    float64       
 4   beanTemperature         107 non-null    object        
 5   drumTemperature         107 non-null    object        
 6   beanDerivative          107 non-null    object        
 7   ibtsDerivative          95 non-null     object        
 8   preheatTemperature      107 non-null    int64         
 9   roastStartIndex         107 non-null    int64         
 10  roastEndIndex           107 non-null    int64         
 11  totalRoastTime          107 non-null    int64         
 12  indexFirstCrackStart    107 non-null    int64     

None

In [5]:
######################################################
# deconstruct temp curves from lists to new curve_df #
######################################################
## data for temp and derivative curve of each  is in a single cell as a list ###
# For each roast (row) extract list of each curve data into a new DF and transpose,
# add indexTime unique to each roast, add curve, concatenate to buddle all roasts curves.

temp_curve_df = pd.DataFrame()
curve_df = pd.DataFrame()
for index, row in df.iterrows():
        v = pd.DataFrame([pd.Series(row['beanTemperature'], name = 'beanTemperature', dtype='float64'), 
                                      pd.Series(row['drumTemperature'], name = 'drumTemperature', dtype='float64'),
                                      pd.Series(row['beanDerivative'], name = 'beanDerivative', dtype='float64'),
                                      pd.Series(row['ibtsDerivative'], name = 'ibtsDerivative', dtype='float64')]).T
        temp_curve_df['indexTime'] = temp_curve_df.index
        temp_curve_df['roastName'] = row['roastName']
        temp_curve_df['softwareVersion'] = row['softwareVersion']
        
        curve_df = pd.concat([curve_df, temp_curve_df],ignore_index = True) #update from #curve_df = curve_df.append(temp_curve_df,ignore_index = True) 1/23/24 for append depreciation

# first pass 2nd Derivative - To be updated with GNPT suggestion
#curve_df['ibts2ndDerivative'] = curve_df.groupby('roastName')['ibtsDerivative'].apply(lambda x:x.diff())

curve_df.fillna(value=np.nan)
display (curve_df.head(3))
display (curve_df.tail(3))

Unnamed: 0,indexTime,roastName,softwareVersion


Unnamed: 0,indexTime,roastName,softwareVersion


In [7]:
########################################################
## Create df of point sets (single entry per profile) ##
########################################################
pd.set_option('display.max_columns', 500)

point_list = ['beanChargeTemperature', 'beanDropTemperature', 'drumChargeTemperature',
           'drumDropTemperature', 'preheatTemperature', 'roastStartIndex', 'roastEndIndex',
           'totalRoastTime', 'indexFirstCrackStart', 'indexFirstCrackEnd', 'indexYellowingStart',
           'weightGreen', 'weightRoasted', 'weightLostPercent','deltaTemp',
           'roastNumber', 'sampleRate', 'firmware', 'missingSeconds',
           'dateTime', 'roastName', 'comments', 'updatedAt',
           'ambient', 'humidity', 'rating', 'beanId']
point_df = pd.DataFrame(df, columns = point_list).reset_index()
point_df.drop(columns='index', inplace = True)
point_df.indexYellowingStart = point_df.indexYellowingStart.fillna(value=np.nan)
point_df['totalRoastTime'] = point_df.totalRoastTime/60  # apparently totalRoastTime is counted in seconds not index steps
display (point_df.tail(3))

Unnamed: 0,beanChargeTemperature,beanDropTemperature,drumChargeTemperature,drumDropTemperature,preheatTemperature,roastStartIndex,roastEndIndex,totalRoastTime,indexFirstCrackStart,indexFirstCrackEnd,indexYellowingStart,weightGreen,weightRoasted,weightLostPercent,deltaTemp,roastNumber,sampleRate,firmware,missingSeconds,dateTime,roastName,comments,updatedAt,ambient,humidity,rating,beanId
104,198.5,205.2,284.5,210.9,285,0,1197,9.933333,1122,0,0,750,,,,132,2,602,[],2023-01-16 18:34:34,#111 Ethiopia Refisa - ok,messed this one up by entering in the weight a...,1673918661500,,0.0,,qR10Om9dg1PT7TeH1vLe
105,177.3,206.1,275.1,211.0,275,0,1316,10.933333,1122,0,528,750,664.0,11.466667,,133,2,602,[],2023-01-16 18:34:34,#112 Ethiopia Resfisa,,1673924253356,25.4,61.0,,qR10Om9dg1PT7TeH1vLe
106,166.1,206.4,268.7,210.4,270,0,1387,11.516667,1122,0,600,774,667.0,13.824289,,133,2,602,[],2023-01-16 18:34:34,#113 Ethiopia Mix Refisia and Hambela Dabaye,Good roast to show effect of longer soak and s...,1673918735082,25.3,63.0,,qR10Om9dg1PT7TeH1vLe


In [8]:
### Enrich Point_DF ###

# Goals:
# first(MAX ROR) [Done = TP, Yellow, FC, weight lost]
# next (drying-malliard-dev times, and mean ROR between points & phases)
# then (ROR changes after FC, 2nd Derivitive)

################################################################################################
##  Find turning point index and index at 165 deg bean Temp (alt to inconsistently picked YP) ##
################################################################################################
roastName_df = curve_df.groupby(['roastName']) 
for name, group in roastName_df:
    minBT = group.beanTemperature.min()  ### Get to 0 ROR indexTime (TP) via first occurance of minBT
    for i,row in group.iterrows():
        if row.beanTemperature == minBT and row.beanDerivative >= 0:   # multiple min points likely, so combined with first point climbing past 0 ROR
           # print (name,i,row)  # Use this NEXT -  TO FIND WHAT 0 and nan problems are! 1/17
            point_df.loc[(point_df.roastName == name),'indexTurningPoint'] = row.indexTime
            point_df.loc[(point_df.roastName == name),'ibtsTurningPointTemp'] = row.drumTemperature
            break
    for i,row in group.iterrows():
        if row.indexTime > 100 and row.drumTemperature >= 165:
            autoYP165 = row.indexTime
            point_df.loc[(point_df.roastName == name),'index165PT'] = autoYP165
            break
    
    #print (f'autoYP165 = {autoYP165} seconds or {round(autoYP165/2/60,2)} mins for roast (group name) {name} check if correct')
point_df['turningPointTime'] = (point_df.indexTurningPoint)/60/sampleRate
    
# replace missing or bad YP pick with autoYP165   ### Probably should just switch all YP to autoYP165
point_df.loc[(point_df.indexYellowingStart < 1), 'indexYellowingStart'] = point_df.index165PT
point_df.loc[(point_df.indexYellowingStart.isnull()), 'indexYellowingStart'] = point_df.index165PT
point_df['yellowPointTime'] = point_df.indexYellowingStart/60/sampleRate
# replace bad FC points with np.nan. # Ryan why didn't you use the .replace() function?
point_df.loc[(point_df.indexFirstCrackStart == 0),'indexFirstCrackStart'] = np.nan
point_df.loc[(point_df.indexFirstCrackStart >10000),'indexFirstCrackStart'] = np.nan
point_df['firstCrackTime'] = point_df.indexFirstCrackStart/60/2

# time/temp
point_df['time/temp'] = point_df.totalRoastTime/point_df.beanDropTemperature

# ITBS BeanProbe difference for change over time plot
point_df['deltaIBTS-BT'] = point_df.drumDropTemperature - point_df.beanDropTemperature

###roast phases###
## it is of the opinion that Drying, Browning, and Development are not ideal, 
## as the bean is still drying after yellowing, still browning after FC, and certainly developing before FC
## thus Pre-YP, Pre-FC, Post-FC

#point_df['pre-YellowPointPhase'] = 
#point_df['pre-FirstCrackPhase'] = 
#point_df['post-FirstCrackPhase'] = 
# Development Time Ratio (DTR)
#point_df['DTR'] = 

#display(point_df)

display(roastName_df)


AttributeError: 'DataFrame' object has no attribute 'indexTurningPoint'

In [None]:
#####################################     
##  Save transformed data to .csv  ##
#####################################

#home = os.path.expanduser('~')
#save_path = os.path.join(home, 'Google Drive (not syncing)/Programing/Python/Coffee/bullet-roasting/ExportCSVs/') # used on select roast profiles in other folder
#print (home)

#create subfolder
subfolder = 'Cleaned Data Exports'
if not os.path.exists(subfolder):
    os.makedirs(subfolder)

now = datetime.now()
currentDateTime = now.strftime("%Y-%m-%d_%H%-M")

df.to_csv(subfolder + r'df_bulkData_' + currentDateTime + '.csv')   # may need   , index=False)
curve_df.to_csv(subfolder + r'curve_df_' + currentDateTime + '.csv')
point_df.to_csv(subfolder + r'point_df_' + currentDateTime + '.csv')

display ('Data frames saved with current Date_Time ' + currentDateTime)


## ---------
## BREAK HERE: LOAD AND TRANSFORM ABOVE - VIEW AND ANALYZE BELOW
## ----------------

In [None]:
#####################    playing around with plot types and early observations as I learn
## Plot points_df  ##
#####################
# Bar Plot example
#point_df.plot.bar(x='roastName', y=["preheatTemperature","beanDropTemperature"])

# Box Plot example
#point_df['ibtsTurningPointTemp'].plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)

# PH to yellowing pt relation
point_df.plot.scatter(x='yellowPointTime', y='drumChargeTemperature')  #['indexYellowingStart', 'indexFirstCrackStart', 'roastEndIndex'])
# shows errors in recording actualy preheat temp (or charging too early)
point_df.plot.scatter(x='beanChargeTemperature', y='preheatTemperature') 

# difference between thermal probes
#point_df.plot.scatter(x= 'beanChargeTemperature', y='drumChargeTemperature')  
plt.show()

# colored scatter example
#iFCS = point_df['indexFirstCrackStart']/120
#plt.scatter(iFCS, point_df['drumChargeTemperature'], 
#            s=point_df['drumDropTemperature'], c=point_df['roastEndIndex'], alpha=0.5)
#plt.xlabel('First Crack Start (mins)')
#plt.ylabel('Preheat Temp (℃)')
#plt.show()

# Scatter Matrix 
from pandas.plotting import scatter_matrix 
# selecting  numerical features 
features = [ 'drumChargeTemperature', 'ibtsTurningPointTemp', 'turningPointTime', 'yellowPointTime',
            'indexFirstCrackStart', 'beanDropTemperature','totalRoastTime','weightLostPercent',
            'Drop-ChargeDeltaTemp', 'ambient', 'time/temp']
sm = scatter_matrix(point_df[features], range_padding= .5, alpha = .9, figsize=(15,15))
[s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
[s.yaxis.label.set_rotation(0) for s in sm.reshape(-1)]
[s.get_yaxis().set_label_coords(-1,0.5) for s in sm.reshape(-1)]

plt.show() 

In [None]:
## finding outliers and specific beans from above
display (df.loc[(df.weightLostPercent > 15),['roastName', 'weightGreen', 'weightRoasted', 'weightLostPercent']])
print ('\n kenyans')
for n in point_df.roastName:
    if n.__contains__("kenya") or n.__contains__("Kenya"):
        print (n)
print ('\n short roasts, less than 8 min')   
display (point_df.loc[(point_df.totalRoastTime) < 8,['roastName']])

print ('\n high pre-heat outliers')   
display (point_df.loc[(point_df.preheatTemperature) > 260,['roastName', 'preheatTemperature','drumChargeTemperature']])

print ('\n charge temp  < 205 C')
display (point_df.loc[(point_df.drumChargeTemperature) < 205,['roastName', 'preheatTemperature','drumChargeTemperature']])


print ('\n low bean drop temp C')
display (point_df.loc[(point_df.beanDropTemperature) < 175,['roastName', 'preheatTemperature','beanDropTemperature']])



In [None]:
# Linear Regression with scipy - try another method too
import matplotlib.pyplot as plt
from scipy import stats

## bean id ##
# Kenyan Gakuyu-Ini    55c16e48-9b08-4b2f-93d5-8f8b089a5017
# Ethiopian Yirg   4a557687-d410-4ad2-b504-27b518a9a7ae
beans = ['55c16e48-9b08-4b2f-93d5-8f8b089a5017','4a557687-d410-4ad2-b504-27b518a9a7ae']
bean_df = point_df[point_df.beanId.isin(beans)]
### USER INPUTS FOR DF TO BE USED ###
df_no_missing = bean_df[['drumChargeTemperature','weightLostPercent','drumDropTemperature','beanId',
                         'roastName','yellowPointTime','ibtsTurningPointTemp', 'totalRoastTime']].dropna()
allbean_df = point_df[['weightGreen','drumChargeTemperature','drumDropTemperature','beanId','roastName',
                       'yellowPointTime', 'ibtsTurningPointTemp', 'firstCrackTime',
                       'totalRoastTime','ambient','weightLostPercent', 'time/temp','deltaIBTS-BT','roastNumber']].dropna()
def OriginDataFrame (sourceDF,originCountry):
    global origin_df
    origin_list = []
    origin_df = pd.DataFrame() 
    for n in sourceDF.roastName:
        if str.lower(originCountry) in str.lower(n):
            origin_list.append(n) 
    if len(origin_list) == 0:
        print (f'\n{originCountry} not in sourceDF, check!')
    origin_df = sourceDF[sourceDF.roastName.isin(origin_list)]


# define Function for Linear Regression
def LinearRegression(sourceDF, x_name, y_name, title):
    """creates a simple linera regression plot with 4 inputs"""
    if len(sourceDF) == 0 :
        print ('sourceDF is empty, select another df for regression \n')
    else:
        x= sourceDF[x_name]; y= sourceDF[y_name]
        slope, intercept, r, p, std_err = stats.linregress(x, y)
        print ('\n\n')
        print (stats.linregress(x, y) )
        def myfunc(x):
          return slope * x + intercept
        mymodel = list(map(myfunc, x))
        plt.title(title)
        xunit = ''
        yunit = ''
        if 'time' in str.lower(x_name):
            xunit = '(mins)'
        elif 'temp' in str.lower(x_name):
            xunit = '(℃)'
        elif 'ambient' in str.lower(x_name):
            xunit = '(℉)'
        else:
            xunit = ''
        if 'time' in str.lower(y_name):
            yunit = '(mins)'
        elif 'temp' in str.lower(y_name):
            yunit = '(℃)'
        elif 'ambient' in str.lower(y_name):
            yunit = '(℉)'
        else:
            yunit = ''   
        plt.xlabel(f'{x_name} {xunit}')
        plt.ylabel(f'{y_name} {yunit}')  # (℉) ℃
        plt.scatter(x, y)
        plt.plot(x, mymodel)
        plt.show()
   

In [None]:
#############################
# Plotting LinearRegression #
#############################

## def LinearRegression(sourceDF, x_name, y_name, title):
# Regression 1 - Yellow Points
origin = 'ethiopia'
OriginDataFrame (allbean_df, origin )
LinearRegression(origin_df,'drumChargeTemperature','firstCrackTime','Check range of PH temps')  

fivehundredDF = allbean_df[allbean_df['weightGreen']>450]
# Regression 2 - Turning Points
LinearRegression(fivehundredDF,'roastNumber','deltaIBTS-BT','Probe delta')  

# Box Plot 
fivehundredDF['firstCrackTime'].plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False, showfliers = False)

#def OriginDataFrame (sourceDF,originCountry):
# Regression 3 - Origins
origin = 'ethiopia'
OriginDataFrame (allbean_df, origin )
LinearRegression(origin_df,'firstCrackTime','yellowPointTime', origin)  


# plot for all roasts no regression
x_name = 'ambient'
y_name = 'yellowPointTime'
groups = allbean_df.groupby('beanId')
for name, group in groups:
    plt.plot(group[x_name], group[y_name], marker='o', linestyle='', markersize=6, label=name)
plt.title('All Roasts colored by beanId')
plt.xlabel('ambient (℉)')
plt.ylabel('yellowPointTime (mins)')
#plt.legend()

In [None]:
########################################################  # NEED TO-DO - tighten plots, maybe in groups relative to origin
## Creating full plot temp curves and associated RORs ##  
########################################################

# selecting specific roastName; many ways to select roasts here...  
roast_list = ['#50 Kenya Recipie off 48']
plot_df = curve_df[curve_df.roastName.isin(roast_list)]

from cycler import cycler
temp_list = ['beanTemperature','drumTemperature']
derivative_list = ['beanDerivative','ibtsDerivative']

#plt.style.use('default')   #change for future
fig, ax1 = plt.subplots()

ax1.set_xlabel('roast time (mins)')
ax1.set_ylabel('temp (°C)', color='b')
ax1.set_prop_cycle(cycler('color', ['k', 'b']) + cycler('lw', [1, 2])) # necessary to set ax1 parameters before plot
ax1.tick_params(axis='y', labelcolor='b')
ax1.set_title("Bullet Roasts") 
ax1.scatter((plot_df.indexTime/sampleRate)/60, plot_df['beanTemperature'], label = 'Bean Temp', s=0.03)
ax1.scatter((plot_df.indexTime/sampleRate)/60, plot_df['drumTemperature'], label = 'Drum Temp', s=0.03)

print (len((plot_df.indexTime/sampleRate)/60), len(curve_df[temp_list]))

ax2 = ax1.twinx()  # initiate a second axes that shares the same x-axis
ax2.set_ylabel('ROR (°C/min)', color='m')  # we already handled the x-label with ax1
ax2.set_prop_cycle(cycler('color', ['r', 'm']) + cycler('lw', [1, 2]))
ax2.scatter((plot_df.indexTime/sampleRate)/60, plot_df['beanDerivative'], label='ROR', s=0.05)
ax2.scatter((plot_df.indexTime/sampleRate)/60, plot_df['ibtsDerivative'], label='ROR', s=0.05)

ax2.tick_params(axis='y', labelcolor='m')
fig.tight_layout()  # otherwise the right y-label can be slightly clipped
plt.ylim([0, 60])
plt.xlim(left=-0.2)
#plt.legend(loc='upper center') # LATER display 4 labels, currently only ROR plots for 2 of 4 curves
plt.show()

############
# SECOND D # visualizing to understand challenges
############
from cycler import cycler
temp_list = ['beanTemperature','drumTemperature']
derivative_list = ['beanDerivative','ibtsDerivative']

fig, ax1 = plt.subplots()
ax1.set_xlabel('roast time (mins)')
ax1.set_ylabel('ROR (°C/min)', color='m')
ax1.set_prop_cycle(cycler('color', ['r', 'm']) + cycler('lw', [1, 2]))
ax1.tick_params(axis='y', labelcolor='m')
ax1.set_title("Bullet Roasts") 
ax1.scatter((plot_df.indexTime/sampleRate)/60, plot_df['beanDerivative'], label='ROR', s=0.05)
ax1.scatter((plot_df.indexTime/sampleRate)/60, plot_df['ibtsDerivative'], label='ROR', s=0.05)
plt.ylim([0, 40])

ax2 = ax1.twinx()  # initiate a second axes that shares the same x-axis
marker_style = dict(marker = '.', linestyle='none', color='0.8', markersize=1,
                    mfc="C1", mec="k") #use linestyle='solid' once negative values removed and smoothed

ax2.set_ylabel(f'Second Derititive (°C/min\N{SUPERSCRIPT TWO})', color='k') 
ax2.plot((plot_df.indexTime/sampleRate)/60, plot_df['ibts2ndDerivative'], 
            label='2nd Derivitive', **marker_style) 
ax2.tick_params(axis='y', labelcolor='k')
fig.tight_layout()  # otherwise the right y-label can be slightly clipped
plt.ylim([0, 3])
plt.xlim(left=-0.2)
plt.legend(loc='upper center') # LATER display 4 labels, currently only ROR plots for 2 of 4 curves
plt.show()
print (f'Roast plotted = {roast_list}')


In [None]:
##ChatGPT code based on an interation of this prompt: Write some python code to scrape the Origin, 
## base (1-Click Rwanda Trio), value (Our 1-Click Rwanda set allows you to add this all-star trio of fresh Rwandan), and list-info (Wet Process) from this green coffee e-commerce website: https://www.sweetmarias.com/1-click-rwanda-trio.html
## Which initally didn't work, it just wasn't getting the right HTML page
## I had to ask about headers, robot.txt, and getting past Enable Javascript and cookiesthen it suggested selenium

from selenium import webdriver
from bs4 import BeautifulSoup

# Initialize the browser
browser = webdriver.Chrome()

# Navigate to the website
browser.get("https://www.sweetmarias.com/1-click-rwanda-trio.html")

# Wait for the page to load
browser.implicitly_wait(10)

# Get the HTML content
html = browser.page_source

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Extract the information you want to scrape
base = soup.find("h1", class_="page-title").get_text()
value = soup.find("div", class_="value").get_text()  # way too many values, doesn't grab it from the prefered area,  ID won't work for other beans
                                                    # also need to get the next 2+ p lines, probably in a list and loop
list_info = soup.find("div", class_="list-info").get_text() #either fix here or break out in df later
region = soup.find("td", class_="col data").get_text() #only works for first instance of "col data"
arrival = soup.find("td", class_="col data").get_text() #pulls from the first col data above (repeated)...

# Print the results
print ("Base:", base)
print("Value:", value)
print("List-info:", list_info)
print("Region:", region)
print("Arrival:", arrival)

# Close the browser
browser.quit()


### Things that didn't work
#autoYP2 = curve_df[(curve_df.index > indexTP) & (curve_df['drumTemperature'] >= 165) ,curve_df.index]
#print (autoYP2)

########
# things that did work or should do more
# display (np.sum(df.ambient == 0.0))
# df.ambient.replace(to_replace = 0.0, value = np.nan, inplace = True) 
##vectorized looping


### 