In [1]:
import pandas as pd

# Data Cleaning

In [173]:
data_folder = 'data/'

# Read both raw CSV files from the data folder; the first row of each file
# supplies the column names (header=0).
df_review_raw, df_raw = (
    pd.read_csv(data_folder + csv_name, header=0)
    for csv_name in ('reviews_feb_2023.csv', 'df_arabica_clean.csv')
)

In [174]:
# Show the column names of the table loaded from 'df_arabica_clean.csv'
df_raw.columns

Index(['Unnamed: 0', 'ID', 'Country of Origin', 'Farm Name', 'Lot Number',
       'Mill', 'ICO Number', 'Company', 'Altitude', 'Region', 'Producer',
       'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Harvest Year',
       'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method',
       'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects',
       'Total Cup Points', 'Moisture Percentage', 'Category One Defects',
       'Quakers', 'Color', 'Category Two Defects', 'Expiration',
       'Certification Body', 'Certification Address', 'Certification Contact'],
      dtype='object')

In [191]:
# Work on a copy so that changes made to df leave df_raw as it was loaded
df = df_raw.copy()

In [192]:
# Define the mapping for grouping
def categorize_method(method):
    """Collapse a processing-method label into one of the main groups.

    Parameters
    ----------
    method : object
        A single value from the 'Processing Method' column.

    Returns
    -------
    object
        ``method`` unchanged when it equals 'Natural / Dry', 'Washed / Wet'
        or 'Pulped natural / honey'. Any other value, including a missing
        value such as NaN or None, yields the string 'Other'.
    """
    main_methods = ('Natural / Dry', 'Washed / Wet', 'Pulped natural / honey')
    return method if method in main_methods else 'Other'

# Relabel every row with its group (rare methods become 'Other'),
# then show how many rows fall into each group.
df['Processing Method'] = df['Processing Method'].map(categorize_method)
df['Processing Method'].value_counts()

Processing Method
Washed / Wet              124
Natural / Dry              46
Pulped natural / honey     25
Other                      12
Name: count, dtype: int64

Unnamed: 0,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance
0,Natural / Dry,0.242515,0.087391,0.144106,0.010125,0.365889,0.203964
1,Other,1.0,1.0,1.0,1.0,1.0,1.0
2,Pulped natural / honey,0.0,0.0,0.151855,0.0,0.0,0.0
3,Washed / Wet,0.14477,0.018387,0.0,0.012431,0.462523,0.196513


In [207]:
from sklearn.preprocessing import MinMaxScaler

dimensions = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']
value_column = 'Processing Method'

# Mean score of every dimension within each processing-method group
grouped_df = df.groupby(value_column)[dimensions].mean().reset_index()

# Min-max scale the group means. MinMaxScaler rescales every column
# independently, so a single fit_transform over all dimension columns yields
# the same values as fitting column by column, and it leaves `scaler` fitted
# on all dimensions rather than only on the last one.
# NOTE: within each dimension the highest group mean becomes 1 and the lowest
# becomes 0, so the scaled values only show how the groups rank relative to
# each other.
scaler = MinMaxScaler()
grouped_df[dimensions] = scaler.fit_transform(grouped_df[dimensions])

grouped_df

Unnamed: 0,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance
0,Natural / Dry,0.242515,0.087391,0.144106,0.010125,0.365889,0.203964
1,Other,1.0,1.0,1.0,1.0,1.0,1.0
2,Pulped natural / honey,0.0,0.0,0.151855,0.0,0.0,0.0
3,Washed / Wet,0.14477,0.018387,0.0,0.012431,0.462523,0.196513


In [208]:
import plotly.graph_objects as go

fig = go.Figure()

# One radar trace per group. Indexing by the group label first keeps every
# iterated row purely numeric, instead of the mixed label/number rows that a
# bare iterrows() over grouped_df would produce.
for group_name, scores in grouped_df.set_index(value_column)[dimensions].iterrows():
    radii = scores.tolist()
    fig.add_trace(go.Scatterpolar(
        # Repeat the first point so the outline is drawn as a closed polygon
        r=radii + radii[:1],
        theta=dimensions + dimensions[:1],
        fill='toself',
        name=group_name
    ))

fig.update_layout(
    title=f'Mean scaled scores by {value_column}',
    # grouped_df holds min-max scaled values, so every radius lies in [0, 1]
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
)
fig.show()
