In [None]:
# Render the shared course header at the top of the notebook.
import course

course.header()

## More advanced Pandas

In [None]:
import pandas as pd
# Use the fully-qualified option name: the bare "max_columns" shorthand is a
# pattern match and is ambiguous on pandas versions that also define
# "styler.render.max_columns".
pd.set_option("display.max_columns", 3000)

import numpy as np

In [None]:
# Load the cleaned arabica data set; the first CSV column becomes the index.
df = pd.read_csv("../data/arabica_data_cleaned.csv", index_col=0)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

# Dropping object columns

In [None]:
 # Show the dtype of the first few columns.
 # NOTE(review): this cell starts with a stray leading space; presumably
 # tolerated by IPython, but it is an IndentationError in a plain script.
 df.dtypes.head()

In [None]:
# Collect every object-dtype column except the three text columns that are
# still needed, then remove them from df in place.
col_to_drop = [
    col
    for col, d in zip(df.columns, df.dtypes)
    if d == 'object'
    and col not in ["Country.of.Origin", "Producer", "Processing.Method"]
]
df.drop(columns=col_to_drop, inplace=True)

# renaming the dots

In [None]:
# Replace the dots in the kept text-column names with spaces (in place).
# "Producer" has no dot, so its mapping is a no-op.
df.rename(
    columns=dict(
        zip(
            ["Country.of.Origin", "Producer", "Processing.Method"],
            ["Country of Origin", "Producer", "Processing Method"],
        )
    ),
    inplace=True,
)

In [None]:
# Count how often each processing method occurs.
df['Processing Method'].value_counts()

In [None]:
# Five random rows; no random_state is set, so the output differs between runs.
df.sample(5)

In [None]:
# Summary statistics over the whole frame.
df.describe()

How to describe each country of origin (column `Country of Origin`) separately?

In [None]:
# Same summary statistics, computed separately for every country of origin.
df.groupby("Country of Origin").describe()

# Aggregation and group operations!

In [None]:
# Group only the Body column by country; `grp` is a GroupBy object, no
# aggregation has been applied yet.
grp = df[['Country of Origin','Body']].groupby("Country of Origin")

In [None]:
# head() on a GroupBy returns the first rows of every group, not of the frame.
grp.head()

Groups behave like dataframes or series!

In [None]:
# Index label of the row with the largest Body value in each country.
grp.idxmax()

**Note:** the returned values are index labels of the original dataframe. Remember: unless stated explicitly, you will be working on a view!

In [None]:
# Get the content of a single group: all rows whose country is 'Vietnam'.
grp.get_group('Vietnam')

In [None]:
# Mean Body per country.
grp.mean()

Groupby can also be combined


In [None]:
# Mean of Body and Aroma per country of origin.
df[['Country of Origin', 'Body', 'Aroma']].groupby("Country of Origin").mean()

In [None]:
# Group by two keys at once: the key columns and the data columns are listed
# separately and concatenated for the column selection.
grp_cols = ['Country of Origin',"Processing Method"]
data_cols = ['Body','Aroma']
grp = df[grp_cols + data_cols].groupby(grp_cols)

In [None]:
# Mean Body and Aroma per (country, processing method) pair.
grp.mean()

## Iterating using groupby

In [None]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
grp_cols = ['Country of Origin',"Processing Method"]
data_cols = ['Body','Aroma']
for name, grp in df[grp_cols + data_cols].groupby(grp_cols):
    print(name)
    display(grp)
    # Only the first group is shown.
    break

How to set Body for the first entry to 10?

In [None]:
# Demonstration of the pitfall (intentionally NOT the right way): the
# assignment below goes through chained indexing, so it targets the temporary
# row returned by grp.iloc[0] instead of the underlying data.
grp_cols = ['Country of Origin',"Processing Method"]
data_cols = ['Body','Aroma']
for name, grp in df[grp_cols + data_cols].groupby(grp_cols):
    print(name)
    display(grp.iloc[0]["Body"])
    # Chained indexing: .iloc[0] first, then ["Body"] on its result.
    grp.iloc[0]["Body"] = 10
    break

In [None]:
# Re-display the first group to check whether the previous assignment
# had any effect.
grp_cols = ['Country of Origin',"Processing Method"]
data_cols = ['Body','Aroma']
for name, grp in df[grp_cols + data_cols].groupby(grp_cols):
    print(name)
    display(grp)
    break

How to fix it?

In [None]:
# The fix: write through the original frame with a single .loc call, using
# the group's index labels to address the row in df.
grp_cols = ['Country of Origin', "Processing Method"]
data_cols = ['Body', 'Aroma']
for name, grp in df[grp_cols + data_cols].groupby(grp_cols):
    print(name)
    df.loc[grp.index[0], 'Body'] = 10
    # `grp` was built from the column subset before the assignment and still
    # holds the old value, so show the same rows taken from df itself.
    display(df.loc[grp.index, grp_cols + data_cols])
    # Only the first group is modified and shown.
    break

# more generic groupby and aggregation

In [None]:
grps = df.groupby(['Country of Origin',"Processing Method"])
# Making it simpler: no data columns are selected here, i.e. all columns are kept.

In [None]:
# Mean of every numeric column per group. numeric_only=True is required
# because df still carries text columns, which recent pandas versions refuse
# to average instead of silently dropping them.
grps.mean(numeric_only=True)

In [None]:
# Largest Body value per group. Select the column first and then aggregate;
# the first positional argument of GroupBy.max is `numeric_only`, not an
# aggregation name.
grps['Body'].max()

# Split - Apply - Combine
A very frequent use of groupby.

In [None]:
def top_x(df, x=7, col="Body"):
    """Return the ``x`` rows of ``df`` with the largest values in ``col``.

    Parameters
    ----------
    df : DataFrame to rank (the whole frame or one group of a groupby).
    x : number of rows to keep.
    col : column to sort by, in descending order.
    """
    ranked = df.sort_values(by=col, ascending=False)
    return ranked.head(x)

# With the defaults: the 7 rows with the highest Body over the whole frame.
top_x(df)

In [None]:
# Run top_x on every (country, processing method) group and stack the results.
df.groupby(['Country of Origin',"Processing Method"]).apply(top_x)

In [None]:
# Extra positional and keyword arguments given to apply() are forwarded to
# the applied function; here x=2 keeps the top two rows per group.
df.groupby(['Country of Origin', "Processing Method"]).apply(top_x, x=2)

# Binning values
Pandas cut or pandas qcut!
* cut : cut an array into equal-width bins
* qcut : cut an array into equally populated bins

In [None]:
pd.cut(df.Aroma, 5) 
# Note: attribute ("dot") access such as df.Aroma, instead of df['Aroma'],
# only works for column names without spaces.

In [None]:
# Same column split into 5 quantile-based bins.
pd.qcut(df.Aroma, 5)

In [None]:
# Store the quantile bin of each row as a new column.
df['Aroma_bin'] = pd.qcut(df.Aroma, 5)

In [None]:
# Same quantile binning, but each of the 5 bins gets a readable label
# instead of its interval.
df['Aroma_cat'] = pd.qcut(df.Aroma, 5, labels=['--', '-', '', '+', '++'])

In [None]:
# Check the two new Aroma_bin / Aroma_cat columns.
df.head()

In [None]:
# The binned column can be used as a grouping key like any other column:
# top two rows per (country, processing method, aroma category).
df.groupby(
    ['Country of Origin',"Processing Method", "Aroma_cat"]
).apply(top_x, x=2).head(10)

# Correlations based on Country of Origin


In [None]:
def correlate(grp, col_1="Aroma", col_2="Aroma"):
    """Return the correlation between two columns of ``grp``.

    Uses ``Series.corr`` with its default method. With the default arguments
    both columns are "Aroma", i.e. the column is correlated with itself.
    """
    left = grp[col_1]
    right = grp[col_2]
    return left.corr(right)

# Aroma/Body correlation per country, sorted from lowest to highest.
df.groupby(['Country of Origin']).apply(
    correlate, col_1="Aroma", col_2="Body"
).sort_values()

In [None]:
def correlate(grp, col_1="Aroma", col_2="Aroma"):
    """Return ``(correlation, n)`` for two columns of ``grp``.

    ``n`` is the number of rows (observations) in the group.
    """
    coefficient = grp[col_1].corr(grp[col_2])
    n_obs = grp.shape[0]  # how many observations
    return coefficient, n_obs

# Each group now yields a (correlation, n) tuple, so the result is a Series
# of tuples and sort_values() orders those tuples.
df.groupby(['Country of Origin']).apply(
    correlate, col_1="Aroma", col_2="Body"
).sort_values()

In [None]:
def correlate(grp, col_1="Aroma", col_2="Aroma"):
    """Return the correlation and group size as a labelled Series.

    The Series has the entries 'pearson-corr' (correlation of ``col_1`` and
    ``col_2``) and 'n' (number of rows in ``grp``). Returning a Series lets
    groupby().apply() expand the result into one column per entry.
    """
    return pd.Series(
        {
            'pearson-corr': grp[col_1].corr(grp[col_2]),
            'n': grp.shape[0],
        }
    )

# One row per country with the columns 'pearson-corr' and 'n',
# ordered by the correlation.
df.groupby('Country of Origin').apply(
    correlate, col_1="Aroma", col_2="Body"
).sort_values('pearson-corr')

# Pivot tables


In [None]:
# (rows, columns) of the frame before pivoting.
df.shape

In [None]:
# Mean of every numeric column per (country, producer, processing method).
# `values` is restricted to numeric columns because df also holds the
# categorical Aroma_bin / Aroma_cat columns, which cannot be averaged and are
# no longer dropped silently by recent pandas versions. The aggregation is
# given by name ("mean"), the form pandas recommends over passing np.mean.
df.pivot_table(
    index=[
        "Country of Origin",
        "Producer",
        "Processing Method",
    ],
    values=df.select_dtypes("number").columns.tolist(),
    aggfunc="mean",
)

In [None]:
# Same pivot with "Producer" removed from the index. Only numeric columns are
# aggregated, so Producer and the categorical Aroma_bin / Aroma_cat columns
# are left out explicitly instead of relying on pandas to drop them.
df.pivot_table(
    index=[
        "Processing Method",
        "Country of Origin",
    ],
    values=df.select_dtypes("number").columns.tolist(),
    aggfunc="mean",
)

In [None]:
# A dict aggfunc picks the columns and one aggregation per column:
# mean Aroma and maximum Body per (processing method, country).
# Aggregations are given by name, the form pandas recommends over passing
# the numpy functions.
df.pivot_table(
    index=[
        "Processing Method",
        "Country of Origin",
    ],
    aggfunc={
        'Aroma': "mean",
        'Body': "max",
    },
)

In [None]:
# A list aggfunc applies every aggregation to every value column; the result
# has a two-level column index (aggregation name, column name).
# Only numeric columns are aggregated: the text and categorical columns
# cannot be averaged and are no longer dropped silently by recent pandas.
df.pivot_table(
    index=[
        "Processing Method",
        "Country of Origin",
    ],
    values=df.select_dtypes("number").columns.tolist(),
    aggfunc=["median", "mean"],
)

In [None]:
# Helper for slicing multi-level indexes inside .loc.
idx = pd.IndexSlice

In [None]:
# Pivot with four aggregations, restricted to numeric columns (text and
# categorical columns cannot be aggregated this way). Passing the
# aggregations by name also fixes the first column level to exactly
# 'median', 'mean', 'min' and 'max'.
_df = df.pivot_table(
    index=[
        "Processing Method",
        "Country of Origin",
    ],
    values=df.select_dtypes("number").columns.tolist(),
    aggfunc=["median", "mean", "min", "max"],
)
# Rows: any processing method, country 'Brazil'.
# Columns: any aggregation, only Acidity and Body.
_df.loc[
    idx[:, 'Brazil'],
    idx[:, ['Acidity', 'Body']],
]

In [None]:
# Rows: processing method 'Natural / Dry', any country.
# Columns: the 'median' aggregation of Acidity and Body.
_df.loc[idx['Natural / Dry', :], idx['median', ['Acidity', 'Body']]].head(5)

## Backup


In [None]:
# The single row with the highest Total.Cup.Points.
df.sort_values("Total.Cup.Points", ascending=False).head(1)

In [None]:
# Distinct countries. The column was renamed from "Country.of.Origin" to
# "Country of Origin" earlier in the notebook, so the new name must be used.
df['Country of Origin'].unique()

In [None]:
# Summary statistics of the altitude_mean_meters column.
df['altitude_mean_meters'].describe()

In [None]:
# The ten rows with the largest altitude_mean_meters values.
df.sort_values('altitude_mean_meters', ascending=False).head(10)

# Visualizing distribution of altitude

In [None]:
# One altitude histogram trace per country of origin.
# The grouping column is "Country of Origin": it was renamed from
# "Country.of.Origin" earlier in the notebook.
# NOTE(review): `go` is not imported anywhere in the visible notebook;
# presumably `import plotly.graph_objects as go` is missing from the import
# cell. Confirm and add it there.
data = [
    go.Histogram(x=grp['altitude_mean_meters'], name=name)
    for name, grp in df.groupby("Country of Origin")
]
fig = go.Figure(data=data)
fig.show()

# Reviews

In [None]:
# List the available column names.
df.columns

In [None]:
# Review columns used for the radar plot.
# 'Balance', 'Uniformity', 'Clean.Cup' and 'Sweetness' are currently left out.
rcols = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Cupper.Points']

In [None]:
# One polar trace per country showing the best (maximum) score it reached in
# each review column.
# The grouping column is "Country of Origin": it was renamed from
# "Country.of.Origin" earlier in the notebook.
# NOTE(review): `go` is not imported anywhere in the visible notebook;
# presumably `import plotly.graph_objects as go` is missing from the import
# cell. Confirm and add it there.
data = []
for name, grp in df.groupby("Country of Origin"):
    # Column-wise maximum of the review columns for this country.
    s1 = grp[rcols].max()
    data.append(
        go.Scatterpolar(
            r=s1,
            theta=rcols,
            name=name,
        )
    )
fig = go.Figure(data=data)
fig.show()