## Load Data

In [1]:
from pandas import read_csv

# Results CSV, read from the `one-csv` branch. A branch URL can change over time;
# for a frozen snapshot, swap in the commit-pinned URL on the next line.
#request_url = "https://raw.githubusercontent.com/s2t2/openai-embeddings-2023/3214c695d462b1389f0d669ce865eef3a9963a55/results/reduced_classification/all_results.csv"
request_url = "https://raw.githubusercontent.com/s2t2/openai-embeddings-2023/one-csv/results/reduced_classification/all_results.csv"
df = read_csv(request_url)
# Rows with no reducer have a missing `reducer_type`. Label them "N/A" so later
# cells can filter with plain string comparisons (== "N/A" / != "N/A").
# The result is assigned back instead of calling fillna(inplace=True) on the
# `df["reducer_type"]` selection: that chained in-place form emits a warning in
# pandas 2.2 and leaves `df` unchanged when Copy-on-Write is enabled.
df["reducer_type"] = df["reducer_type"].fillna("N/A")
df.head()

Unnamed: 0,dataset,reducer_type,n_components,y_col,model_type,best_params,accuracy,f1_macro,f1_weighted,roc_auc_score
0,original,,1536,is_bot,LogisticRegression,{'classifier__max_iter': 25},0.945,0.943,0.945,0.979
1,original,,1536,is_bot,XGBClassifier,{},0.95,0.949,0.95,0.976
2,original,,1536,is_bot,RandomForestClassifier,"{'classifier__criterion': 'gini', 'classifier_...",0.942,0.941,0.942,0.975
3,pca_7,PCA,7,is_bot,RandomForestClassifier,"{'classifier__criterion': 'log_loss', 'classif...",0.939,0.938,0.94,0.974
4,pca_7,PCA,7,is_bot,LogisticRegression,{'classifier__max_iter': 1000},0.943,0.942,0.944,0.974


In [2]:
# Distinct classification targets, in order of first appearance in the results.
y_cols = df["y_col"].drop_duplicates().tolist()
print("Y COLS:", y_cols)

Y COLS: ['is_bot', 'opinion_community', 'fourway_label', 'is_toxic', 'is_bom_astroturf', 'is_factual', 'is_bom_overall']


In [3]:
# Number of result rows recorded for each classification target.
print(df["y_col"].value_counts())

is_bot               27
opinion_community    27
fourway_label        27
is_toxic             27
is_bom_astroturf     27
is_factual           27
is_bom_overall       27
Name: y_col, dtype: int64


In [4]:
# Number of result rows recorded for each dataset variant.
print(df["dataset"].value_counts())

original    21
pca_7       21
pca_3       21
pca_2       21
tsne_3      21
tsne_4      21
umap_3      21
tsne_2      21
umap_2      21
Name: dataset, dtype: int64


In [5]:
# Count of result rows with a missing ROC AUC score.
df["roc_auc_score"].isna().sum()

0

## Colors


In [6]:

# Small two-tone palettes.
BLUES = [
    "#82c8e2",
    "#a4d7ec",
]
YELLOWS = [
    "#fffec3",
    "#f5eea5",
]
REDS = [
    "#ff9288",
    "#ff625e",
]

# Nine-colour palettes taken from:
# https://www.heavy.ai/blog/12-color-palettes-for-telling-better-stories-with-your-data
RETRO_METRO = [
    "#ea5545", "#f46a9b", "#ef9b20",
    "#edbf33", "#ede15b", "#bdcf32",
    "#87bc45", "#27aeef", "#b33dc6",
]
RIVER_NIGHTS = [
    "#b30000", "#7c1158", "#4421af",
    "#1a53ff", "#0d88e6", "#00b7c7",
    "#5ad45a", "#8be04e", "#ebdc78",
]
SPRING_PASTELS = [
    "#fd7f6f", "#7eb0d5", "#b2e061",
    "#bd7ebe", "#ffb55a", "#ffee65",
    "#beb9db", "#fdcce5", "#8bd3c7",
]

# One fixed colour per model type, so a model looks the same in every chart.
PASTELS_MAP = dict(
    LogisticRegression=SPRING_PASTELS[5],      # pastel yellow
    RandomForestClassifier=SPRING_PASTELS[0],  # pastel coral
    XGBClassifier=SPRING_PASTELS[1],           # pastel blue
)
METRO_MAP = dict(
    LogisticRegression=RETRO_METRO[3],       # gold
    RandomForestClassifier=RETRO_METRO[8],   # purple
    XGBClassifier=SPRING_PASTELS[1],         # pastel blue (alternative: RETRO_METRO[7])
)
#COLORS_MAP = {"model_type": {
#    "LogisticRegression": SPRING_PASTELS[5], # RETRO_METRO[3],
#    "RandomForestClassifier": SPRING_PASTELS[3], #RETRO_METRO[8],
#    "XGBClassifier": SPRING_PASTELS[1] # RETRO_METRO[7]
#}}


## Analysis

### Best Results

What's the best result for each y / target col?

In [7]:
# Metric used to rank results in the cells below. The trailing `#@param`
# annotation must stay on the same line as the assignment.
metric_col = "roc_auc_score" #@param ["roc_auc_score", "accuracy", "f1_macro", "f1_weighted"]
print(metric_col)

roc_auc_score


#### ... for all datasets?

As it turns out, every best result comes from using the full 1536-dimensional embeddings (the `original` dataset) as features:

In [8]:
#df.groupby("y_col")[metric_col].max().sort_values(ascending=False)

In [9]:
# Index label of the highest-scoring row within each target column.
best_rows = df.groupby("y_col")[metric_col].idxmax()
# Pull those rows out as an independent frame.
best = df.loc[best_rows].copy()
# Compact summary, best score first.
(
    best
    .loc[:, ["dataset", "y_col", "model_type", metric_col]]
    .sort_values(by=metric_col, ascending=False)
)

Unnamed: 0,dataset,y_col,model_type,roc_auc_score
0,original,is_bot,LogisticRegression,0.979
12,original,opinion_community,RandomForestClassifier,0.964
14,original,fourway_label,XGBClassifier,0.958
25,original,is_toxic,XGBClassifier,0.941
60,original,is_bom_astroturf,LogisticRegression,0.879
66,original,is_factual,RandomForestClassifier,0.87
143,original,is_bom_overall,LogisticRegression,0.759


In [10]:
import plotly.express as px

# Sorted ascending by the chosen metric before plotting as horizontal bars.
chart_df = best.sort_values(by=metric_col, ascending=True)
px.bar(
    data_frame=chart_df,
    x=metric_col,
    y="y_col",
    orientation="h",
    text=metric_col,  # print the score on each bar
    hover_data=["model_type", "dataset", "best_params"],
    title="Best results for each classification target",
)

#### ... for just the reduced datasets?

In [11]:
# Keep only rows produced with a dimensionality reducer; rows whose
# reducer_type is the "N/A" placeholder are dropped.
reduced_df = df.loc[df["reducer_type"].ne("N/A")].copy()
reduced_df["dataset"].value_counts()

pca_7     21
pca_3     21
pca_2     21
tsne_3    21
tsne_4    21
umap_3    21
tsne_2    21
umap_2    21
Name: dataset, dtype: int64

In [12]:
#reduced_df.groupby("y_col")[metric_col].max().sort_values(ascending=False)

In [13]:
# Index label of the highest-scoring reduced-dataset row for each target column.
best_rows_reduced = reduced_df.groupby("y_col")[metric_col].idxmax()
best_reduced = reduced_df.loc[best_rows_reduced]
# Compact summary, best score first.
(
    best_reduced
    .loc[:, ["dataset", "y_col", "model_type", metric_col]]
    .sort_values(by=metric_col, ascending=False)
)

Unnamed: 0,dataset,y_col,model_type,roc_auc_score
3,pca_7,is_bot,RandomForestClassifier,0.974
17,pca_7,fourway_label,RandomForestClassifier,0.949
18,tsne_3,opinion_community,RandomForestClassifier,0.949
39,pca_7,is_toxic,RandomForestClassifier,0.913
69,pca_7,is_bom_astroturf,RandomForestClassifier,0.866
81,pca_7,is_factual,LogisticRegression,0.854
151,pca_7,is_bom_overall,RandomForestClassifier,0.743


In [28]:
import plotly.express as px

# Sorted ascending by the chosen metric before plotting as horizontal bars.
chart_df = best_reduced.sort_values(by=metric_col, ascending=True)
fig = px.bar(
    data_frame=chart_df,
    x=metric_col,
    y="y_col",
    orientation="h",
    text="dataset",  # label each bar with the reduced dataset that produced the best score
    hover_data=["model_type", "dataset", "best_params"],
    title="Best results for each classification target (reduced datasets)",
    # optional colouring: color="model_type", color_discrete_map=PASTELS_MAP
)
# Colouring by model_type was observed to disturb the bar order, so the
# category order is pinned to the bar totals explicitly.
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
fig.show()

### All Results

#### Bar Chart Maker

In [15]:
#y_col="is_bot"
#
#chart_df = df[df["y_col"] == y_col].copy()
#
#chart_df.groupby(["dataset", "model_type"])[metric_col].max()

In [16]:
#import plotly.express as px
#
#def chart_maker(y_col="is_bot", metric_col="roc_auc_score", fig_show=False, height=500, color_map=PASTELS_MAP):
#    chart_df = df[df["y_col"] == y_col].copy()
#    chart_df.reset_index(inplace=True)
#    # chart_df[["dataset", "model_type", "accuracy", "f1_weighted", "f1_macro", "roc_auc_score"]].head()
#    print(chart_df[["dataset", "model_type", metric_col]].head())
#
#    fig = px.bar(chart_df, y=metric_col, facet_col="dataset",
#        height=height, title=f"Classification Results (y_col='{y_col}')",
#        text=metric_col,
#        labels={"index": ""},
#        color="model_type", color_discrete_map=color_map
#    )
#    fig.for_each_annotation(lambda a: a.update(text=a.text.replace("dataset=", "")))
#    fig.update_xaxes(showticklabels=False)
#
#    if fig_show:
#        fig.show()
#
#    return fig
#

In [17]:
#for y_col in y_cols:
#    print("-------------------")
#    chart_maker(y_col=y_col, fig_show=True)

In [18]:
import plotly.express as px

def chart_maker(y_col: str = "is_bot", metric_col: str = "roc_auc_score", fig_show: bool = False,
                height: int = 500, color_map: dict = PASTELS_MAP):
    """Build a faceted bar chart of every result recorded for one target.

    Reads the notebook-level results frame ``df``, keeps the rows whose
    ``y_col`` value equals the requested target, and draws one facet per
    ``dataset`` with one bar per ``model_type``.

    Args:
        y_col: Target column name to chart (a value of ``df["y_col"]``).
        metric_col: Name of the score column plotted as bar height and label.
        fig_show: When true, call ``fig.show()`` before returning.
        height: Figure height passed to ``px.bar``.
        color_map: Mapping of model type to bar colour.

    Returns:
        The figure created by ``px.bar``.

    Raises:
        ValueError: If ``df`` holds no rows for ``y_col``.
    """
    chart_df = df[df["y_col"] == y_col].copy()
    if chart_df.empty:
        # Fail loudly on a mistyped target instead of producing a blank chart.
        known_targets = sorted(df["y_col"].unique())
        raise ValueError(f"No results found for y_col={y_col!r}. Expected one of: {known_targets}")

    fig = px.bar(chart_df, y=metric_col, facet_col="dataset", x="model_type",
        height=height, title=f"Classification Results (y_col='{y_col}')",
        text=metric_col,
        labels={"model_type": ""},  # blank out the repeated x-axis title
        color="model_type", color_discrete_map=color_map
    )
    # Facet headers read "dataset=<name>"; keep only the name.
    fig.for_each_annotation(lambda a: a.update(text=a.text.replace("dataset=", "")))

    # Hide the model-name tick labels on the x axes; the colour legend names them.
    fig.update_xaxes(showticklabels=False)

    if fig_show:
        fig.show()

    return fig


In [19]:
# Per-dataset, per-model bars for the `is_bot` target (default metric).
chart_maker(y_col="is_bot")

In [20]:
# Per-dataset, per-model bars for the `opinion_community` target (default metric).
chart_maker(y_col="opinion_community")

In [29]:
# Per-dataset, per-model bars for the `fourway_label` target (default metric).
chart_maker(y_col="fourway_label")

In [22]:
# Per-dataset, per-model bars for the `is_bom_astroturf` target (default metric).
chart_maker(y_col="is_bom_astroturf")

In [23]:
# Per-dataset, per-model bars for the `is_toxic` target (default metric).
chart_maker(y_col="is_toxic")

In [24]:
# Per-dataset, per-model bars for the `is_factual` target (default metric).
chart_maker(y_col="is_factual")

#### Dumbbell Chart Maker

To put the best scores at the top of the chart, we list the target columns in reverse (worst-to-best) order here, then reuse that list in the cells below when building the chart.

In [25]:
# Rank every result row from best to worst on the chosen metric.
df_sorted = df.sort_values(by=metric_col, ascending=False)
# Targets in order of their first (i.e. best) appearance, then flipped so the
# target with the lowest best score comes first.
y_cols_reversed = df_sorted["y_col"].drop_duplicates().tolist()[::-1]
y_cols_reversed

['is_bom_overall',
 'is_factual',
 'is_bom_astroturf',
 'is_toxic',
 'fourway_label',
 'opinion_community',
 'is_bot']

In [26]:
# `metric_col` is deliberately NOT reassigned here. It keeps the value chosen in
# the form cell under "Best Results", which is also the metric that ordered
# `y_cols_reversed`, so the plotted scores and the row order stay in agreement.
model_a = "LogisticRegression"
model_b = "RandomForestClassifier"
model_c = "XGBClassifier"


def _original_score(y_col, model_type):
    """Return the `metric_col` score of `model_type` on the un-reduced data for `y_col`.

    Un-reduced rows are the ones whose `reducer_type` is the "N/A" placeholder.
    Raises ValueError when no such row exists.
    """
    mask = (
        (df["reducer_type"] == "N/A")
        & (df["y_col"] == y_col)
        & (df["model_type"] == model_type)
    )
    scores = df.loc[mask, metric_col]
    if scores.empty:
        raise ValueError(f"No un-reduced result for y_col={y_col!r}, model_type={model_type!r}")
    return scores.iloc[0]


line_x, line_y, series_a, series_b, series_c = [], [], [], [], []

for y_col in y_cols_reversed:
    val_a = _original_score(y_col, model_a)
    val_b = _original_score(y_col, model_b)
    val_c = _original_score(y_col, model_c)

    series_a.append(val_a)
    series_b.append(val_b)
    series_c.append(val_c)
    # The trailing None separates one target's connector segment from the next,
    # so consecutive rows are not joined by a line.
    line_x.extend([val_a, val_b, val_c, None])
    line_y.extend([y_col, y_col, y_col, None])


print("-------------")
# four points per row:
print("LINE X:", line_x[0:8])
print("LINE Y:", line_y[0:8])
print("-------------")
# one point per row, labelled with the model each series belongs to:
print(f"{model_a}:", series_a[0:8])
print(f"{model_b}:", series_b[0:8])
print(f"{model_c}:", series_c[0:8])
print("-------------")
print(len(line_x), len(line_y), len(series_a), len(series_b), len(series_c))

-------------
LINE X: [0.759, 0.753, 0.706, None, 0.864, 0.87, 0.865, None]
LINE Y: ['is_bom_overall', 'is_bom_overall', 'is_bom_overall', None, 'is_factual', 'is_factual', 'is_factual', None]
-------------
YEAR A: [0.759, 0.864, 0.879, 0.925, 0.955, 0.951, 0.979]
YEAR B: [0.753, 0.87, 0.871, 0.913, 0.948, 0.964, 0.975]
YEAR C: [0.706, 0.865, 0.865, 0.941, 0.958, 0.963, 0.976]
-------------
28 28 7 7 7


In [27]:

import plotly.graph_objects as go


# Connector drawn through each target's three scores; line_x / line_y carry a
# None after every target so separate rows are not joined together.
# The colour is set on `line` because this trace draws lines only.
line_xy = go.Scatter(
    x=line_x, y=line_y,
    mode="lines", line=dict(color="grey"), showlegend=False,
)


def _score_markers(model_name, scores):
    """Return a marker trace with one point per target for a single model."""
    return go.Scatter(
        x=scores, y=y_cols_reversed, name=model_name,
        mode="markers", marker=dict(color=PASTELS_MAP[model_name], size=10),
    )


# One marker series per model, coloured consistently with the bar charts.
point_a = _score_markers(model_a, series_a)
point_b = _score_markers(model_b, series_b)
point_c = _score_markers(model_c, series_c)

fig = go.Figure(data=[line_xy, point_a, point_b, point_c])
title = "Classification Results (original embeddings)"
fig.update_layout(
    title=title, height=500, legend_itemclick=False,
    xaxis_title=metric_col,  # name the metric so the figure stands on its own
)
fig.show()
