# Setup

## Installing Packages

In [1]:
%%capture
# Install UMAP together with its optional "plot" extras; %%capture hides the pip log.
!pip install umap-learn[plot]


In [2]:
# %%capture
# !pip install -U kaleido

In [3]:
# https://www.pauldesalvo.com/how-to-download-plotly-express-charts-as-images-in-google-colab/
%%capture
!pip install kaleido
!pip install plotly>=4.0.0
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4

## Imports

In [4]:
import os

from google.colab import drive
from google.cloud import bigquery

from pandas import DataFrame, crosstab

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler # OneHotEncoder 
from sklearn.pipeline import Pipeline 
from sklearn.manifold import TSNE
from umap import UMAP

import plotly.express as px

## Mounting the Drive

In [5]:
# Mount Google Drive at /content/drive so the shared research folder is reachable from the notebook.
drive.mount('/content/drive')
# Sanity check: list the current working directory (the mounted "drive" entry should appear).
print(os.listdir(os.getcwd())) 

Mounted at /content/drive
['.config', '=4.0.0', 'drive', 'sample_data']


In [6]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
print(DIRPATH)
# Last expression of the cell: displays whether the shared directory actually exists.
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/Disinfo Research Shared 2022


True

## Configuring Credentials 


In [7]:
# google.cloud looks for a service-account key at the path stored in the
# GOOGLE_APPLICATION_CREDENTIALS environment variable. Point that variable at
# the shared credentials JSON on the shared google drive, then print the path
# and whether the file exists so a bad path is visible right here.

GOOGLE_APPLICATION_CREDENTIALS = os.path.join(
    DIRPATH,
    "credentials",
    "tweet-research-shared-268bbccc0aac.json",
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS  # read implicitly by google.cloud

print(GOOGLE_APPLICATION_CREDENTIALS)  # the path that will be used
print(os.path.isfile(GOOGLE_APPLICATION_CREDENTIALS))  # True when the key file is present

/content/drive/MyDrive/Research/Disinfo Research Shared 2022/credentials/tweet-research-shared-268bbccc0aac.json
True


## BigQuery Service

In [8]:
class BigQueryService():
    """Small convenience wrapper around a BigQuery client.

    The client is created without explicit arguments, so it relies on the
    ambient credentials (this notebook sets GOOGLE_APPLICATION_CREDENTIALS
    before instantiating the service).
    """

    def __init__(self):
        self.client = bigquery.Client()

    def execute_query(self, sql, verbose=True):
        """Submit `sql` through the client and return the job's result.

        Params:
            sql: the query text.
            verbose: when truthy, the query text is printed before it is submitted.

        Returns whatever `job.result()` returns (iterated as rows by `query_to_df`).
        """
        if verbose:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    def query_to_df(self, sql, verbose=True):
        """High-level wrapper: run `sql` and return the rows as a DataFrame.

        Each result row is converted with `dict(row)`, so the frame gets one
        column per row key. An empty result produces an empty DataFrame.
        """
        results = self.execute_query(sql, verbose=verbose)
        # Iterate the result directly; no intermediate list copy is needed.
        records = [dict(row) for row in results]
        return DataFrame(records)


In [9]:
bq_service = BigQueryService()
print(bq_service)

<__main__.BigQueryService object at 0x7f6d5bca0df0>


## Helpers

In [10]:
# Output locations inside the shared directory: CSV exports go under DATA_DIRPATH,
# chart images / HTML exports under FIGURES_DIRPATH.
DATA_DIRPATH = os.path.join(DIRPATH, "data", "profile_tag_mapping")
FIGURES_DIRPATH = os.path.join(DIRPATH, "figures", "profile_tag_mapping")

In [11]:
def component_names(n_components):
    """Return the column labels for an embedding with `n_components` dimensions.

    Labels follow the alphabet: "component_a", "component_b", and so on.
    Only 26 letters are available, so asking for more raises IndexError.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    return [f"component_{alphabet[position]}" for position in range(n_components)]

# quick inline checks of the naming scheme
assert component_names(1) == ["component_a"]
assert component_names(2) == ["component_a", "component_b"]

# Dashboard

In [12]:
# How many of the most widely used profile tags to analyse. The trailing #@param
# annotation exposes this value as a slider in the Colab form view.
TOP_TAGS_LIMIT = 50 #@param {type:"slider", min:5, max:250, step:5}

## Fetching Data

In [13]:
# there are 1,288,844 total rows (per user per tag), so let's only take the top tags instead
# sql = f"""
#     SELECT DISTINCT user_id, tag
#     FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_flat`
#     ORDER BY user_id
#     -- LIMIT 10
# """


In [14]:
# Fetch the distinct (user_id, tag) pairs, restricted to the TOP_TAGS_LIMIT tags
# that appear for the largest number of distinct users.
# int(...) keeps the interpolated LIMIT value numeric.
sql = f"""
    WITH top_tags as (
        SELECT tag, count(DISTINCT user_id) as user_count
        FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_flat`
        GROUP BY tag
        ORDER BY user_count DESC
        LIMIT {int(TOP_TAGS_LIMIT)}
    )

    SELECT DISTINCT pt.user_id, pt.tag
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_flat` pt
    JOIN top_tags on top_tags.tag = pt.tag
    ORDER BY user_id
    -- LIMIT 10
"""

# verbose=False: do not echo the SQL text into the cell output
df = bq_service.query_to_df(sql, verbose=False)
df.head()

Unnamed: 0,user_id,tag
0,409,#IMPEACH
1,409,#RESIST
2,1153,#RESIST
3,1186,#1
4,3301,#RESIST


In [15]:
# Size of the fetched table: total rows, then distinct users and distinct tags.
print("ROWS:", df.shape[0])
print("USERS:", df["user_id"].drop_duplicates().shape[0])
print("TAGS:", df["tag"].drop_duplicates().shape[0])

ROWS: 360424
USERS: 183516
TAGS: 50


For top 250 tags, we see 506,119 rows (per user per tag) with 236,988 unique users and 250 unique tags.


For top 100 tags, we see 419,540 rows (per user per tag) with 204,423 unique users and 100 unique tags.

For top 75 tags, we see 394,703 rows (per user per tag) with 196,717 unique users and 75 unique tags.

For top 50 tags, we see 360,424 rows (per user per tag) with 183,516 unique users and 50 unique tags.


For top 25 tags, we see 297,095 rows (per user per tag) with 165,458 unique users and 25 unique tags.





## Top Tags

In [16]:
#from plotly.express import bar

# Number of tags shown in the "top tags" bar chart.
CHART_TAGS_LIMIT = 15

# Rows per tag, largest first, truncated to the first CHART_TAGS_LIMIT tags.
# (df holds distinct user/tag pairs, so the row count per tag is a user count.)
chart_data = df.groupby(["tag"])["user_id"].count().sort_values(ascending=False).head(CHART_TAGS_LIMIT)
#chart_data.rename(columns={'count':'user_count'})
#chart_data

In [17]:

# Count rows per tag and keep the CHART_TAGS_LIMIT largest. The series is sorted
# ascending so the biggest bar ends up at the top of the horizontal chart.
chart_data = (
    df.groupby(["tag"])["user_id"]
    .count()
    .sort_values(ascending=True)
    .tail(CHART_TAGS_LIMIT)
)

fig = px.bar(
    x=chart_data.values,
    y=chart_data.index,
    orientation="h",
    title="Top Hashtags found in User Profiles (Impeachment 2020 Dataset)",
    labels={"x":"Count of Distinct User", "y": "Hashtag"},
    color_discrete_sequence=["#7F7F7F"],  # neutral grey bars
)

fig.show()

In [18]:
# Per-tag counts for every fetched tag, most common first.
top_tags = (
    df.groupby(["tag"])["user_id"]
    .count()
    .sort_values(ascending=False)
)
# Two-column frame (tag, user_count) with a fresh 0..n-1 index.
tags_df = top_tags.rename("user_count").reset_index()
tags_df.to_csv(os.path.join(DATA_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}.csv"))
tags_df.head(10)

Unnamed: 0,tag,user_count
0,#MAGA,63744
1,#RESIST,35313
2,#KAG,27820
3,#TRUMP2020,20984
4,#WWG1WGA,14999
5,#2A,13681
6,#THERESISTANCE,11647
7,#FBR,10574
8,#RESISTANCE,10324
9,#QANON,7355


In [19]:
#tags_df[tags_df["tag"] == "#WWG1WGA"]["user_count"].iloc[0]

In [20]:
#dict(top_tags)

## One Hot Encoding

In [21]:
df.head()

Unnamed: 0,user_id,tag
0,409,#IMPEACH
1,409,#RESIST
2,1153,#RESIST
3,1186,#1
4,3301,#RESIST


In [22]:
# from pandas import get_dummies as one_hot_encoder
#
# one_hot_encoder(df["tag"]) #> OK
# one_hot_encoder(df["user_id"]) #> CRASH!!! NO MEMORY :-/

In [23]:
%%time

from pandas import crosstab

df_onehot = crosstab(df["tag"], df["user_id"])
df_onehot

CPU times: user 28.9 s, sys: 2.2 s, total: 31.1 s
Wall time: 29.1 s


user_id,409,1153,1186,3301,4822,4936,5908,6504,7578,10345,...,1242241859038121984,1242262008466010112,1242268109962018818,1242277072526929920,1242278762583953408,1242299587911512064,1242304023799009280,1242479770996068359,1242485644229849090,1242493368955015169
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#1A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#2A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#AMERICAFIRST,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#BACKTHEBLUE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#BERNIE2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#BLACKLIVESMATTER,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#BLM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#BLUEWAVE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#BLUEWAVE2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Peek at both axes of the matrix: the first five user-id columns and the first five tag rows.
print(df_onehot.columns[:5].tolist())
print(df_onehot.index[:5])

[409, 1153, 1186, 3301, 4822]
Index(['#1', '#1A', '#2A', '#AMERICAFIRST', '#BACKTHEBLUE'], dtype='object', name='tag')


In [25]:
# Persist the tag x user matrix as CSV, with TOP_TAGS_LIMIT in the file name.
# NOTE(review): this writes under DIRPATH/data rather than DATA_DIRPATH, unlike the
# other CSV exports in this notebook — confirm that location is intended.
csv_filepath = os.path.join(DIRPATH, "data", f"tags_users_onehot_{TOP_TAGS_LIMIT}.csv")
df_onehot.to_csv(csv_filepath)

## PCA Pipeline

In [29]:
#def plot_embeddings(embed_df, n_components, df_onehot=df_onehot):
#    chart_df = embed_df.copy()
#    chart_df["tag"] = df_onehot.index
#    chart_df = chart_df.merge(tags_df, left_on="tag", right_on="tag")
#
#    title = f"Semantic Map of Top {TOP_TAGS_LIMIT} Hashtags in User Profiles (Impeachment 2020 Dataset)"
#
#    if n_components == 1:
#        chart_df["color"] = chart_df["component_a"]
#        
#        fig = px.scatter(chart_df, title=title, text="tag", size="user_count",
#            x="component_a",            
#            #labels={"component_a":""},
#            #color="color", color_continuous_scale=px.colors.colorbrewer.RdBu
#        )
#    elif n_components == 2:
#        chart_df["color"] = chart_df["component_a"] * chart_df["component_b"]
#
#        fig = px.scatter(chart_df, title=title, text="tag", size="user_count",
#            x="component_a", y="component_b",        
#            #labels={"component_a":""},
#            color="color", color_continuous_scale=px.colors.colorbrewer.RdBu
#        )
#    elif n_components == 3:
#        chart_df["color"] = chart_df["component_a"] * chart_df["component_b"] * chart_df["component_c"]
#
#        # https://plotly.com/python-api-reference/generated/plotly.express.scatter_3d.html
#        fig = px.scatter_3d(chart_df, title=title, text="tag", #size="user_count",
#            x="component_a", y="component_b", z="component_c", 
#            #labels={"component_a":"", "component_b":"", "component_a":""},
#            color="color", color_continuous_scale=px.colors.colorbrewer.RdBu_r
#        )
#        
#    return fig

In [30]:


def pca_pipeline(df_onehot=df_onehot, n_components=2):
    """Standardize the tag matrix, reduce it with PCA, then export and plot the embedding.

    Params:
        df_onehot: matrix with one row per tag (tags in the index).
            NOTE: the default is the global `df_onehot` as it existed when this
            cell was executed; re-run this cell after rebuilding `df_onehot`.
        n_components: number of principal components to keep. Must be 1, 2 or 3,
            the only dimensionalities this function knows how to chart.

    Side effects:
        Prints the pipeline, embedding shape, explained variance ratio and
        singular values; writes the embedding as CSV under DATA_DIRPATH; shows
        the scatter chart and writes it as PNG under FIGURES_DIRPATH.

    Raises:
        ValueError: if `n_components` is not 1, 2 or 3.
    """
    # Fail fast. Without this check an unsupported value is only noticed after
    # the fit and the CSV export, as an unbound `fig` at plotting time.
    if n_components not in (1, 2, 3):
        raise ValueError(f"n_components must be 1, 2 or 3 (got {n_components!r})")

    column_names = component_names(n_components)

    pipeline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("reducer", PCA(n_components=n_components, random_state=99))
    ])
    print(pipeline)

    embeddings = pipeline.fit_transform(df_onehot)
    print("EMBEDDINGS:", type(embeddings), embeddings.shape)
    pca = pipeline.named_steps["reducer"]
    print("EXPLAINED VARIANCE RATIO:", pca.explained_variance_ratio_)
    print("SINGULAR VALS:", pca.singular_values_)

    # One row per tag, in the same order as the rows of df_onehot.
    df_embed = DataFrame(embeddings, columns=column_names)
    df_embed["tag"] = df_onehot.index
    print(len(df_embed))
    print(df_embed.head())

    os.makedirs(DATA_DIRPATH, exist_ok=True)  # make sure the export directory exists
    csv_filepath = os.path.join(DATA_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_pca_{n_components}.csv")
    df_embed.to_csv(csv_filepath)

    title = f"PCA Dimension Reduction of Top {TOP_TAGS_LIMIT} Tags in User Profiles"
    if n_components == 3:
        # https://plotly.com/python-api-reference/generated/plotly.express.scatter_3d.html
        fig = px.scatter_3d(df_embed, title=title, text="tag",
            x="component_a", y="component_b", z="component_c",
        )
    else:
        # 1-D and 2-D share the same scatter call; the y axis is only set for 2-D.
        chart_options = dict(x="component_a", text="tag", title=title)
        if n_components == 2:
            chart_options["y"] = "component_b"
        fig = px.scatter(df_embed, **chart_options)

    fig.show()
    os.makedirs(FIGURES_DIRPATH, exist_ok=True)  # make sure the figures directory exists
    image_filepath = os.path.join(FIGURES_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_pca_{n_components}.png")
    fig.write_image(image_filepath)


In [31]:
#pca_pipeline(n_components=1)

In [32]:
pca_pipeline(n_components=2)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reducer', PCA(n_components=2, random_state=99))])
EMBEDDINGS: <class 'numpy.ndarray'> (50, 2)
EXPLAINED VARIANCE RATIO: [0.19793495 0.14108989]
SINGULAR VALS: [1347.66891397 1137.81045008]
50
   component_a  component_b            tag
0   -48.859166   -41.474028             #1
1   -19.207557   -36.745420            #1A
2    52.820406   -34.420723            #2A
3   -15.027216   -34.990186  #AMERICAFIRST
4   -29.868907   -35.936543   #BACKTHEBLUE


In [33]:
pca_pipeline(n_components=2)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reducer', PCA(n_components=2, random_state=99))])
EMBEDDINGS: <class 'numpy.ndarray'> (50, 2)
EXPLAINED VARIANCE RATIO: [0.19793495 0.14108989]
SINGULAR VALS: [1347.66891397 1137.81045008]
50
   component_a  component_b            tag
0   -48.859166   -41.474028             #1
1   -19.207557   -36.745420            #1A
2    52.820406   -34.420723            #2A
3   -15.027216   -34.990186  #AMERICAFIRST
4   -29.868907   -35.936543   #BACKTHEBLUE


In [34]:
pca_pipeline(n_components=3)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reducer', PCA(n_components=3, random_state=99))])
EMBEDDINGS: <class 'numpy.ndarray'> (50, 3)
EXPLAINED VARIANCE RATIO: [0.19793495 0.14108989 0.05429723]
SINGULAR VALS: [1347.66891397 1137.81045008  705.84736694]
50
   component_a  component_b  component_c            tag
0   -48.859196   -41.473923   -23.065098             #1
1   -19.207557   -36.745421     4.175816            #1A
2    52.820406   -34.420722    41.376223            #2A
3   -15.027215   -34.990189    -1.249942  #AMERICAFIRST
4   -29.868906   -35.936545    -6.041021   #BACKTHEBLUE


## T-SNE Pipeline

In [35]:


def tsne_pipeline(df_onehot=df_onehot, n_components=2):
    """Standardize the tag matrix, embed it with t-SNE, then export and plot the embedding.

    Params:
        df_onehot: matrix with one row per tag (tags in the index).
            NOTE: the default is the global `df_onehot` as it existed when this
            cell was executed; re-run this cell after rebuilding `df_onehot`.
        n_components: dimensionality of the embedding. Must be 1, 2 or 3, the
            only dimensionalities this function knows how to chart.

    Side effects:
        Prints the pipeline, embedding shape and K-L divergence; writes the
        embedding as CSV under DATA_DIRPATH; shows the scatter chart and writes
        it as PNG under FIGURES_DIRPATH.

    Raises:
        ValueError: if `n_components` is not 1, 2 or 3.
    """
    # Fail fast. Without this check an unsupported value is only noticed after
    # the fit and the CSV export, as an unbound `fig` at plotting time.
    if n_components not in (1, 2, 3):
        raise ValueError(f"n_components must be 1, 2 or 3 (got {n_components!r})")

    column_names = component_names(n_components)

    # t-SNE needs perplexity < number of samples (one sample per tag here).
    # Keep the library default of 30 whenever it is valid, and clamp it when
    # TOP_TAGS_LIMIT yields 30 tags or fewer.
    n_samples = len(df_onehot)
    perplexity = min(30.0, max(1.0, n_samples - 1.0))

    pipeline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("reducer", TSNE(n_components=n_components, random_state=99, perplexity=perplexity))
    ])
    print(pipeline)
    tsne = pipeline.named_steps["reducer"]

    embeddings = pipeline.fit_transform(df_onehot)
    print("EMBEDDINGS:", type(embeddings), embeddings.shape)

    print("K-L DIVERGENCE:", tsne.kl_divergence_)

    # One row per tag, in the same order as the rows of df_onehot.
    df_embed = DataFrame(embeddings, columns=column_names)
    df_embed["tag"] = df_onehot.index
    print(len(df_embed))
    print(df_embed.head())

    os.makedirs(DATA_DIRPATH, exist_ok=True)  # make sure the export directory exists
    csv_filepath = os.path.join(DATA_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_tsne_{n_components}.csv")
    df_embed.to_csv(csv_filepath)

    title = f"T-SNE Dimension Reduction of Top {TOP_TAGS_LIMIT} Tags in User Profiles"
    if n_components == 3:
        # https://plotly.com/python-api-reference/generated/plotly.express.scatter_3d.html
        fig = px.scatter_3d(df_embed, title=title, text="tag",
            x="component_a", y="component_b", z="component_c",
        )
    else:
        # 1-D and 2-D share the same scatter call; the y axis is only set for 2-D.
        chart_options = dict(x="component_a", text="tag", title=title)
        if n_components == 2:
            chart_options["y"] = "component_b"
        fig = px.scatter(df_embed, **chart_options)

    fig.show()
    os.makedirs(FIGURES_DIRPATH, exist_ok=True)  # make sure the figures directory exists
    image_filepath = os.path.join(FIGURES_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_tsne_{n_components}.png")
    fig.write_image(image_filepath)



In [36]:
tsne_pipeline(n_components=1)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reducer', TSNE(n_components=1, random_state=99))])



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



EMBEDDINGS: <class 'numpy.ndarray'> (50, 1)
K-L DIVERGENCE: 6.940203666687012
50
   component_a            tag
0  2879.544922             #1
1 -3368.776611            #1A
2  3121.458984            #2A
3 -3718.508545  #AMERICAFIRST
4 -3291.204346   #BACKTHEBLUE


In [37]:
tsne_pipeline(n_components=2)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reducer', TSNE(random_state=99))])



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



EMBEDDINGS: <class 'numpy.ndarray'> (50, 2)
K-L DIVERGENCE: 0.7424312829971313
50
   component_a  component_b            tag
0   -54.680664  -270.438232             #1
1   -47.781769    40.930222            #1A
2   -26.017017   176.119370            #2A
3   130.821182   -58.517063  #AMERICAFIRST
4    51.133808   -35.137714   #BACKTHEBLUE


In [38]:
tsne_pipeline(n_components=3)

Pipeline(steps=[('scaler', StandardScaler()),
                ('reducer', TSNE(n_components=3, random_state=99))])



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



EMBEDDINGS: <class 'numpy.ndarray'> (50, 3)
K-L DIVERGENCE: 1.1587451696395874
50
   component_a  component_b  component_c            tag
0  -176.686371   -89.843567   -39.558567             #1
1    -4.534178  -110.564598   -16.706930            #1A
2   176.916321   -30.907244   -69.342514            #2A
3   157.230850   133.446976   227.507812  #AMERICAFIRST
4    55.426788    50.318306   -79.567726   #BACKTHEBLUE


## UMAP

In [39]:

def umapper(df_onehot=df_onehot, n_components=2):
    """Embed the tag matrix with UMAP, then export and plot the embedding.

    Params:
        df_onehot: matrix with one row per tag (tags in the index).
            NOTE: the default is the global `df_onehot` as it existed when this
            cell was executed; re-run this cell after rebuilding `df_onehot`.
        n_components: dimensionality of the embedding. Must be 1, 2 or 3, the
            only dimensionalities this function knows how to chart.

    Reads the globals `tags_df` (per-tag user counts, used for marker size) and
    `TOP_TAGS_LIMIT` (file names, title, and the colour special case for 25 tags).

    Side effects:
        Writes the embedding as CSV under DATA_DIRPATH; shows the scatter chart
        and writes it as PNG under FIGURES_DIRPATH, plus an HTML export for the
        2-D and 3-D charts.

    Raises:
        ValueError: if `n_components` is not 1, 2 or 3.
    """
    # Fail fast. Without this check an unsupported value is only noticed after
    # the fit and the CSV export, as an unbound `fig` at plotting time.
    if n_components not in (1, 2, 3):
        raise ValueError(f"n_components must be 1, 2 or 3 (got {n_components!r})")

    reducer = UMAP(n_components=n_components, random_state=99)
    embedding = reducer.fit_transform(df_onehot)

    # One row per tag, in the same order as the rows of df_onehot.
    embed_df = DataFrame(embedding, columns=component_names(n_components))
    embed_df["tag"] = df_onehot.index

    os.makedirs(DATA_DIRPATH, exist_ok=True)  # make sure the export directory exists
    csv_filepath = os.path.join(DATA_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_umap_{n_components}.csv")
    embed_df.to_csv(csv_filepath)

    #
    # PLOTTING
    #

    # Attach user_count to each tag. This is an inner merge (which returns a new
    # frame), so a tag missing from tags_df would be dropped from the chart.
    chart_df = embed_df.merge(tags_df, on="tag")

    title = f"Semantic Map of Top {TOP_TAGS_LIMIT} Hashtags in User Profiles (Impeachment 2020 Dataset)"
    if n_components == 1:
        fig = px.scatter(chart_df, title=title, text="tag", size="user_count",
            x="component_a",
            labels={"component_a":""},
        )
    elif n_components == 2:
        # Colour by the product of the coordinates, except for the 25-tag run,
        # which colours by the first component only.
        if TOP_TAGS_LIMIT == 25:
            chart_df["color"] = chart_df["component_a"]
        else:
            chart_df["color"] = chart_df["component_a"] * chart_df["component_b"]

        fig = px.scatter(chart_df, title=title, text="tag", size="user_count",
            x="component_a", y="component_b",
            labels={"component_a":"", "component_b":""},
            color="color", color_continuous_scale=px.colors.colorbrewer.RdBu
        )
    else:
        # Same idea in 3-D; the 25-tag run uses two components and the
        # non-reversed colour scale.
        if TOP_TAGS_LIMIT == 25:
            chart_df["color"] = chart_df["component_a"] * chart_df["component_b"]
            scale = px.colors.colorbrewer.RdBu
        else:
            chart_df["color"] = chart_df["component_a"] * chart_df["component_b"] * chart_df["component_c"]
            scale = px.colors.colorbrewer.RdBu_r

        # https://plotly.com/python-api-reference/generated/plotly.express.scatter_3d.html
        fig = px.scatter_3d(chart_df, title=title, text="tag",
            x="component_a", y="component_b", z="component_c",
            labels={"component_a":"", "component_b":"", "component_c":""},
            color="color", color_continuous_scale=scale
        )

    fig.show()
    os.makedirs(FIGURES_DIRPATH, exist_ok=True)  # make sure the figures directory exists
    image_filepath = os.path.join(FIGURES_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_umap_{n_components}.png")
    fig.write_image(image_filepath)
    if n_components in [2,3]:
        html_filepath = os.path.join(FIGURES_DIRPATH, f"profile_tags_{TOP_TAGS_LIMIT}_umap_{n_components}.html")
        fig.write_html(html_filepath)



In [40]:
#umapper(n_components=1)

In [41]:
umapper(n_components=2)

In [42]:
umapper(n_components=3)

## UMAP Enhanced Clustering (TODO)

In [43]:
#from sklearn.cluster import KMeans
## n_init: Number of time the k-means algorithm will be run with different centroid seeds. 
## ... The final results will be the best output of n_init consecutive runs in terms of inertia.
#
#N_CLUSTERS = 2
#kmeans = KMeans(init="k-means++", n_clusters=N_CLUSTERS, n_init=1000, random_state=99, verbose=True) 
#x_kmeans = kmeans.fit_transform(df_onehot)

In [44]:
#print(kmeans.cluster_centers_)
#print(kmeans.labels_)
#print(kmeans.feature_names_in_)

In [45]:
#import numpy as np
#
#cluster_df = DataFrame(x_kmeans, columns=["cluster_a", "cluster_b"])
#cluster_df.index = df_onehot.index
#cluster_df.head()

In [46]:
#import seaborn as sns
#import matplotlib.pyplot as plt
#
#plt.figure(figsize = (16,16))
#
#sns.heatmap(cluster_df, linewidths=0.5)

In [47]:
#cluster_df.idxmax(axis="columns")