## Setup

### Google Drive

In [73]:
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem, then show where we are and what is there.
drive.mount('/content/drive')
cwd = os.getcwd()
print(cwd, os.listdir(cwd)) #> 'content', ['.config', 'drive', 'sample_data']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content ['.config', 'drive', '=4.0.0', 'results', 'sample_data']


In [74]:
# Base data directory on the mounted Google Drive.
# You might need to create a Google Drive SHORTCUT that has this same path,
# ... or update the path to match your own Google Drive organization.
DATA_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020'
print(DATA_DIR)
# fail fast when the directory is not reachable (e.g. missing shortcut)
assert os.path.isdir(DATA_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020


In [75]:
#users_sample_csv_filepath = os.path.join(DATA_DIR, "users_sample_by_account_type_v2_and_their_tweets.csv")
#assert os.path.isfile(users_sample_csv_filepath)

In [76]:
# The embeddings CSV lives in a sub-directory of DATA_DIR named after the embedding model.
MODEL_ID = "text-embedding-ada-002"

embeddings_csv_filepath = os.path.join(
    DATA_DIR,
    MODEL_ID,
    "users_sample_openai_embeddings.csv",
)
assert os.path.isfile(embeddings_csv_filepath)

### Packages

In [77]:
%%capture
!pip install umap-learn[plot]

In [78]:
# https://www.pauldesalvo.com/how-to-download-plotly-express-charts-as-images-in-google-colab/
%%capture
!pip install kaleido
!pip install plotly>=4.0.0
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4

## Load Embeddings

In [79]:
from pandas import read_csv

# Read the embeddings file and discard the duplicated "user_id.1" column in one chain.
df = read_csv(embeddings_csv_filepath).drop(columns=["user_id.1"])
#df.index = df["user_id"]
df.head()

Unnamed: 0,user_id,created_on,screen_name_count,screen_names,status_count,rt_count,rt_pct,opinion_community,is_bot,is_q,profile_descriptions,tweet_texts,profile_embeddings,tweet_embeddings
0,14710012,2008-05-09,1,TREYPOLE,5,1,0.2,0,False,False,"ACTOR, MUSICIAN, FILMMAKER, DATABASE ARCHITECT...",RT @foxnewpolls: Daily FOX NEW POLL: (retweet ...,"[-0.017691081389784813, -0.022827202454209328,...","[-0.035227615386247635, 0.012718206271529198, ..."
1,43295319,2009-05-29,1,ERICCRAIN,1,1,1.0,0,False,False,DFS PRO. FORMER POKER PRO. STL SPORTS ENTHUSIA...,"RT @MattOswaltVA: ""I JUST GOT ARRESTED FOR MAK...","[-0.005914194509387016, -0.017133159562945366,...","[-0.018160004168748856, 0.0018456234829500318,..."
2,239281184,2011-01-17,1,MY_BIRDIE98,3,0,0.0,0,False,False,2 GIRLS 2 BOYS ARMY VET🇺🇲 BSN RN CCM CLNC BA P...,Just in time for impeachment!!! @Ryan449955 @b...,"[-0.04244328290224075, -0.009620477445423603, ...","[-0.02768823318183422, 0.01606280729174614, 0...."
3,996168658077528065,2018-05-14,1,REPELTHEGOONS,1,0,0.0,0,False,False,"PRO #2A, #REPELTHEGOONS. FOR GOD, COUNTRY & F...",I'd trust Casey Anthony with my kids more than...,"[-0.016304858028888702, -0.00787061732262373, ...","[0.0014648172073066235, 0.032542694360017776, ..."
4,1073315597939822592,2018-12-13,1,QMAN10251,1,1,1.0,0,False,False,JUST LIVE MY LIFE NOT TALKING ANYTHING SERIOUS...,RT @jemelehill: Same NFL who thinks players kn...,"[-0.02205023169517517, -0.004288257099688053, ...","[-0.01741509884595871, -0.03316406160593033, 0..."


### User Labels

In [80]:
# (source column, new label column, raw value -> display label)
for source_col, label_col, value_labels in [
    ("opinion_community", "opinion_label", {0: "Anti-Trump", 1: "Pro-Trump"}),
    ("is_bot", "bot_label", {True: "Bot", False: "Human"}),
    ("is_q", "q_label", {True: "Q-anon", False: "Normal"}),
]:
    df[label_col] = df[source_col].map(value_labels)

In [81]:
# Combined label: "<opinion> <q> <bot>", e.g. "Pro-Trump Q-anon Bot".
df["group_label"] = df["opinion_label"].str.cat(df[["q_label", "bot_label"]], sep=" ")
df["group_label"].value_counts()

Pro-Trump Q-anon Human     47
Pro-Trump Q-anon Bot       40
Anti-Trump Normal Human    39
Pro-Trump Normal Human     36
Anti-Trump Normal Bot      35
Pro-Trump Normal Bot       34
Name: group_label, dtype: int64

In [82]:
# ColorBrewer sequential scales; every list runs light --> dark.
BLUES = [
    '#f7fbff', '#deebf7', '#c6dbef',
    '#9ecae1', '#6baed6', '#4292c6',
    '#2171b5', '#08519c', '#08306b',
]
REDS = [
    '#fff5f0', '#fee0d2', '#fcbba1',
    '#fc9272', '#fb6a4a', '#ef3b2c',
    '#cb181d', '#a50f15', '#67000d',
]
PURPLES = [
    '#fcfbfd', '#efedf5', '#dadaeb',
    '#bcbddc', '#9e9ac8', '#807dba',
    '#6a51a3', '#54278f', '#3f007d',
]
GREYS = [
    '#ffffff', '#f0f0f0', '#d9d9d9',
    '#bdbdbd', '#969696', '#737373',
    '#525252', '#252525', '#000000',
]
# Unused alternatives, kept for reference:
#GREENS = ['#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b', '#74c476', '#41ab5d', '#238b45', '#006d2c', '#00441b']
#ORANGES = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b', '#fd8d3c', '#f16913', '#d94801', '#a63603', '#7f2704']

# One color per value of each single-attribute label column.
OPINION_COLORS_MAP = dict(zip(("Anti-Trump", "Pro-Trump"), (BLUES[5], REDS[5])))
BOT_COLORS_MAP = dict(zip(("Human", "Bot"), (GREYS[3], PURPLES[6])))
Q_COLORS_MAP = dict(zip(("Normal", "Q-anon"), (GREYS[3], REDS[6])))

# One color per combined group label: blues for Anti-Trump, reds for Pro-Trump,
# with a darker shade for the bot variant of each group.
GROUP_COLORS_MAP = dict([
    ("Anti-Trump Normal Human", BLUES[3]),
    ("Anti-Trump Normal Bot", BLUES[6]),
    ("Pro-Trump Normal Human", REDS[2]),
    ("Pro-Trump Normal Bot", REDS[3]),
    ("Pro-Trump Q-anon Human", REDS[6]),
    ("Pro-Trump Q-anon Bot", REDS[7]),
])
#df["group_color"] = df["group_label"].map(GROUP_COLORS_MAP)

In [83]:
df.shape[0]  # number of users (rows)

231

### Unpack Embeddings

The embeddings happen to be stored as a JSON string, so we'll need to convert that single column into a column per value in the embeddings array. We'll get 1536 columns back.

In [84]:
import json

def unpack(embeddings_str):
    """Decode a JSON-encoded embedding; any value that is not a str is returned unchanged."""
    # Non-string values pass straight through, which makes the function
    # idempotent: applying it to an already-decoded column changes nothing.
    return json.loads(embeddings_str) if isinstance(embeddings_str, str) else embeddings_str


# Decode both embedding columns from their JSON string form.
for embeddings_col in ("tweet_embeddings", "profile_embeddings"):
    df[embeddings_col] = df[embeddings_col].apply(unpack)

In [85]:
# Sanity check on the first user's decoded tweet embedding.
first_tweet_embedding = df["tweet_embeddings"][0]
type(first_tweet_embedding)
len(first_tweet_embedding) #> 1536

1536

These datasets have one column per embedding dimension, plus the user label columns.

In [86]:
from pandas import DataFrame

# Expand each column of embedding lists into a frame with one column per list position.
tweet_embeddings, profile_embeddings = (
    DataFrame(df[embeddings_col].tolist())
    for embeddings_col in ("tweet_embeddings", "profile_embeddings")
)

In [87]:
# Columns that describe a user (everything in the final frames that is not an embedding value).
# Not included: "created_on", "screen_name_count", "status_count", "rt_count", "rt_pct", "group_color"
LABEL_COLS = [
    "user_id",
    "opinion_community", "is_bot", "is_q",
    # engineered labels:
    "opinion_label", "bot_label", "q_label",
    "group_label",
]

# Attach the label columns to each embeddings frame by row index.
tweets_df, profiles_df = (
    df[LABEL_COLS].merge(embeddings, left_index=True, right_index=True)
    for embeddings in (tweet_embeddings, profile_embeddings)
)
profiles_df

Unnamed: 0,user_id,opinion_community,is_bot,is_q,opinion_label,bot_label,q_label,group_label,0,1,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,14710012,0,False,False,Anti-Trump,Human,Normal,Anti-Trump Normal Human,-0.017691,-0.022827,...,0.030491,-0.022148,0.008513,-0.029132,-0.020164,0.024662,-0.008078,-0.023683,0.004929,-0.010700
1,43295319,0,False,False,Anti-Trump,Human,Normal,Anti-Trump Normal Human,-0.005914,-0.017133,...,0.034793,-0.041967,0.036427,-0.025250,-0.015471,0.031053,-0.032133,-0.017923,0.007375,-0.017410
2,239281184,0,False,False,Anti-Trump,Human,Normal,Anti-Trump Normal Human,-0.042443,-0.009620,...,0.004069,-0.018379,0.004217,-0.020804,-0.025978,0.052899,-0.014943,-0.010773,-0.009546,-0.018635
3,996168658077528065,0,False,False,Anti-Trump,Human,Normal,Anti-Trump Normal Human,-0.016305,-0.007871,...,0.021565,0.003580,0.022813,-0.022344,-0.031644,0.041171,0.027537,-0.007965,0.011655,-0.019552
4,1073315597939822592,0,False,False,Anti-Trump,Human,Normal,Anti-Trump Normal Human,-0.022050,-0.004288,...,0.028178,0.008083,0.004814,-0.038819,-0.008935,0.011461,0.022191,-0.016127,-0.007474,-0.036819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,585143406,1,True,True,Pro-Trump,Bot,Q-anon,Pro-Trump Q-anon Bot,-0.036304,-0.016253,...,-0.001333,-0.022079,0.020180,-0.022833,-0.034223,0.024094,0.019504,-0.004232,-0.006696,-0.006553
227,819615302715637760,1,True,True,Pro-Trump,Bot,Q-anon,Pro-Trump Q-anon Bot,-0.039970,-0.020100,...,0.017803,-0.000142,-0.016019,-0.025530,-0.017495,0.061713,0.002817,-0.037942,0.014684,-0.029368
228,1203386270010900480,1,True,True,Pro-Trump,Bot,Q-anon,Pro-Trump Q-anon Bot,-0.024254,-0.018535,...,0.000068,-0.020598,0.031617,-0.034139,-0.040509,0.032025,0.024649,-0.016420,0.010688,-0.010382
229,1176824400760451079,1,True,True,Pro-Trump,Bot,Q-anon,Pro-Trump Q-anon Bot,-0.013144,-0.012658,...,0.034970,-0.020955,0.013130,-0.033667,-0.021009,0.044544,0.020783,-0.030290,0.008177,0.012505


## Dimensionality Reduction

In [88]:

import os
import numpy as np
from functools import cached_property

from pandas import DataFrame
from sklearn.preprocessing import scale #, StandardScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

import plotly.express as px

# Defaults for the fig_show / fig_save parameters of the plotting methods.
FIG_SHOW = False
FIG_SAVE = False

class ReductionPipeline:
    """Reduce the feature columns of a DataFrame to a few components and plot them.

    Typical use: construct, call perform(), then call plot_embeddings() and / or
    plot_embedding_centroids().
    """

    def __init__(self, df, label_cols, x_scale=True, reducer_type="PCA", n_components=2, labels_df=None, results_dirname="reduction_results"):
        """Params
            df: a DataFrame with all the feature columns, plus some label columns

            label_cols: list of strings: the columns you want to use for labeling / segmenting.
                choose all column names except for the features.

            x_scale: whether to scale the features (sklearn.preprocessing.scale) before reducing

            reducer_type: one of "PCA", "T-SNE", "UMAP" (anything else raises a KeyError)

            n_components: number of components to reduce to. plotting supports 2 or 3.

            labels_df: not used; the labels are always taken from df[label_cols]

            results_dirname: directory where saved figures are written (created on first use)
        """
        self.df = df

        self.label_cols = label_cols
        self.labels_df = self.df[self.label_cols]
        self.x = self.df.drop(columns=self.label_cols)

        self.x_scale = x_scale
        self.reducer_type = reducer_type
        self.n_components = n_components

        self.reducer_name = {"PCA": "pca", "T-SNE": "tsne", "UMAP": "umap"}[self.reducer_type]

        self.results_dirname = results_dirname

        # populated by perform():
        self.reducer = None
        self.embeddings = None
        self.embeddings_df = None
        # populated by perform() for PCA only:
        self.loadings = None
        self.loadings_df = None
        self.feature_importances = None


    @cached_property
    def feature_names(self):
        # returns strings because PCA get_feature_names_out only works with string feature names
        # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
        return [str(colname) for colname in self.x.columns.tolist()]

    @cached_property
    def component_names(self):
        """Column names for the reduced dimensions: component_1 ... component_n."""
        return [f"component_{i}" for i in range(1, self.n_components+1)]

    @cached_property
    def x_scaled(self):
        """The scaled features as a DataFrame with string column names and the original index."""
        x = scale(self.x)
        df = DataFrame(x, columns=self.feature_names)
        df.index = self.x.index
        return df

    def perform(self):
        """Fit the reducer and store the results on the instance.

        Sets reducer, embeddings and embeddings_df (components plus label columns).
        For PCA, also sets loadings, loadings_df and feature_importances.
        For T-SNE, prints the K-L divergence.
        """
        x = self.x_scaled if self.x_scale else self.x

        if self.reducer_type == "PCA":
            self.reducer = PCA(n_components=self.n_components, random_state=99)
        elif self.reducer_type == "T-SNE":
            self.reducer = TSNE(n_components=self.n_components, random_state=99)
        elif self.reducer_type == "UMAP":
            self.reducer = UMAP(n_components=self.n_components, random_state=99)

        self.embeddings = self.reducer.fit_transform(x)
        # reuse the input index so the index-based merge below pairs each
        # embedding with its own labels even when df has a non-default index
        self.embeddings_df = DataFrame(self.embeddings, columns=self.component_names, index=x.index)
        self.embeddings_df = self.embeddings_df.merge(self.labels_df, left_index=True, right_index=True)

        # EXPLAINABILITY:
        if self.reducer_type == "PCA":
            self.loadings = self.reducer.components_.T * np.sqrt(self.reducer.explained_variance_)
            # indexed by self.feature_names rather than reducer.feature_names_in_,
            # which sklearn only provides when the input has all-string column names
            # (not the case for the unscaled features)
            self.loadings_df = DataFrame(self.loadings, columns=self.component_names, index=self.feature_names)

            # these represent the absolute magnitude of importances, not direction up or down
            self.feature_importances = {}
            for component_name in self.component_names:
                top_feature_names = self.loadings_df.abs().sort_values(by=[component_name], ascending=False).head(10)[component_name]
                self.feature_importances[component_name] = top_feature_names.to_dict()

        elif self.reducer_type == "T-SNE":
            print("K-L DIVERGENCE:", self.reducer.kl_divergence_)


    @property
    def results_dirpath(self):
        """The results directory, created if it does not exist yet."""
        dirpath = self.results_dirname # f"results/{self.results_dirname}" # colab
        os.makedirs(dirpath, exist_ok=True)
        return dirpath

    @property
    def embeddings_png_filepath(self):
        return os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}.png")

    @property
    def embeddings_html_filepath(self):
        return os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}.html")

    @property
    def centroids_png_filepath(self):
        return os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}_centroids.png")

    @property
    def centroids_html_filepath(self):
        return os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}_centroids.html")

    @staticmethod
    def _labeled_filepath(filepath, label=None):
        """Insert "_{label}" before the file extension; the path is unchanged without a string label."""
        if not isinstance(label, str) or not label:
            return filepath
        root, ext = os.path.splitext(filepath)
        return f"{root}_{label}{ext}"


    def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, title=None, subtitle=None, color=None, color_map=None):
        """Scatter plot of the reduced embeddings (2D or 3D, depending on n_components).

        Params
            color: optional column to color the points by
            color_map: optional mapping of values in the color column to colors
            fig_save: when true, writes an HTML file into the results directory. when a
                color column name is given it is appended to the file name, so figures
                with different colorings do not overwrite each other.

        Returns the figure, or None when n_components is neither 2 nor 3.
        Requires perform() to have been called.
        """
        title = title or f"Dimensionality Reduction ({self.reducer_type} n_components={self.n_components})"
        if subtitle:
            title = title + f"<br><sup>{subtitle}</sup>"

        chart_params = dict(x="component_1", y="component_2", height=height,
            title=title, hover_data=self.label_cols
        )
        if color:
            chart_params["color"] = color
        if color_map:
            chart_params["color_discrete_map"] = color_map

        fig = None
        if self.n_components == 2:
            fig = px.scatter(self.embeddings_df, **chart_params)
        elif self.n_components == 3:
            chart_params["z"] = "component_3"
            fig = px.scatter_3d(self.embeddings_df, **chart_params)

        if fig and fig_show:
            fig.show()

        if fig and fig_save:
            #fig.write_image(self.embeddings_png_filepath)
            fig.write_html(self._labeled_filepath(self.embeddings_html_filepath, color))

        return fig


    def plot_embedding_centroids(self, groupby_col, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, title=None, subtitle=None, color_map=None):
        """Scatter plot of the mean embedding (centroid) of each group in groupby_col.

        Params
            groupby_col: label column to group by; also used for point color and text
            color_map: optional mapping of group values to colors
            fig_save: when true, writes an HTML file into the results directory. the
                groupby column name is appended to the file name, so figures for
                different groupings do not overwrite each other.

        Returns the figure, or None when n_components is neither 2 nor 3.
        Requires perform() to have been called.
        """
        title = title or f"Dimensionality Reduction ({self.reducer_type} n_components={self.n_components}) Centroids"
        if subtitle:
            title = title + f"<br><sup>{subtitle}</sup>"

        chart_params = dict(x="component_1", y="component_2", height=height,
            title=title, #hover_data=self.label_cols,
            color=groupby_col, text=groupby_col
        )
        if color_map:
            chart_params["color_discrete_map"] = color_map

        fig = None
        if self.n_components in (2, 3):
            # mean of every component per group
            agg_params = {component_name: "mean" for component_name in self.component_names}
            centroids = self.embeddings_df.groupby(groupby_col).agg(agg_params)
            centroids[groupby_col] = centroids.index

            if self.n_components == 2:
                fig = px.scatter(centroids, **chart_params)
            else:
                chart_params["z"] = "component_3"
                fig = px.scatter_3d(centroids, **chart_params)

            fig.update_traces(textposition='top center')

        if fig and fig_show:
            fig.show()

        if fig and fig_save:
            fig.write_html(self._labeled_filepath(self.centroids_html_filepath, groupby_col))

        return fig


### Save Figures

In [89]:
# Directory on the mounted Google Drive where the result figures are saved.
# You might need to create a Google Drive SHORTCUT that has this same path,
# ... or update the path to match your own Google Drive organization.
RESULTS_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/users/mjr300/Impeachment 2020/reduction_results'
print(RESULTS_DIR)
# create the directory on first use, then confirm it is really there
os.makedirs(RESULTS_DIR, exist_ok=True)
assert os.path.isdir(RESULTS_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/users/mjr300/Impeachment 2020/reduction_results


In [118]:

# Label column -> color map, used for both the scatter and the centroid figures.
label_color_maps = {
    "opinion_label": OPINION_COLORS_MAP,
    "bot_label": BOT_COLORS_MAP,
    "q_label": Q_COLORS_MAP,
    "group_label": GROUP_COLORS_MAP,
}
# (results sub-directory, dataset, figure subtitle), processed in this order.
datasets = [
    ("profiles", profiles_df, "User Profile Embeddings"),
    ("tweets", tweets_df, "User Tweet Embeddings"),
]

# Fit every reducer at 2 and 3 components on both datasets, saving (not showing)
# one scatter figure and one centroid figure per label column.
for n_components in [2,3]:

    for reducer_type in ["PCA", "T-SNE", "UMAP"]:
        print("---------------")
        print(reducer_type, n_components)

        for dataset_name, dataset_df, subtitle in datasets:
            results_dirname = os.path.join(RESULTS_DIR, dataset_name)
            pipeline = ReductionPipeline(df=dataset_df, label_cols=LABEL_COLS, reducer_type=reducer_type, results_dirname=results_dirname, n_components=n_components)
            pipeline.perform()

            for label_col, color_map in label_color_maps.items():
                pipeline.plot_embeddings(fig_show=False, fig_save=True, color=label_col, subtitle=subtitle, color_map=color_map)

            for label_col, color_map in label_color_maps.items():
                pipeline.plot_embedding_centroids(fig_show=False, fig_save=True, groupby_col=label_col, subtitle=subtitle, color_map=color_map)


---------------
PCA
---------------
T-SNE
K-L DIVERGENCE: 0.958896815776825
K-L DIVERGENCE: 1.0971325635910034
---------------
UMAP
---------------
PCA
---------------
T-SNE
K-L DIVERGENCE: 1.400071382522583
K-L DIVERGENCE: 1.4581352472305298
---------------
UMAP


### PCA Tuner

In [115]:

class PCATuner:
    """Fit PCA with an increasing number of components and chart the explained variance.

    Each fit is delegated to a ReductionPipeline with reducer_type="PCA".
    Call perform() first, then plot_explained_variance() and / or plot_scree().
    """

    def __init__(self, df, label_cols=LABEL_COLS, results_dirname="results"):
        """Params
            df: a DataFrame with all the feature columns, plus the label columns

            label_cols: list of strings: all column names except for the features

            results_dirname: directory where saved figures are written (created on first use)
        """
        self.df = df
        self.label_cols = label_cols
        self.feature_names = self.df.drop(columns=self.label_cols).columns.tolist()

        self.results_dirname = results_dirname
        self.results = None
        self.results_df = None


    def perform(self, components_limit=50):
        """Fit one PCA per component count, from 1 up to the limit, and collect the results.

        Params
            components_limit: the largest number of components to try.
                a falsy value (None, 0) means no limit other than the data's own maximum.

        Sets self.results (list of dicts) and self.results_df, with one row per fit:
            n_components, explained_variance (sum of the explained variance ratios),
            and eigenvals (the explained variance of each component).
        """
        self.results = []

        # PCA cannot extract more components than min(n_samples, n_features),
        # so going past that bound would only raise inside the fit
        max_components = min(len(self.df), len(self.feature_names))
        if components_limit:
            max_components = min(max_components, components_limit)

        for n_components in range(1, max_components + 1):
            pipeline = ReductionPipeline(self.df, label_cols=self.label_cols,
                                         reducer_type="PCA", n_components=n_components)
            pipeline.perform()

            pca = pipeline.reducer
            self.results.append({
                "n_components": n_components,
                "explained_variance": pca.explained_variance_ratio_.sum(),
                "eigenvals": pca.explained_variance_, # number of vals depend on n components
            })
        self.results_df = DataFrame(self.results)


    def _require_results(self):
        """Raise a clear error when a plot is requested before perform() has produced results."""
        if self.results_df is None:
            raise RuntimeError("No results to plot. Call perform() first.")


    @property
    def results_dirpath(self):
        """The results directory, created if it does not exist yet."""
        dirpath = self.results_dirname # "results" # colab
        os.makedirs(dirpath, exist_ok=True)
        return dirpath


    def plot_explained_variance(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, subtitle=None, log_y=False):
        """Line chart of the total explained variance for each number of components tried.

        When fig_save is true, writes "pca-explained-variance.png" into the results directory.
        Returns None.
        """
        self._require_results()

        title = f"Total Explained Variance by Number of Components (PCA)"
        if subtitle:
            title = title + f"<br><sup>{subtitle}</sup>"

        chart_opts = dict(x="n_components", y="explained_variance",
                title=title, height=height,
                markers="line+point", color_discrete_sequence=["steelblue"],
        )
        if log_y:
            chart_opts["log_y"] = True

        fig = px.line(self.results_df, **chart_opts)
        if fig_show:
            fig.show()

        if fig_save:
            image_filepath = os.path.join(self.results_dirpath, "pca-explained-variance.png")
            fig.write_image(image_filepath)
        #return fig


    def plot_scree(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, subtitle=None, log_y=False):
        """Scree plot: the eigenvalue of each component, taken from the fit with the most components.

        Prints the eigenvalues and the component numbers.
        When fig_save is true, writes "pca-scree.png" into the results directory.
        Returns None.
        """
        self._require_results()

        # the fit with the most components holds the longest list of eigenvalues
        eigenvals = self.results_df.sort_values(by=["n_components"], ascending=False).iloc[0]["eigenvals"]
        print("EIGENVALS:", eigenvals)

        # one x position per eigenvalue, so the two axes always have the same length
        component_numbers = list(range(1, len(eigenvals)+1))
        print("COMPONENT NUMBERS:", component_numbers)

        title=f"Scree Plot of Eigenvalues by Component (PCA)"
        if subtitle:
            title = title + f"<br><sup>{subtitle}</sup>"

        fig = px.line(x=component_numbers, y=eigenvals,
                title=title, height=height,
                labels={"x": "Component Number", "y": "Eigenvalue"},
                markers="line+point", color_discrete_sequence=["steelblue"],
                log_y=log_y
        )
        if fig_show:
            fig.show()

        if fig_save:
            image_filepath = os.path.join(self.results_dirpath, "pca-scree.png")
            fig.write_image(image_filepath)
        #return fig



#### Profiles

In [116]:

# Tune PCA on the profile embeddings (1 to 100 components), then show and save both charts.
subtitle = "User Profile Embeddings"
results_dirname = os.path.join(RESULTS_DIR, "profiles")

profile_tuner = PCATuner(df=profiles_df, label_cols=LABEL_COLS, results_dirname=results_dirname)
profile_tuner.perform(components_limit=100)

for plot_chart in (profile_tuner.plot_explained_variance, profile_tuner.plot_scree):
    plot_chart(fig_show=True, fig_save=True, subtitle=subtitle)

EIGENVALS: [96.50921924 47.06744037 38.01652548 32.98382608 30.81943006 26.53104001
 25.96635246 23.82736474 22.59364863 20.73679924 20.30480758 19.59138356
 18.91947448 18.75792184 17.90890339 17.47280911 17.21348771 16.71788569
 15.71629148 15.65267168 15.45177629 14.8594491  14.46511716 13.87202733
 13.64896264 13.40347551 13.2559412  13.06310739 13.00084418 12.50836129
 12.41115974 12.35823152 11.90986926 11.61308259 11.40230332 11.29398093
 11.12042608 10.9881184  10.72193801 10.59152607 10.44085085 10.32578244
 10.1563362   9.94241544  9.71813635  9.58658025  9.43901231  9.38584604
  9.2941693   9.05780748  8.92866843  8.86354727  8.70531799  8.54789957
  8.4143992   8.25938299  8.19118008  8.1543578   8.00842976  7.85382997
  7.77943999  7.73313449  7.62218557  7.52026241  7.38816665  7.36142302
  7.16560178  7.0337077   6.94643999  6.87273177  6.84993306  6.77334588
  6.69121281  6.57237737  6.53106028  6.42971209  6.39481831  6.28396968
  6.19744658  6.03512155  5.96604977  5.

#### Tweets

In [117]:
# Tune PCA on the tweet embeddings (1 to 100 components), then show and save both charts.
subtitle = "User Tweet Embeddings"
results_dirname = os.path.join(RESULTS_DIR, "tweets")

tweet_tuner = PCATuner(df=tweets_df, label_cols=LABEL_COLS, results_dirname=results_dirname)
tweet_tuner.perform(components_limit=100)

for plot_chart in (tweet_tuner.plot_explained_variance, tweet_tuner.plot_scree):
    plot_chart(fig_show=True, fig_save=True, subtitle=subtitle)

EIGENVALS: [118.1198751   44.25442061  37.03156698  33.4761317   28.86425773
  26.20539355  25.31373813  23.24796548  22.15035644  20.67585271
  20.47873458  19.30564035  19.20263015  18.46639105  18.42232017
  17.3856701   16.60527069  16.55925822  16.21713046  15.53752426
  15.11262337  14.78279563  14.52136807  14.17314841  13.83557584
  13.36701577  13.21216299  13.15577823  12.79976587  12.51255766
  12.17386512  12.05053515  11.9065662   11.74533617  11.60113247
  11.34203389  11.17150186  11.08216101  10.97328064  10.70553241
  10.29988342  10.16785353  10.06184216   9.92771576   9.66402889
   9.60875382   9.35182299   9.15240702   9.12315713   9.06488832
   8.86427785   8.74907893   8.66261851   8.59757488   8.47796607
   8.26448982   8.08501755   7.99881657   7.94053768   7.68785335
   7.63689635   7.52241868   7.43619496   7.35112018   7.23403554
   7.17410776   7.00555867   6.94487601   6.89185785   6.74532598
   6.68445241   6.59012972   6.55865012   6.45604185   6.42291669

### PCA (n=2)

#### Profiles

In [94]:
# 2-component PCA of the profile embeddings (the pipeline defaults, spelled out).
profile_pca = ReductionPipeline(
    df=profiles_df,
    label_cols=LABEL_COLS,
    reducer_type="PCA",
    n_components=2,
)
profile_pca.perform()

In [95]:
subtitle = "User Profile Embeddings"
# the single-attribute colorings are shown from inside the loop ...
for color_col, colors in [
    ("opinion_label", OPINION_COLORS_MAP),
    ("bot_label", BOT_COLORS_MAP),
    ("q_label", Q_COLORS_MAP),
]:
    profile_pca.plot_embeddings(fig_show=True, color=color_col, subtitle=subtitle, color_map=colors)
# ... while the combined grouping is returned as the cell's last expression
profile_pca.plot_embeddings(fig_show=False, color="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

In [96]:
subtitle = "User Profile Embeddings"
# Other groupings that can be plotted the same way (with their own color maps):
#   "opinion_label" / OPINION_COLORS_MAP, "bot_label" / BOT_COLORS_MAP, "q_label" / Q_COLORS_MAP
profile_pca.plot_embedding_centroids(
    groupby_col="group_label",
    color_map=GROUP_COLORS_MAP,
    subtitle=subtitle,
    fig_show=False,
)

#### Tweets

In [97]:
# 2-component PCA of the tweet embeddings (the pipeline defaults, spelled out).
tweets_pca = ReductionPipeline(
    df=tweets_df,
    label_cols=LABEL_COLS,
    reducer_type="PCA",
    n_components=2,
)
tweets_pca.perform()

In [98]:
subtitle = "User Tweet Embeddings"
# the single-attribute colorings are shown from inside the loop ...
for color_col, colors in [
    ("opinion_label", OPINION_COLORS_MAP),
    ("bot_label", BOT_COLORS_MAP),
    ("q_label", Q_COLORS_MAP),
]:
    tweets_pca.plot_embeddings(fig_show=True, color=color_col, subtitle=subtitle, color_map=colors)
# ... while the combined grouping is returned as the cell's last expression
tweets_pca.plot_embeddings(fig_show=False, color="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

In [99]:
subtitle = "User Tweet Embeddings"
# Other groupings that can be plotted the same way (with their own color maps):
#   "opinion_label" / OPINION_COLORS_MAP, "bot_label" / BOT_COLORS_MAP, "q_label" / Q_COLORS_MAP
tweets_pca.plot_embedding_centroids(
    groupby_col="group_label",
    color_map=GROUP_COLORS_MAP,
    subtitle=subtitle,
    fig_show=False,
)

### T-SNE (n=2)

#### Profiles

In [100]:
#profile_tsne = ReductionPipeline(df=profiles_df, label_cols=LABEL_COLS, reducer_type="T-SNE")
#profile_tsne.perform()

In [101]:
#subtitle = "User Profile Embeddings"
#profile_tsne.plot_embeddings(fig_show=True, color="opinion_label", subtitle=subtitle, color_map=OPINION_COLORS_MAP)
#profile_tsne.plot_embeddings(fig_show=True, color="bot_label", subtitle=subtitle, color_map=BOT_COLORS_MAP)
#profile_tsne.plot_embeddings(fig_show=True, color="q_label", subtitle=subtitle, color_map=Q_COLORS_MAP)
#profile_tsne.plot_embeddings(fig_show=False, color="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

In [102]:
#subtitle = "User Profile Embeddings"
##profile_tsne.plot_embedding_centroids(fig_show=True, groupby_col="opinion_label", subtitle=subtitle, color_map=OPINION_COLORS_MAP)
##profile_tsne.plot_embedding_centroids(fig_show=True, groupby_col="bot_label", subtitle=subtitle, color_map=BOT_COLORS_MAP)
##profile_tsne.plot_embedding_centroids(fig_show=True, groupby_col="q_label", subtitle=subtitle, color_map=Q_COLORS_MAP)
#profile_tsne.plot_embedding_centroids(fig_show=False, groupby_col="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

#### Tweets

In [103]:
# 2-component T-SNE of the tweet embeddings (n_components is the pipeline default, spelled out).
tweets_tsne = ReductionPipeline(
    df=tweets_df,
    label_cols=LABEL_COLS,
    reducer_type="T-SNE",
    n_components=2,
)
tweets_tsne.perform()

K-L DIVERGENCE: 1.0971325635910034


In [104]:
subtitle = "User Tweet Embeddings"
# the single-attribute colorings are shown from inside the loop ...
for color_col, colors in [
    ("opinion_label", OPINION_COLORS_MAP),
    ("bot_label", BOT_COLORS_MAP),
    ("q_label", Q_COLORS_MAP),
]:
    tweets_tsne.plot_embeddings(fig_show=True, color=color_col, subtitle=subtitle, color_map=colors)
# ... while the combined grouping is returned as the cell's last expression
tweets_tsne.plot_embeddings(fig_show=False, color="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

In [105]:
subtitle = "User Tweet Embeddings"
# Other groupings that can be plotted the same way (with their own color maps):
#   "opinion_label" / OPINION_COLORS_MAP, "bot_label" / BOT_COLORS_MAP, "q_label" / Q_COLORS_MAP
tweets_tsne.plot_embedding_centroids(
    groupby_col="group_label",
    color_map=GROUP_COLORS_MAP,
    subtitle=subtitle,
    fig_show=False,
)

### UMAP (n=2)





#### Profiles

In [106]:
#profile_umap = ReductionPipeline(df=profiles_df, label_cols=LABEL_COLS, reducer_type="UMAP")
#profile_umap.perform()

In [107]:
#subtitle = "User Profile Embeddings"
#profile_umap.plot_embeddings(fig_show=True, color="opinion_label", subtitle=subtitle, color_map=OPINION_COLORS_MAP)
#profile_umap.plot_embeddings(fig_show=True, color="bot_label", subtitle=subtitle, color_map=BOT_COLORS_MAP)
#profile_umap.plot_embeddings(fig_show=True, color="q_label", subtitle=subtitle, color_map=Q_COLORS_MAP)
#profile_umap.plot_embeddings(fig_show=False, color="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

In [108]:
#subtitle = "User Profile Embeddings"
##profile_umap.plot_embedding_centroids(fig_show=True, groupby_col="opinion_label", subtitle=subtitle, color_map=OPINION_COLORS_MAP)
##profile_umap.plot_embedding_centroids(fig_show=True, groupby_col="bot_label", subtitle=subtitle, color_map=BOT_COLORS_MAP)
##profile_umap.plot_embedding_centroids(fig_show=True, groupby_col="q_label", subtitle=subtitle, color_map=Q_COLORS_MAP)
#profile_umap.plot_embedding_centroids(fig_show=False, groupby_col="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

#### Tweets

In [109]:
# 2-component UMAP of the tweet embeddings (n_components is the pipeline default, spelled out).
tweets_umap = ReductionPipeline(
    df=tweets_df,
    label_cols=LABEL_COLS,
    reducer_type="UMAP",
    n_components=2,
)
tweets_umap.perform()

In [110]:
#subtitle = "User Tweet Embeddings"
#tweets_umap.plot_embeddings(fig_show=True, color="opinion_label", subtitle=subtitle, color_map=OPINION_COLORS_MAP)
#tweets_umap.plot_embeddings(fig_show=True, color="bot_label", subtitle=subtitle, color_map=BOT_COLORS_MAP)
#tweets_umap.plot_embeddings(fig_show=True, color="q_label", subtitle=subtitle, color_map=Q_COLORS_MAP)
#tweets_umap.plot_embeddings(fig_show=False, color="group_label", subtitle=subtitle, color_map=GROUP_COLORS_MAP)

In [111]:
subtitle = "User Tweet Embeddings"
# Other groupings that can be plotted the same way (with their own color maps):
#   "opinion_label" / OPINION_COLORS_MAP, "bot_label" / BOT_COLORS_MAP, "q_label" / Q_COLORS_MAP
tweets_umap.plot_embedding_centroids(
    groupby_col="group_label",
    color_map=GROUP_COLORS_MAP,
    subtitle=subtitle,
    fig_show=False,
)