# Imports

In [1]:
# regex
import re

# pandas + numpy
import numpy as np
import pandas as pd

# setting pandas options
pd.set_option('display.max_colwidth', 200)


# storing and loading models
import pickle

# to set types for functions
from typing import Tuple

# Plotting
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# gpu debug
import torch

# setting device to use GPU for NLP backend if you have GPU available
device = "cuda" if torch.cuda.is_available() else "cpu"


# SBERT
from sentence_transformers import SentenceTransformer

# UMAP
from umap import UMAP

#HDBSCAN
from hdbscan import HDBSCAN

# topic finding
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading model from pickle if possible, to avoid downloading it again.
# NOTE: unpickling executes arbitrary code, so the cache file must be one that
# this notebook wrote itself -- never load a pickle from an untrusted source.
try:
    # `with` guarantees the file handle is closed, even if unpickling fails
    with open(f'model-{device}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    model_load = True

except Exception:
    # Any failure to load the cache (missing file, truncated or stale pickle, ...)
    # falls back to building the model from scratch. Unlike a bare `except:`,
    # this does not swallow KeyboardInterrupt / SystemExit.
    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    # cache the freshly built model for the next run on the same device
    with open(f'model-{device}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    model_load = False

# summary of the hardware that was found and where the model came from
print(f"""
GPUs detected:          {torch.cuda.device_count()}
Using GPU:              {torch.cuda.is_available()}
Device:                 {device}
Got model from pickle:  {model_load}
""")

  from .autonotebook import tqdm as notebook_tqdm



GPUs detected:          0
Using GPU:              False
Device:                 cpu
Got model from pickle:  True



# Function

In [2]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
  """
  Find the highest-weighted TF-IDF terms for every cluster.

  Note: a later cell of this notebook defines a function with the same name
  and body again; whichever cell runs last is the definition in effect.

  Args:
      input (list): One entry per cluster id, each entry being the collection
          of title strings that belong to that cluster.
      num_words (int, optional): How many terms to keep per cluster. Defaults to 5.

  Returns:
      list: One array of terms per entry of `input`, most relevant term first.
  """

  def _top_terms(documents):
    # a fresh vectorizer per cluster, so weights are relative to that cluster only
    tfidf = TfidfVectorizer(stop_words='english')
    weights = tfidf.fit_transform(documents)

    # summing over the documents gives one total weight per vocabulary term
    totals = np.asarray(weights.sum(axis=0)).ravel()

    # argsort is ascending, so reverse it to get the heaviest terms first
    ranking = np.argsort(totals)[::-1]
    vocabulary = np.array(tfidf.get_feature_names_out())
    return vocabulary[ranking[:num_words]]

  return [_top_terms(documents) for documents in input]

## Cleaning

In [3]:
def string_cleaner(input: str) -> str:
    """
    Normalise a string for text encoding.

    The text is lower-cased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed.
    Whitespace itself is left exactly as it was.

    Args:
        input (str): String to be cleaned.

    Returns:
        str: Cleaned string.
    """

    # lower-case first, then strip everything that is not \w or \s in one pass
    return re.sub(r'[^\w\s]', '', input.lower())

## Topic Modeling

In [4]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
  """
  Pick the most relevant words for each cluster using TF-IDF weights.

  Note: this cell repeats the definition from the earlier "Function" cell of
  this notebook; once this cell has run, this is the definition in effect.

  Args:
      input (list): A list with one entry per cluster id; each entry holds the
          title strings aggregated for that cluster.
      num_words (int, optional): How many words you want per cluster. Defaults to 5.

  Returns:
      list: One array of words per cluster, ordered from most to least relevant.
  """

  top_words_per_cluster = []

  for cluster_titles in input:
    # fit on this cluster's titles only
    tfidf = TfidfVectorizer(stop_words='english')
    scores = np.asarray(tfidf.fit_transform(cluster_titles).sum(axis=0)).ravel()

    # indices of the highest summed scores, best first
    best_first = np.argsort(scores)[::-1][:num_words]

    terms = np.array(tfidf.get_feature_names_out())
    top_words_per_cluster.append(terms[best_first])

  return top_words_per_cluster



def topic_by_clusterId(result: pd.DataFrame) -> dict:
  """
  Build a lookup from cluster id to the topic words of that cluster.

  Args:
      result (pd.DataFrame): Dataframe with at least the columns "titles"
          and "cluster_label".

  Returns:
      dict: Cluster ids as keys, the output of tfidf_most_relevant_word for
          that cluster's titles as values.
  """

  # one row per cluster id, with all of its titles gathered into a list
  per_cluster = (
    result[["titles", "cluster_label"]]
    .groupby("cluster_label")
    .agg(list)
    .reset_index()
  )

  # topic words are computed cluster by cluster from the gathered titles
  per_cluster["topics"] = tfidf_most_relevant_word(per_cluster["titles"])

  return dict(zip(per_cluster.cluster_label, per_cluster.topics))

## Plotting Functions

In [5]:
# when you actually cast the type here, it works with how pandas casts types and you don't have to worry about copying Series
def result_df_maker(embeddings: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> pd.DataFrame:
  """
  Assemble the plotting dataframe from embeddings, cluster labels and titles.

  Args:
      embeddings (np.ndarray): 2D array of embeddings; its two columns become "x" and "y".
      cluster_labels (np.ndarray): array of cluster labels, one per row.
      titles (np.ndarray): array of titles, one per row.

  Returns:
      pd.DataFrame: Columns "x", "y", "titles", "cluster_label" and "topics",
          where "topics" is the space-joined topic words of the row's cluster.
  """
  frame = pd.DataFrame(embeddings, columns=['x', 'y']).assign(
    titles=titles,
    cluster_label=cluster_labels,
  )

  # cluster id -> topic words for that cluster
  topics_for = topic_by_clusterId(frame)

  # look up each row's cluster and flatten its topic words into one string
  frame["topics"] = frame["cluster_label"].apply(
    lambda label: " ".join(topics_for[label])
  )

  return frame

def result_splitter(result: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Function to split the dataframe into two dataframes, one for clustered and one for outliers.

  Rows whose cluster_label is -1 are treated as outliers; every other row is
  treated as clustered. Row order and index values are kept.

  Args:
      result (pd.DataFrame): Dataframe with embeddings, cluster labels, topics per cluster, and titles.

  Returns:
      Tuple[pd.DataFrame, pd.DataFrame]: (clustered, outliers) dataframes.
  """

  # evaluate the outlier test once and reuse it for both halves
  is_outlier = result.cluster_label == -1

  clustered = result.loc[~is_outlier, :]
  outliers = result.loc[is_outlier, :]
  return clustered, outliers

# the cavalry is not here, but it's fine! Why? I am here!
def result_tracer(clustered: pd.DataFrame, outliers: pd.DataFrame) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Build one scatter trace for the clustered points and one for the outliers.

  Args:
      clustered (pd.DataFrame): clustered rows; coloured by cluster_label, with
          title, topics and cluster id shown on hover
      outliers (pd.DataFrame): outlier rows; drawn in grey with a fixed
          "Outlier" hover label

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (clustered trace, outlier trace).
  """

  # per-point hover payload: column 0 = title, 1 = topics, 2 = cluster id
  hover_rows = np.column_stack([clustered.titles, clustered.topics, clustered.cluster_label])
  hover_text = "<b>Title:</b> %{customdata[0]} <br><b>Topics:</b> %{customdata[1]} <br><b>Cluster Id:</b> %{customdata[2]}<extra></extra>"

  # marker styling: clustered points are coloured by label, outliers are small and grey
  cluster_marker = {"size": 2, "color": clustered.cluster_label, "colorscale": "Rainbow"}
  outlier_marker = {"size": 1, "color": "grey"}

  trace_cluster = go.Scattergl(
    x=clustered.x,
    y=clustered.y,
    mode="markers",
    name="Clustered",
    marker=cluster_marker,
    hovertemplate=hover_text,
    customdata=hover_rows,
  )

  trace_outlier = go.Scattergl(
    x=outliers.x,
    y=outliers.y,
    mode="markers",
    name="Outliers",
    marker=outlier_marker,
    hovertemplate="Outlier<extra></extra>",
  )

  return trace_cluster, trace_outlier

def result_tracer_wrapper(uembs: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Go from raw arrays to the two scatter traces in one call.

  Chains result_df_maker -> result_splitter -> result_tracer.

  Args:
      uembs (np.ndarray): 2D array of embeddings.
      cluster_labels (np.ndarray): array of cluster labels.
      titles (np.ndarray): array of titles.

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (clustered trace, outlier trace).
  """

  frame = result_df_maker(uembs, cluster_labels, titles)
  clustered_rows, outlier_rows = result_splitter(frame)
  cluster_trace, outlier_trace = result_tracer(clustered_rows, outlier_rows)
  return cluster_trace, outlier_trace

In [6]:
def subplotter(trace_nested_list: list, titles: list, base_size=1000) -> go.Figure:
    """
    Function to make a figure with subplots of the clustered and outliers.

    Args:
        trace_nested_list (list): list holding rows of columns, each column holding traces.
            The number of columns is taken from the first row.
        titles (list): Titles for the subplots
        base_size (int, optional): Base size of the sub plots. Defaults to 1000.

    Returns:
        go.Figure: Figure with subplots.
    """
    
    row_count = len(trace_nested_list)
    col_count = len(trace_nested_list[0])
    
    fig = make_subplots(
        rows=row_count, 
        cols=col_count,
        subplot_titles=(titles),
        vertical_spacing=0.02,
        horizontal_spacing=0.02
    )

    # plotly's subplot grid is 1-indexed, hence start=1
    for i, row in enumerate(trace_nested_list, start=1):
        for j, col in enumerate(row, start=1):

            # adding both outliers and clustered to the cell they belong to
            # (the column index must be used here, otherwise every trace of a
            # row ends up stacked in the first column)
            for trace in col:
                fig.add_trace(trace, row=i, col=j)
    
    # figure settings
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    
    # overall canvas grows with the grid so each cell is base_size x base_size
    fig.update_layout(width=base_size*col_count, height=base_size*row_count, plot_bgcolor='rgba(250,250,250,1)')

    return fig

## Saving / Showing Plots

In [7]:
def fig_show_save(fig: go.Figure, filename: str, show=True):
  """
  Function to show and save a figure.

  The figure is written to figures/<filename>.html and figures/<filename>.png.
  The target directory is created if it does not exist yet. The .png export
  needs the kaleido package; without it plotly raises a ValueError, and since
  the figure is shown only after both files are written it is then not shown.

  Args:
      fig (go.Figure): fig to be saved and shown
      filename (str): filename to save the figure, without extension
      show (bool, optional): Option to disable showing of figure (in case too big for notebook). Defaults to True.
  """
  import os

  html_path = f"figures/{filename}.html"
  png_path = f"figures/{filename}.png"

  # the write calls below expect the target directory to exist already
  os.makedirs(os.path.dirname(html_path), exist_ok=True)

  # writing both interactible .html and static image .png
  fig.write_html(html_path)
  fig.write_image(png_path)

  if show:
    fig.show()

# Data Part

In the code below, we use pandas to read the data from a local CSV file.

We then copy just the title column from that data and, if no GPU is available for the text encoding, sample it down to 5% of the rows.

In [8]:
# Load the full video dataset from the local CSV file
df_whole = pd.read_csv("data/USvideos.csv")

# Work on an independent copy that holds only the "title" column
df = df_whole[["title"]].copy()

# Without a GPU, keep only a random 5% of the rows before encoding
# NOTE(review): no random_state is passed, so the sample differs between runs
if device == "cpu": df = df.sample(frac=0.05)

# notebook display of the resulting frame
df

Unnamed: 0,title
3637,Americans Try To Explain The Royal Family
4488,Testing Nail Products from Wish Nails Remove Toes Hollow Nail Polish Peel-off Nails Wish Buy Now
36325,NERF Hide Your Weapon Challenge!
30548,David Guetta & Sia - Flames (Official Video)
36346,We Are So Blessed | Acacia Kersey
...,...
8024,Honey Butter Smashed Potato 🎄Asian at Home Holiday Special Recipe🎄
5746,Youtube Rewind 2017 - BEHIND THE SCENES
39243,Christopher Robin Official Trailer
492,Model Fei Fei Sun Perform Skin-Care Magic | Beauty Secrets | Vogue


In [9]:
# Show the first 20 raw titles as a plain Python list
list(df["title"])[0:20]

['Americans Try To Explain The Royal Family',
 'Testing Nail Products from Wish Nails Remove Toes Hollow Nail Polish Peel-off Nails Wish Buy Now',
 'NERF Hide Your Weapon Challenge!',
 'David Guetta & Sia - Flames (Official Video)',
 'We Are So Blessed | Acacia Kersey',
 'MY GLOWING Natural MAKEUP ROUTINE',
 'Introducing: The Players! - Super Smash Bros. Invitational 2018',
 'James Cameron Answers Sci-Fi Questions From Twitter | Tech Support | WIRED',
 'Drake - Nice For What',
 "BTS (방탄소년단) 'FAKE LOVE' Official MV",
 'Daddy Yankee - Hielo (Video Oficial)',
 'Freeze! NZ Police’s most entertaining recruitment video, yet!',
 "How Michael B. Jordan's Black Panther Makeup Was Done — Exclusive Behind The Scenes",
 'The Smallest House In The World',
 'The Chainsmokers - You Owe Me',
 'Cardi B - Bartier Cardi (feat. 21 Savage) [Official Audio]',
 'Wizards Give Bad Directions',
 'Are Critics What Killed Justice League? Sony Pay Too Much For Tarantino? - The John Campea Show',
 'MY MY MY! TRAILE

In [10]:
# Add a cleaned copy of every title (lower-cased, punctuation removed by string_cleaner)
df["title_clean"] = df["title"].apply(string_cleaner)

# preview the first three rows
df.head(3)

Unnamed: 0,title,title_clean
3637,Americans Try To Explain The Royal Family,americans try to explain the royal family
4488,Testing Nail Products from Wish Nails Remove Toes Hollow Nail Polish Peel-off Nails Wish Buy Now,testing nail products from wish nails remove toes hollow nail polish peeloff nails wish buy now
36325,NERF Hide Your Weapon Challenge!,nerf hide your weapon challenge


# Getting Encodings

In [11]:
# Encode the cleaned titles with the sentence-transformer model loaded at the top of the notebook
embs = model.encode(df["title_clean"].to_numpy())

In [12]:
# Sanity check of the encodings (the printed labels are in Norwegian):
# container type, number of embeddings, length of one embedding, and the
# first 20 values of the first embedding
print(f"""
{type(embs)}
hvor langt er selve embs: {len(embs)}
hvor langt et er et element: {len(embs[0])}
hva er første element i embs:
{embs[0][0:20]}
""")


<class 'numpy.ndarray'>
hvor langt er selve embs: 2047
hvor langt et er et element: 768
hva er første element i embs:
[-0.00952766  0.03230976  0.00629791  0.0193039  -0.0046478   0.02963522
 -0.04211115  0.00308626  0.01404309  0.02988292  0.00276043 -0.00798842
  0.01713376 -0.0602063  -0.02199618 -0.03249558  0.024577   -0.01760846
  0.01674838 -0.02218898]



In [13]:
# Store every embedding vector as a single cell of a new "embs" column
df["embs"] = list(embs)

# preview the first three rows
df.head(3)

Unnamed: 0,title,title_clean,embs
3637,Americans Try To Explain The Royal Family,americans try to explain the royal family,"[-0.009527659, 0.03230976, 0.006297907, 0.019303905, -0.004647805, 0.029635219, -0.042111155, 0.0030862591, 0.014043092, 0.02988292, 0.0027604308, -0.007988418, 0.017133761, -0.060206298, -0.02199..."
4488,Testing Nail Products from Wish Nails Remove Toes Hollow Nail Polish Peel-off Nails Wish Buy Now,testing nail products from wish nails remove toes hollow nail polish peeloff nails wish buy now,"[0.032359283, 0.0020081406, -0.037049524, 0.010657191, -0.04816739, 0.013827549, 0.05480524, 0.072066315, 0.06314754, 0.012768811, 0.009375898, 0.0018209312, 0.014223949, 0.04230842, 0.0010414064,..."
36325,NERF Hide Your Weapon Challenge!,nerf hide your weapon challenge,"[0.008093136, 0.029868003, -0.007413508, 0.05254078, -0.046362087, -0.0072559356, 0.063857436, 0.032383356, -0.034648493, 0.00055324804, 0.00017251789, 0.031673957, -0.026213238, 0.0054176454, -0...."


# Dimensionality Reduction

In [14]:
# Configure the UMAP reducer used to project the embeddings to fewer dimensions
# NOTE(review): no random_state is set, so the layout presumably varies between runs — confirm
umap = UMAP(n_neighbors=20, min_dist=0.1)

# Fit on the embeddings and return the reduced coordinates (one row per title)
embs_2d = umap.fit_transform(embs)

In [15]:
# Sanity check of the reduced embeddings (the printed labels are in Norwegian):
# container type, number of rows, length of one row, and the first row
print(f"""
{type(embs_2d)}
hvor langt er selve embs_2d: {len(embs_2d)}
hvor langt et er et element: {len(embs_2d[0])}
hva er første element i embs_2d:
{embs_2d[0]}
""")


<class 'numpy.ndarray'>
hvor langt er selve embs_2d: 2047
hvor langt et er et element: 2
hva er første element i embs_2d:
[6.947312  4.3327556]



In [16]:
# Store every reduced coordinate pair as a single cell of a new "embs_2d" column
df["embs_2d"] = list(embs_2d)

# preview the first three rows
df.head(3)

Unnamed: 0,title,title_clean,embs,embs_2d
3637,Americans Try To Explain The Royal Family,americans try to explain the royal family,"[-0.009527659, 0.03230976, 0.006297907, 0.019303905, -0.004647805, 0.029635219, -0.042111155, 0.0030862591, 0.014043092, 0.02988292, 0.0027604308, -0.007988418, 0.017133761, -0.060206298, -0.02199...","[6.947312, 4.3327556]"
4488,Testing Nail Products from Wish Nails Remove Toes Hollow Nail Polish Peel-off Nails Wish Buy Now,testing nail products from wish nails remove toes hollow nail polish peeloff nails wish buy now,"[0.032359283, 0.0020081406, -0.037049524, 0.010657191, -0.04816739, 0.013827549, 0.05480524, 0.072066315, 0.06314754, 0.012768811, 0.009375898, 0.0018209312, 0.014223949, 0.04230842, 0.0010414064,...","[5.6373625, 0.77200705]"
36325,NERF Hide Your Weapon Challenge!,nerf hide your weapon challenge,"[0.008093136, 0.029868003, -0.007413508, 0.05254078, -0.046362087, -0.0072559356, 0.063857436, 0.032383356, -0.034648493, 0.00055324804, 0.00017251789, 0.031673957, -0.026213238, 0.0054176454, -0....","[4.376295, 5.010812]"


In [17]:
# Scatter plot of the two reduced coordinates, one point per title
fig = px.scatter(x=embs_2d[:,0], y=embs_2d[:,1])

# fixed square canvas with small markers
fig.update_layout(width=800, height=800)
fig.update_traces(marker=dict(size=2))

# plotting to show how the embeddings are when just dimensionality reduction is used
fig_show_save(fig, "umap-scatter")

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido


In [18]:
# Cluster the reduced coordinates with HDBSCAN
clusters_2d = HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf").fit(embs_2d)

# Label -1 marks outliers, so it is removed explicitly before counting clusters;
# subtracting 1 unconditionally would under-count when no row is an outlier.
print(f"""
    2D
    Number of clusters: {len(set(clusters_2d.labels_) - {-1})}
    Number of rows as outliers: {clusters_2d.labels_.tolist().count(-1)}
""")


    2D
    Number of clusters: 50
    Number of rows as outliers: 999



In [19]:
# Attach the HDBSCAN label of every row (-1 = outlier) as "cluster_id"
df["cluster_id"] = clusters_2d.labels_

# preview the first three rows
df.head(3)

Unnamed: 0,title,title_clean,embs,embs_2d,cluster_id
3637,Americans Try To Explain The Royal Family,americans try to explain the royal family,"[-0.009527659, 0.03230976, 0.006297907, 0.019303905, -0.004647805, 0.029635219, -0.042111155, 0.0030862591, 0.014043092, 0.02988292, 0.0027604308, -0.007988418, 0.017133761, -0.060206298, -0.02199...","[6.947312, 4.3327556]",23
4488,Testing Nail Products from Wish Nails Remove Toes Hollow Nail Polish Peel-off Nails Wish Buy Now,testing nail products from wish nails remove toes hollow nail polish peeloff nails wish buy now,"[0.032359283, 0.0020081406, -0.037049524, 0.010657191, -0.04816739, 0.013827549, 0.05480524, 0.072066315, 0.06314754, 0.012768811, 0.009375898, 0.0018209312, 0.014223949, 0.04230842, 0.0010414064,...","[5.6373625, 0.77200705]",-1
36325,NERF Hide Your Weapon Challenge!,nerf hide your weapon challenge,"[0.008093136, 0.029868003, -0.007413508, 0.05254078, -0.046362087, -0.0072559356, 0.063857436, 0.032383356, -0.034648493, 0.00055324804, 0.00017251789, 0.031673957, -0.026213238, 0.0054176454, -0....","[4.376295, 5.010812]",-1


# TODO
- plot
- high-dimensional clustering
- add titles to the plot
- show clusters ranked
- maybe look at word value counts per cluster

In [None]:
# Build the clustered / outlier scatter traces from the 2-D embeddings, the
# HDBSCAN labels and the cleaned titles
trace_cluster_2d, trace_outlier_2d = result_tracer_wrapper(embs_2d, clusters_2d.labels_, df["title_clean"].to_numpy())


# subplotter expects rows -> columns -> traces; this is the single cell (row 1, column 1)
col11 = [trace_cluster_2d, trace_outlier_2d]


# the only row holds that single column
row1 = [col11]


# the full grid: one row
trace_list = [row1]

# one subplot, hence one title
fig = subplotter(trace_list, ["Topics by HDBSCAN Cluster", ])

# writes the figure under figures/ and shows it
fig_show_save(fig, "topics-by-hdbscan-clusters")

# Disabled code kept as a bare string literal (it is not executed): it would
# build the result frame and list the 20 largest (cluster_label, topics)
# groups by number of videos
"""
result_2d = result_df_maker(embs_2d, clusters_2d.labels_, df["title_clean"].to_numpy())

result_2d[["cluster_label", "topics"]].groupby(["cluster_label", "topics"])["topics"].count().reset_index(name="vidoes_count").sort_values(by="vidoes_count", ascending=False).head(20)
"""

In [None]:
# Group the rows by their cluster id
dfcg = df.groupby(["cluster_id"])

# Collect every remaining column into one list per cluster
dfcg = dfcg.agg(list)

# Turn cluster_id back into a regular column
dfcg = dfcg.reset_index()

# preview the first three clusters
dfcg.head(3)

Unnamed: 0,cluster_id,title,title_clean,embs,embs_2d
0,-1,"[Narcos - Season 4 | Teaser [HD] I Netflix, John Mayer - New Light (Premium Content!), 世界で一番切れるパスタの包丁を作りたい！, singing the same song 6 years later.., The Ultimate Red Lip Look | Nicole Guerriero, Th...","[narcos season 4 teaser hd i netflix, john mayer new light premium content, 世界で一番切れるパスタの包丁を作りたい, singing the same song 6 years later, the ultimate red lip look nicole guerriero, the weirdest m...","[[-0.0667129, 0.06655551, -0.026436742, -0.0069335666, -0.00504826, 0.016876755, 0.0023019437, 0.006201012, -0.014044925, 0.01971911, 0.0012816396, -0.04646158, -0.015638025, 0.073019825, 0.009613...","[[11.840066, 7.4985857], [11.953117, 12.808144], [10.026723, 13.0706], [11.4126835, 12.34074], [7.718423, 12.980728], [7.3262234, 9.097649], [11.038841, 7.5897055], [11.416008, 11.547061], [8.0286..."
1,0,"[Real Madrid vs. Liverpool | 2017-18 UEFA Champions League Final Highlights, PSG 1-2 Real Madrid | RONALDO & HIS TEAMMATES IN THE DRESSING ROOM: Celebrations, Roma vs. Liverpool | 2017-18 UEFA Cha...","[real madrid vs liverpool 201718 uefa champions league final highlights, psg 12 real madrid ronaldo his teammates in the dressing room celebrations, roma vs liverpool 201718 uefa champions lea...","[[-0.0269576, -0.06661856, 0.014376589, 0.01771011, 0.020629233, 0.008125495, -0.099945396, 0.041083433, -0.039946366, -0.007319208, -0.006689946, 0.025209751, 0.03762367, -0.053506248, 0.04864922...","[[8.741208, 6.6592283], [8.701977, 6.6592607], [8.733076, 6.6889124], [8.665647, 6.8511167], [8.798641, 6.662543], [8.783636, 6.75002], [8.727512, 6.7018895], [8.674904, 6.674897], [8.76352, 6.706..."
2,1,"[Top 10 Plays of the Night: January 6, 2018, Top 10 Plays of the Night: January 27, 2018, Let's talk about Colin Kaepernick | Chart Party, Never Bet Your Money On Another Man's Game, Top 5 Plays o...","[top 10 plays of the night january 6 2018, top 10 plays of the night january 27 2018, lets talk about colin kaepernick chart party, never bet your money on another mans game, top 5 plays of the n...","[[-0.046044108, 0.026989933, -0.014179003, -0.021738028, -0.011217805, -0.029685352, -0.0053041033, 0.03616013, 0.014296963, -0.023670096, 0.01899161, 0.047746863, 0.019091759, 0.025366396, 0.0493...","[[9.450432, 7.3553696], [9.452858, 7.3683295], [9.488768, 7.381989], [9.402591, 7.333701], [9.422234, 7.3112636], [9.423366, 7.323855], [9.511345, 7.2999], [9.479753, 7.2313175], [9.396548, 7.4073..."
