# Imports

In [44]:
# regex
import re

# pandas + numpy
import numpy as np
import pandas as pd

# setting pandas options
pd.set_option('display.max_colwidth', 200)


# storing and loading models
import pickle

# to set types for functions
from typing import Tuple

# Plotting
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# gpu debug
import torch

# setting device to use GPU for NLP backend if you have GPU available
device = "cuda" if torch.cuda.is_available() else "cpu"


# SBERT
from sentence_transformers import SentenceTransformer

# UMAP
from umap import UMAP

#HDBSCAN
from hdbscan import HDBSCAN

# topic finding
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading model from pickle if possible, to avoid downloading it again.
# NOTE(review): unpickling runs arbitrary code, so the cache file must only
# ever be one that this notebook wrote itself.
model_path = f'model-{device}.pkl'

try:
    # `with` closes the file handle even when unpickling fails
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    model_load = True

# A missing, truncated or incompatible cache file must all fall back to a
# fresh download, so the handler stays broad -- but `Exception` (unlike a bare
# `except:`) still lets KeyboardInterrupt and SystemExit through.
except Exception:
    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    # cache the model so the next run can skip the download
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    model_load = False

print(f"""
GPUs detected:          {torch.cuda.device_count()}
Using GPU:              {torch.cuda.is_available()}
Device:                 {device}
Got model from pickle:  {model_load}
""")


GPUs detected:          0
Using GPU:              False
Device:                 cpu
Got model from pickle:  True



# Functions

In [45]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
    """
    Rank the words of every cluster by summed TF-IDF weight and keep the top ones.

    Args:
        input (list): One entry per cluster id, each entry holding the title strings of that cluster.
        num_words (int, optional): Maximum number of words kept per cluster. Defaults to 5.

    Returns:
        list: One array of words per entry of `input`, highest summed weight first.
    """

    def top_words(cluster_titles):
        # a fresh vectorizer per cluster: vocabulary and weights are fitted on that cluster only
        tfidf = TfidfVectorizer(stop_words='english')
        weights = tfidf.fit_transform(cluster_titles)

        # total weight of each vocabulary word over all titles of the cluster
        totals = np.asarray(weights.sum(axis=0)).ravel()

        # argsort is ascending, so reverse it to get the heaviest words first
        ranking = np.argsort(totals)[::-1]

        vocabulary = np.array(tfidf.get_feature_names_out())
        return vocabulary[ranking[:num_words]]

    return [top_words(cluster_titles) for cluster_titles in input]

## Cleaning

In [46]:
def string_cleaner(input: str) -> str:
    """
    Normalise a string for the later NLP steps.

    The text is lowercased and every character that is neither a word
    character nor whitespace is dropped.

    Args:
        input (str): Raw string.

    Returns:
        str: Lowercased string without punctuation or other symbols.
    """
    lowered = input.lower()

    # [^\w\s] = anything except word characters (letters, digits, underscore) and whitespace;
    # matches are deleted without a replacement, so "how-to" becomes "howto"
    return re.sub(r'[^\w\s]', '', lowered)

## Topic Modeling

In [47]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
  """
  Function that finds the most relevant words per cluster id.

  For every cluster a TF-IDF model is fitted on that cluster's titles only,
  the weights of each word are summed over the titles, and the words with the
  highest totals are kept.

  Args:
      input (list): A list of title strings aggregated by cluster id.
      num_words (int, optional): How many words you want. Defaults to 5.

  Returns:
      list: One numpy array of words per cluster (highest summed weight first),
      so the list has the same length as `input`. A cluster whose titles hold
      no usable word (only stop words / empty strings) gets an empty array.

  Raises:
      ValueError: Re-raised from the vectorizer for anything other than the
      "empty vocabulary" case.
  """

  most_relevant_words = []

  for corpus in input:

    vectorizer = TfidfVectorizer(stop_words='english')

    try:
      X = vectorizer.fit_transform(corpus)
    except ValueError as err:
      # scikit-learn signals "no word survived tokenising/stop-word removal"
      # with this message; one such cluster must not abort the whole run
      if "empty vocabulary" not in str(err):
        raise
      most_relevant_words.append(np.array([], dtype=object))
      continue

    # total weight per vocabulary word; argsort is ascending, hence the reversal
    importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(vectorizer.get_feature_names_out())
    most_relevant_words.append(tfidf_feature_names[importance[:num_words]])

  return most_relevant_words



def topic_by_clusterId(result: pd.DataFrame) -> dict:
  """
  Build a lookup from cluster id to the most relevant words of that cluster.

  Args:
      result (pd.DataFrame): Dataframe holding at least the columns "titles" and "cluster_label".

  Returns:
      dict: Cluster id -> words returned by tfidf_most_relevant_word for that cluster.
  """

  # one row per cluster id, with all titles of that cluster collected in a list
  titles_per_cluster = (
    result[["titles", "cluster_label"]]
    .groupby("cluster_label")
    .agg(list)
    .reset_index()
  )

  titles_per_cluster["topics"] = tfidf_most_relevant_word(titles_per_cluster["titles"])

  return {
    cluster_id: topics
    for cluster_id, topics in zip(titles_per_cluster["cluster_label"], titles_per_cluster["topics"])
  }

## Plotting Functions

In [48]:
# when you actually cast the type here, then it works with how pandas casts types and you don't have to worry about copying seriers
def result_df_maker(embeddings: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> pd.DataFrame:
  """
  Assemble the plotting dataframe: 2D coordinates, title, cluster label and topic string per point.

  Args:
      embeddings (np.ndarray): 2D array whose two columns become "x" and "y".
      cluster_labels (np.ndarray): array of cluster labels.
      titles (np.ndarray): array of titles.

  Returns:
      pd.DataFrame: Columns "x", "y", "titles", "cluster_label" and "topics",
      where "topics" is the space-joined word list of the point's cluster.
  """
  frame = pd.DataFrame(embeddings, columns=['x', 'y'])
  frame["titles"] = titles
  frame["cluster_label"] = cluster_labels

  # cluster id -> most relevant words of that cluster
  words_by_cluster = topic_by_clusterId(frame)

  # look up the words of each row's cluster and flatten them into one string
  frame["topics"] = frame["cluster_label"].apply(
    lambda cluster_id: " ".join(words_by_cluster[cluster_id])
  )

  return frame

def result_splitter(result: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Function to split the dataframe into two dataframes, one for clustered and one for outliers.

  Rows whose "cluster_label" is -1 are treated as outliers; every other row
  counts as clustered. The original index of the rows is kept.

  Args:
      result (pd.DataFrame): Dataframe with a "cluster_label" column (plus whatever other columns it carries).

  Returns:
      Tuple[pd.DataFrame, pd.DataFrame]: (clustered, outliers) dataframes.
  """

  clustered = result.loc[result.cluster_label != -1, :]
  outliers = result.loc[result.cluster_label == -1, :]
  return clustered, outliers

# the cavalry is not here, but it's fine! Why? I am here!
def result_tracer(clustered: pd.DataFrame, outliers: pd.DataFrame) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Build the two scatter traces of a cluster plot.

  Args:
      clustered (pd.DataFrame): clustered points; coloured by cluster label and given hover data
      outliers (pd.DataFrame): outlier points; drawn in grey with a fixed "Outlier" hover text

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (clustered trace, outlier trace).
  """

  # one customdata row per clustered point: [title, topics, cluster id],
  # referenced by position in the hover template below
  hover_rows = np.column_stack([clustered.titles, clustered.topics, clustered.cluster_label])

  cluster_marker = {
    "size": 2,
    "color": clustered.cluster_label,
    "colorscale": "Rainbow",
  }
  outlier_marker = {"size": 1, "color": "grey"}

  trace_outlier = go.Scattergl(
    x=outliers.x,
    y=outliers.y,
    mode="markers",
    name="Outliers",
    marker=outlier_marker,
    hovertemplate="Outlier<extra></extra>",
  )

  trace_cluster = go.Scattergl(
    x=clustered.x,
    y=clustered.y,
    mode="markers",
    name="Clustered",
    marker=cluster_marker,
    hovertemplate="<b>Title:</b> %{customdata[0]} <br><b>Topics:</b> %{customdata[1]} <br><b>Cluster Id:</b> %{customdata[2]}<extra></extra>",
    customdata=hover_rows,
  )

  return trace_cluster, trace_outlier

def result_tracer_wrapper(uembs: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Go from raw arrays to the two scatter traces in one call.

  Chains result_df_maker -> result_splitter -> result_tracer.

  Args:
      uembs (np.ndarray): 2D array of embeddings.
      cluster_labels (np.ndarray): array of cluster labels.
      titles (np.ndarray): array of titles.

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (clustered trace, outlier trace).
  """

  frame = result_df_maker(uembs, cluster_labels, titles)

  # result_splitter yields (clustered, outliers), which is exactly result_tracer's argument order
  return result_tracer(*result_splitter(frame))

In [49]:
def subplotter(trace_nested_list: list, titles: list, base_size=1000) -> go.Figure:
    """
    Function to make a figure with subplots of the clustered and outliers.

    Args:
        trace_nested_list (list): list holding rows of columns, each column holding traces.
            The number of columns is taken from the first row.
        titles (list): Titles for the subplots
        base_size (int, optional): Base size of the sub plots. Defaults to 1000.

    Returns:
        go.Figure: Figure with subplots.
    """

    row_count = len(trace_nested_list)
    col_count = len(trace_nested_list[0])

    fig = make_subplots(
        rows=row_count,
        cols=col_count,
        subplot_titles=(titles),
        vertical_spacing=0.02,
        horizontal_spacing=0.02
    )

    # plotly subplot positions are 1-based, hence start=1
    for i, row in enumerate(trace_nested_list, start=1):
        for j, col in enumerate(row, start=1):

            # adding both outliers and clustered to the subplot at (row i, column j);
            # the column index must follow j, otherwise every trace lands in column 1
            for trace in col:
                fig.add_trace(trace, row=i, col=j)

    # figure settings
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    # every subplot gets roughly base_size x base_size pixels
    fig.update_layout(width=base_size*col_count, height=base_size*row_count, plot_bgcolor='rgba(250,250,250,1)')

    return fig

## Saving / Showing Plots

In [50]:
def fig_show_save(fig: go.Figure, filename: str, show=True):
  """
  Function to show and save a figure.

  The figure is written to "figures/<filename>.html" and
  "figures/<filename>.png"; the "figures" folder is created when missing.

  Args:
      fig (go.Figure): fig to be saved and shown
      filename (str): filename to save the figure, without extension
      show (bool, optional): Option to disable showing of figure (in case too big for notebook). Defaults to True.
  """
  # function-scope import so this definition does not depend on the import cell
  from pathlib import Path

  # make sure the output folder exists so the writes below cannot fail on a fresh checkout
  Path("figures").mkdir(parents=True, exist_ok=True)

  # writing both interactive .html and static image .png
  fig.write_html(f"figures/{filename}.html")
  fig.write_image(f"figures/{filename}.png")

  if show:
    fig.show()

# Data Part

In this part we read the CSV file "USvideos.csv" with pandas' `read_csv` function to create a pandas DataFrame. We only concentrate on the titles in this assignment, so we create the DataFrame `df` containing only the title column from `df_whole`.
Then there is a special case for computers that cannot use GPU acceleration for the ML part: it takes a small random sample (5%) of the data so that the notebook runs faster.
The last two lines just let us see what we created: we print the shape of `df` and then show its first 3 rows.

In [51]:
# got data from kaggle: https://www.kaggle.com/datasets/datasnaek/youtube-new?resource=download

# full dataset, all columns
df_whole = pd.read_csv("data/USvideos.csv")

# working frame with only the title column; .copy() so later column assignments do not touch df_whole
df = df_whole[["title"]].copy()

# if your computer does not have GPU support, you can use a sample of the dataset instead to make it run in a reasonable time
# if you want to use the full dataset even without GPU in case you have a very strong CPU, then you can just comment out the next line
# NOTE: no random_state is passed, so the sampled rows can differ between runs
if device == "cpu": df = df.sample(frac=0.05)

# (rows, columns) of the working frame
print(df.shape)

# preview of the first rows (shown as the cell output)
df.head(3)

(2047, 1)


Unnamed: 0,title
7472,Another Holiday Commercial
15586,German Shepherd being a great attack dog
6990,Fighting California's Wildfires: Stunning Footage from the Front Lines


## Cleaning

The titles from YouTube are cluttered with emojis and special characters that do not give any information to the ML algorithms we are going to use later. Therefore we run the `string_cleaner` function to lowercase the titles and remove the characters we don't need in the later parts.
The last line is just to check that the new clean titles were actually stored in `df`.

In [52]:
# add a cleaned version of every title by running string_cleaner on each one
df["title_clean"] = df["title"].apply(string_cleaner)

# preview to confirm the new column is present
df.head(3)

Unnamed: 0,title,title_clean
7472,Another Holiday Commercial,another holiday commercial
15586,German Shepherd being a great attack dog,german shepherd being a great attack dog
6990,Fighting California's Wildfires: Stunning Footage from the Front Lines,fighting californias wildfires stunning footage from the front lines


# ML Part

## Getting Encodings

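`embs` is a NumPy ndarray (n-dimensional array) created from the clean titles in `df`; in this case it is a 2-dimensional array with one row per title and 768 floats per row (see the printout below). The NumPy array of titles is sent to the SentenceTransformer model, which we loaded from a pickle file, to be encoded. Pickle is a library that takes a Python object and creates a "byte stream" from it, so that the object can be stored on disk and loaded again later. The result of the encoding is a whole bunch of floats: one 768-dimensional vector per title.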

The next part is printing some values from the embs variable, just to see what we created.

In [53]:
# encode every cleaned title with the sentence-embedding model loaded above
embs = model.encode(df["title_clean"].to_numpy())

In [54]:
# inspect the embeddings: type, number of embeddings, length of one embedding,
# and the first 20 values of the first embedding (the printed labels are in Norwegian)
print(f"""
{type(embs)}
hvor langt er selve embs: {len(embs)}
hvor langt et er et element: {len(embs[0])}
hva er første element i embs:
{embs[0][0:20]}
""")


<class 'numpy.ndarray'>
hvor langt er selve embs: 2047
hvor langt et er et element: 768
hva er første element i embs:
[ 0.01375684  0.04277974 -0.02076503 -0.01860791  0.01514913  0.01745451
 -0.01420375 -0.01810662 -0.0726046  -0.01643376  0.04552158 -0.057751
  0.05076957  0.02838483 -0.00155469 -0.07070419  0.01915802  0.01445961
  0.02491704 -0.02340465]



Here we store the embedding vectors from the `embs` ndarray in `df` under the column name "embs". We have to wrap the array in `list` so that each row of the column gets its own 768-dimensional vector.

In [43]:
# list(...) splits the array along its first axis, so each row of df gets one embedding vector
df["embs"] = list(embs)
# preview to confirm the new column is present
df.head(3)

Unnamed: 0,title,title_clean,embs,embs_2b,cluster_id
18587,*SPOILER* 4th Eliminated Queen RuPaul's All Stars 3: Behind the Scenes,spoiler 4th eliminated queen rupauls all stars 3 behind the scenes,"[0.004531481, -0.047333, 0.01654481, 0.0680197, 0.012752331, 0.06016765, -0.05804439, 0.012055915, -0.052730627, 0.04515409, 0.024579685, -0.033766676, -0.013822519, 0.01565219, 0.022297572, -0.02...","[10.477215, 2.0818236]",45
14903,How-To: Make a Deadliest Catch Pineapple Bowl with Trap Kitchen,howto make a deadliest catch pineapple bowl with trap kitchen,"[0.041717667, 0.0022279853, -0.007166611, -0.033043846, -0.025676552, -0.019236201, -0.049276374, 0.02583269, -0.0017067442, -0.00050672365, -0.015143436, -0.03707369, 0.004563747, 0.062019918, 0....","[4.711583, 0.22323248]",-1
23698,Carrie Underwood - The Champion ft. Ludacris,carrie underwood the champion ft ludacris,"[-0.03194607, 0.061770156, -0.022123119, 0.0092906505, -0.019104972, -0.0063905823, -0.0442067, -0.0062427185, -0.030885503, -0.0049865474, -0.00079759624, 0.0050578997, 0.025917465, 0.04812967, -...","[11.875835, 0.92784184]",-1


## Dimensionality Reduction

Here we use the UMAP algorithm to reduce the 768-dimensional vectors in `embs` to 2-dimensional ones. UMAP looks at how the points relate to their neighbours in the high-dimensional space and creates a 2-dimensional version that preserves that structure as well as possible. We do this so that we can visualize the final cluster plot in 2 dimensions. The parameters `n_neighbors` and `min_dist` can be used to fine-tune the algorithm to get the result we want.
The `fit_transform` function does the processing itself; the `umap` variable is just a model object from the UMAP library.

In [35]:
# UMAP model; n_neighbors and min_dist are the tuning knobs used in this notebook
# NOTE: the variable name `umap` is the same as the name of the library module UMAP was imported from
umap = UMAP(n_neighbors=20, min_dist=0.1)

# fit on the sentence embeddings and return their low-dimensional projection
embs_2d = umap.fit_transform(embs)

In [36]:
# inspect the reduced embeddings: type, number of points, length of one point,
# and the first point (the printed labels are in Norwegian)
print(f"""
{type(embs_2d)}
hvor langt er selve embs_2d: {len(embs_2d)}
hvor langt et er et element: {len(embs_2d[0])}
hva er første element i embs_2d:
{embs_2d[0]}
""")


<class 'numpy.ndarray'>
hvor langt er selve embs_2d: 2047
hvor langt et er et element: 2
hva er første element i embs_2d:
[10.477215   2.0818236]



Here we put the new 2-dimensional values into `df` as well, and show the first rows to see that it worked.

In [37]:
# store one reduced point per row
# NOTE(review): the column is named "embs_2b" while the variable is embs_2d -- possibly a typo, confirm before renaming
df["embs_2b"] = list(embs_2d)

# preview to confirm the new column is present
df.head(3)

Unnamed: 0,title,title_clean,embs,embs_2b
18587,*SPOILER* 4th Eliminated Queen RuPaul's All Stars 3: Behind the Scenes,spoiler 4th eliminated queen rupauls all stars 3 behind the scenes,"[0.004531481, -0.047333, 0.01654481, 0.0680197, 0.012752331, 0.06016765, -0.05804439, 0.012055915, -0.052730627, 0.04515409, 0.024579685, -0.033766676, -0.013822519, 0.01565219, 0.022297572, -0.02...","[10.477215, 2.0818236]"
14903,How-To: Make a Deadliest Catch Pineapple Bowl with Trap Kitchen,howto make a deadliest catch pineapple bowl with trap kitchen,"[0.041717667, 0.0022279853, -0.007166611, -0.033043846, -0.025676552, -0.019236201, -0.049276374, 0.02583269, -0.0017067442, -0.00050672365, -0.015143436, -0.03707369, 0.004563747, 0.062019918, 0....","[4.711583, 0.22323248]"
23698,Carrie Underwood - The Champion ft. Ludacris,carrie underwood the champion ft ludacris,"[-0.03194607, 0.061770156, -0.022123119, 0.0092906505, -0.019104972, -0.0063905823, -0.0442067, -0.0062427185, -0.030885503, -0.0049865474, -0.00079759624, 0.0050578997, 0.025917465, 0.04812967, -...","[11.875835, 0.92784184]"


## Clustering 2D data

This is a plot of the values we got from UMAP. We set it up so that the x-value is the first number and the y-value is the second number of each 2-dimensional embedding.

In [38]:
# raw scatter of the reduced embeddings: first column on x, second column on y
fig = px.scatter(x=embs_2d[:,0], y=embs_2d[:,1])

# fixed square canvas and small markers
fig.update_layout(width=800, height=800)
fig.update_traces(marker=dict(size=2))

# saved under figures/ as "test" (html + png) and shown in the notebook
fig_show_save(fig, "test")

In [39]:
# density-based clustering of the 2D points
clusters_2d = HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf").fit(embs_2d)

# -1 is the outlier label, not a cluster. Removing it explicitly (instead of
# always subtracting 1) keeps the count right when no point is an outlier.
n_clusters = len(set(clusters_2d.labels_) - {-1})

print(f"""
    Number of clusters: {n_clusters}
    Number of rows as outliers: {clusters_2d.labels_.tolist().count(-1)}
""")


    Number of clusters: 47
    Number of rows as outliers: 1072



In [40]:
# attach the cluster label of every point to its row (-1 marks an outlier)
df["cluster_id"] = clusters_2d.labels_
# preview to confirm the new column is present
df.head(3)


Unnamed: 0,title,title_clean,embs,embs_2b,cluster_id
18587,*SPOILER* 4th Eliminated Queen RuPaul's All Stars 3: Behind the Scenes,spoiler 4th eliminated queen rupauls all stars 3 behind the scenes,"[0.004531481, -0.047333, 0.01654481, 0.0680197, 0.012752331, 0.06016765, -0.05804439, 0.012055915, -0.052730627, 0.04515409, 0.024579685, -0.033766676, -0.013822519, 0.01565219, 0.022297572, -0.02...","[10.477215, 2.0818236]",45
14903,How-To: Make a Deadliest Catch Pineapple Bowl with Trap Kitchen,howto make a deadliest catch pineapple bowl with trap kitchen,"[0.041717667, 0.0022279853, -0.007166611, -0.033043846, -0.025676552, -0.019236201, -0.049276374, 0.02583269, -0.0017067442, -0.00050672365, -0.015143436, -0.03707369, 0.004563747, 0.062019918, 0....","[4.711583, 0.22323248]",-1
23698,Carrie Underwood - The Champion ft. Ludacris,carrie underwood the champion ft ludacris,"[-0.03194607, 0.061770156, -0.022123119, 0.0092906505, -0.019104972, -0.0063905823, -0.0442067, -0.0062427185, -0.030885503, -0.0049865474, -0.00079759624, 0.0050578997, 0.025917465, 0.04812967, -...","[11.875835, 0.92784184]",-1


# Results

## Plotting the results

In [41]:
# build the clustered / outlier traces from the 2D points, their labels and the cleaned titles
trace_cluster_2d, trace_outlier_2d = result_tracer_wrapper(embs_2d, clusters_2d.labels_, df["title_clean"].to_numpy())


# subplotter expects rows -> columns -> traces; here that is a single subplot (row 1, column 1)
col11 = [trace_cluster_2d, trace_outlier_2d]


row1 = [col11]


trace_list = [row1]

fig = subplotter(trace_list, ["Topics by HDBSCAN Cluster", ])

# saved under figures/ (html + png) and shown in the notebook
fig_show_save(fig, "topics-by-hdbscan-clusters")

## Showing topic per cluster

In [42]:
# one row per cluster id; every other column is collapsed into the list of that cluster's values
dfcg = df.groupby(["cluster_id"]).agg(list).reset_index()
dfcg.head(3)

Unnamed: 0,cluster_id,title,title_clean,embs,embs_2b
0,-1,"[How-To: Make a Deadliest Catch Pineapple Bowl with Trap Kitchen, Carrie Underwood - The Champion ft. Ludacris, Maze Runner: The Death Cure | Any Ideas Clip | 20th Century FOX, 5 Reasons to Buy a ...","[howto make a deadliest catch pineapple bowl with trap kitchen, carrie underwood the champion ft ludacris, maze runner the death cure any ideas clip 20th century fox, 5 reasons to buy a 188 lap...","[[0.041717667, 0.0022279853, -0.007166611, -0.033043846, -0.025676552, -0.019236201, -0.049276374, 0.02583269, -0.0017067442, -0.00050672365, -0.015143436, -0.03707369, 0.004563747, 0.062019918, 0...","[[4.711583, 0.22323248], [11.875835, 0.92784184], [7.1446567, 5.5086713], [6.6786027, -0.17593466], [5.0974317, 0.14011322], [11.831653, 1.2749321], [5.3041344, -0.34131438], [8.86171, -0.34246725..."
1,0,"[YoungBoy Never Broke Again Goes Sneaker Shopping With Complex, Luna Stracci Goes Sneaker Shopping With Halsey and Complex, Shannon Sharpe Goes Sneaker Shopping With Complex, Sneakerheads Try To S...","[youngboy never broke again goes sneaker shopping with complex, luna stracci goes sneaker shopping with halsey and complex, shannon sharpe goes sneaker shopping with complex, sneakerheads try to s...","[[-0.017950041, 0.036332164, -0.005577358, 0.014525459, 0.045893427, 0.014291536, -0.009168323, 0.012867643, 0.026447013, 0.027225558, 0.023591556, 0.07054726, -0.008500265, 0.066723004, -0.031844...","[[6.415239, -2.2148144], [6.458903, -2.1421459], [6.432264, -2.2073042], [6.485964, -2.1413903], [6.3995123, -2.2295356], [6.4049864, -2.1818364], [6.4150243, -2.211089], [6.448775, -2.1930826], [..."
2,1,"[SodaSoak by SodaStream, ETRADE Super Bowl Commercial 2018 This Is Getting Old, T-Mobile | #LittleOnes | 2018 Big Game Ad, Rocket Mortgage Super Bowl 2018 Ad ft. Keegan-Michael Key and Big Sean (...","[sodasoak by sodastream, etrade super bowl commercial 2018 this is getting old, tmobile littleones 2018 big game ad, rocket mortgage super bowl 2018 ad ft keeganmichael key and big sean officia...","[[0.0026250246, 0.00421502, 0.014693317, -0.038649686, -0.0064043594, 0.031905785, -0.010349739, 0.018666098, -0.03415807, 0.036021564, 0.0054302216, 0.0030056108, -0.013534277, 0.09812609, -0.017...","[[10.765723, 4.2094097], [10.544393, 4.3812075], [10.579665, 4.360911], [10.504079, 4.388758], [10.617423, 4.363756], [10.608318, 4.366495], [10.660723, 4.3298383], [10.520604, 4.255566], [10.7251..."
