# Imports

In [9]:
# regex
import re

# pandas + numpy
import numpy as np
import pandas as pd

# setting pandas options
pd.set_option('display.max_colwidth', 200)


# storing and loading models
import pickle

# to set types for functions
from typing import Tuple

# Plotting
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# gpu debug
import torch

# setting device to use GPU for NLP backend if you have GPU available
device = "cuda" if torch.cuda.is_available() else "cpu"


# SBERT
from sentence_transformers import SentenceTransformer

# UMAP
from umap import UMAP

#HDBSCAN
from hdbscan import HDBSCAN

# topic finding
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading model from pickle if possible, to avoid downloading it again.
# NOTE: unpickling executes arbitrary code, so only load a cache file that this
# notebook wrote itself.
try:
    # context manager guarantees the file handle is closed again
    with open(f'model-{device}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    model_load = True

# `Exception` (instead of a bare except) still covers a missing, truncated or
# incompatible cache file, but no longer swallows KeyboardInterrupt/SystemExit.
except Exception:
    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    # cache the freshly created model for the next run
    with open(f'model-{device}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    model_load = False

print(f"""
GPUs detected:          {torch.cuda.device_count()}
Using GPU:              {torch.cuda.is_available()}
Device:                 {device}
Got model from pickle:  {model_load}
""")


GPUs detected:          0
Using GPU:              False
Device:                 cpu
Got model from pickle:  True



# Functions

In [10]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
    """
    Find the highest-scoring TF-IDF terms for each cluster.

    NOTE: an identical function with the same name is defined again further
    down in this file; the later definition is the one in effect at runtime.

    Args:
        input (list): One entry per cluster id; each entry is an iterable of the
            title strings belonging to that cluster.
        num_words (int, optional): Maximum number of terms kept per cluster. Defaults to 5.

    Returns:
        list: One numpy array of terms per entry of ``input`` (same order), sorted
            from highest to lowest summed TF-IDF weight. An array is empty when the
            cluster's titles contain no usable term.

    Raises:
        ValueError: Re-raised from scikit-learn for any problem other than an
            empty vocabulary.
    """
    most_relevant_words = []

    for corpus in input:
        # a fresh vectorizer per cluster: vocabulary and weights are fitted on
        # this cluster's titles only
        vectorizer = TfidfVectorizer(stop_words='english')

        try:
            X = vectorizer.fit_transform(corpus)
        except ValueError as err:
            # Titles made up only of stop words / removed characters give an
            # empty vocabulary; report "no topic words" instead of crashing.
            if "empty vocabulary" not in str(err):
                raise
            most_relevant_words.append(np.array([], dtype=str))
            continue

        # total weight of every term over all titles, highest first
        importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
        tfidf_feature_names = np.array(vectorizer.get_feature_names_out())
        most_relevant_words.append(tfidf_feature_names[importance[:num_words]])

    return most_relevant_words

## Cleaning

In [11]:
def string_cleaner(input: str) -> str:
    """
    Normalise a string for the later text-processing steps.

    The text is lower-cased and every character that is neither a word
    character (letter, digit, underscore) nor whitespace is dropped.
    Whitespace itself is left untouched, so removed punctuation can leave
    consecutive spaces behind.

    Args:
        input (str): Raw text.

    Returns:
        str: Lower-cased text without punctuation or symbols.
    """
    lowered = input.lower()

    # strip everything except word characters and whitespace
    return re.sub(r'[^\w\s]', '', lowered)

## Topic Modeling

In [12]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
    """
    Find the highest-scoring TF-IDF terms for each cluster.

    Args:
        input (list): One entry per cluster id; each entry is an iterable of the
            title strings belonging to that cluster.
        num_words (int, optional): Maximum number of terms kept per cluster. Defaults to 5.

    Returns:
        list: One numpy array of terms per entry of ``input`` (same order), sorted
            from highest to lowest summed TF-IDF weight. An array is empty when the
            cluster's titles contain no usable term.

    Raises:
        ValueError: Re-raised from scikit-learn for any problem other than an
            empty vocabulary.
    """
    most_relevant_words = []

    for corpus in input:
        # a fresh vectorizer per cluster: vocabulary and weights are fitted on
        # this cluster's titles only
        vectorizer = TfidfVectorizer(stop_words='english')

        try:
            X = vectorizer.fit_transform(corpus)
        except ValueError as err:
            # Titles made up only of stop words / removed characters give an
            # empty vocabulary; report "no topic words" instead of crashing.
            if "empty vocabulary" not in str(err):
                raise
            most_relevant_words.append(np.array([], dtype=str))
            continue

        # total weight of every term over all titles, highest first
        importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
        tfidf_feature_names = np.array(vectorizer.get_feature_names_out())
        most_relevant_words.append(tfidf_feature_names[importance[:num_words]])

    return most_relevant_words



def topic_by_clusterId(result: pd.DataFrame) -> dict:
    """
    Build a lookup from cluster id to that cluster's topic words.

    Args:
        result (pd.DataFrame): Frame with a "titles" and a "cluster_label" column.

    Returns:
        dict: Cluster id -> topic words as returned by tfidf_most_relevant_word.
    """
    # one row per cluster id, with all of its titles collected into a list
    grouped = (
        result.loc[:, ["titles", "cluster_label"]]
        .groupby("cluster_label")
        .agg(list)
        .reset_index()
    )

    grouped["topics"] = tfidf_most_relevant_word(grouped["titles"])

    return {
        label: topics
        for label, topics in zip(grouped["cluster_label"], grouped["topics"])
    }

## Plotting Functions

In [13]:
# when you actually cast the type here, it works with how pandas casts types and you don't have to worry about copying Series
def result_df_maker(embeddings: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> pd.DataFrame:
    """
    Combine 2D embeddings, cluster labels and titles into one dataframe and
    attach the topic words of each row's cluster.

    Args:
        embeddings (np.ndarray): 2D array of embeddings, one (x, y) pair per row.
        cluster_labels (np.ndarray): Cluster label per row.
        titles (np.ndarray): Title per row.

    Returns:
        pd.DataFrame: Columns "x", "y", "titles", "cluster_label" and "topics",
            where "topics" is the cluster's topic words joined by spaces.
    """
    result = pd.DataFrame(embeddings, columns=['x', 'y'])

    # np.asarray drops any pandas index, so values are assigned by position.
    # A Series with a non-default index (e.g. from df.sample) would otherwise
    # be aligned on its index and silently produce NaN rows.
    result["titles"] = np.asarray(titles)
    result["cluster_label"] = np.asarray(cluster_labels)

    topic_dict = topic_by_clusterId(result)

    # look up each row's cluster topics and flatten them to a single string
    result["topics"] = result["cluster_label"].apply(lambda label: " ".join(topic_dict[label]))

    return result

def result_splitter(result: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the result dataframe into clustered rows and outlier rows.

    Rows whose "cluster_label" is -1 are treated as outliers; all other rows
    count as clustered. The original index and columns are kept.

    Args:
        result (pd.DataFrame): Dataframe with a "cluster_label" column.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (clustered, outliers).
    """
    # compute the mask once and reuse it for both halves
    is_outlier = result["cluster_label"] == -1

    clustered = result.loc[~is_outlier, :]
    outliers = result.loc[is_outlier, :]
    return clustered, outliers

# the cavalry is not here, but it's fine! Why? I am here!
def result_tracer(clustered: pd.DataFrame, outliers: pd.DataFrame) -> Tuple[go.Scattergl, go.Scattergl]:
    """
    Build one scatter trace for the clustered points and one for the outliers.

    Args:
        clustered (pd.DataFrame): Clustered rows; coloured by cluster label and
            given title / topics / cluster id as hover data.
        outliers (pd.DataFrame): Outlier rows; drawn grey with a fixed hover text.

    Returns:
        Tuple[go.Scattergl, go.Scattergl]: (cluster trace, outlier trace).
    """
    # per-point columns that the hover template addresses as customdata[0..2]
    hover_columns = np.column_stack([clustered.titles, clustered.topics, clustered.cluster_label])
    cluster_hover = "<b>Title:</b> %{customdata[0]} <br><b>Topics:</b> %{customdata[1]} <br><b>Cluster Id:</b> %{customdata[2]}<extra></extra>"

    # small markers, one colour per cluster label
    cluster_marker = dict(size=2, color=clustered.cluster_label, colorscale="Rainbow")

    trace_cluster = go.Scattergl(
        x=clustered.x,
        y=clustered.y,
        mode="markers",
        name="Clustered",
        marker=cluster_marker,
        hovertemplate=cluster_hover,
        customdata=hover_columns,
    )

    # outliers are smaller, grey, and carry no per-point hover data
    outlier_marker = dict(size=1, color="grey")

    trace_outlier = go.Scattergl(
        x=outliers.x,
        y=outliers.y,
        mode="markers",
        name="Outliers",
        marker=outlier_marker,
        hovertemplate="Outlier<extra></extra>",
    )

    return trace_cluster, trace_outlier

def result_tracer_wrapper(uembs: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> Tuple[go.Scattergl, go.Scattergl]:
    """
    Go from raw arrays to the two scatter traces in one call.

    Chains result_df_maker, result_splitter and result_tracer.

    Args:
        uembs (np.ndarray): 2D array of embeddings.
        cluster_labels (np.ndarray): array of cluster labels.
        titles (np.ndarray): array of titles.

    Returns:
        Tuple[go.Scattergl, go.Scattergl]: (cluster trace, outlier trace).
    """
    frame = result_df_maker(uembs, cluster_labels, titles)
    clustered_part, outlier_part = result_splitter(frame)
    return result_tracer(clustered_part, outlier_part)

In [14]:
def subplotter(trace_nested_list: list, titles: list, base_size=1000) -> go.Figure:
    """
    Build a grid of subplots from nested lists of traces.

    Args:
        trace_nested_list (list): List of rows; each row is a list of columns;
            each column is a list of traces drawn into that grid cell.
            The number of columns is taken from the first row.
        titles (list): Titles for the subplots.
        base_size (int, optional): Width and height of one subplot in pixels. Defaults to 1000.

    Returns:
        go.Figure: Figure with subplots.

    Raises:
        IndexError: If trace_nested_list is empty.
    """

    row_count = len(trace_nested_list)
    col_count = len(trace_nested_list[0])

    fig = make_subplots(
        rows=row_count,
        cols=col_count,
        subplot_titles=(titles),
        vertical_spacing=0.02,
        horizontal_spacing=0.02
    )

    for i, row in enumerate(trace_nested_list):
        for j, col in enumerate(row):

            # adding both outliers and clustered
            for trace in col:
                # plotly grid positions are 1-based; use the column index too,
                # otherwise every trace ends up in the first column
                fig.add_trace(trace, row=i+1, col=j+1)

    # figure settings
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    fig.update_layout(width=base_size*col_count, height=base_size*row_count, plot_bgcolor='rgba(250,250,250,1)')

    return fig

## Saving / Showing Plots

In [15]:
def fig_show_save(fig: go.Figure, filename: str, show=True):
    """
    Save a figure under "figures/" as HTML and PNG, and optionally show it.

    Args:
        fig (go.Figure): fig to be saved and shown
        filename (str): filename to save the figure, without extension
        show (bool, optional): Option to disable showing of figure (in case too big for notebook). Defaults to True.
    """
    # local import so this function does not depend on a file-level import
    from pathlib import Path

    html_target = f"figures/{filename}.html"
    png_target = f"figures/{filename}.png"

    # make sure the output directory exists, otherwise writing fails on a fresh checkout
    Path(html_target).parent.mkdir(parents=True, exist_ok=True)

    # writing both interactive .html and static image .png
    # NOTE(review): PNG export presumably needs plotly's static-image backend installed - confirm
    fig.write_html(html_target)
    fig.write_image(png_target)

    if show:
        fig.show()

# Data Part

In this part we read the CSV file "USvideos.csv" with pandas' "read_csv" function to create a pandas DataFrame. We only concentrate on the titles in this assignment, so we create the DataFrame "df" with only the titles from df_whole. 
Then there is an exception for computers that lack GPU acceleration for the ML part: for those we take a smaller random sample of the data to make it faster to run through. 
The last two lines just let us see what we created: the "shape" of the df and then its first 3 rows.

In [16]:
# Source data (Kaggle): https://www.kaggle.com/datasets/datasnaek/youtube-new?resource=download
df_whole = pd.read_csv("data/USvideos.csv")

# only the titles are used; work on a copy so df_whole stays untouched
df = df_whole.loc[:, ["title"]].copy()

# Without GPU support a 5 % random sample keeps the run time reasonable.
# To use the full dataset on a strong CPU anyway, comment out the two lines below.
# NOTE(review): no random_state is passed, so the sample differs between runs.
if device == "cpu":
    df = df.sample(frac=0.05)

print(df.shape)

df.head(3)

(2047, 1)


Unnamed: 0,title
15179,A janitor's son is bullied by rich kids. So his dad stands up for him in the most heartwarming way.
8040,Mamma Mia! Here We Go Again - Trailer
6493,coin cell challenge


## Cleaning

The titles from YouTube are cluttered with emojis and special characters that do not give any information and/or create problems for the ML algorithms that we are going to use later. Therefore we run the string_cleaner function to remove the characters we don't need in the later parts.
The last line is just to see that the new clean titles were actually added to the df.

In [17]:
# Apply string_cleaner to every title and keep the result in a new column,
# leaving the original "title" column unchanged.
df["title_clean"] = df["title"].apply(string_cleaner)

df.head(3)

Unnamed: 0,title,title_clean
15179,A janitor's son is bullied by rich kids. So his dad stands up for him in the most heartwarming way.,a janitors son is bullied by rich kids so his dad stands up for him in the most heartwarming way
8040,Mamma Mia! Here We Go Again - Trailer,mamma mia here we go again trailer
6493,coin cell challenge,coin cell challenge


# ML Part

## Getting Encodings

Embs is a NumPy ndarray (n-dimensional array) created from the clean titles in the df, in this case a 2-dimensional ndarray with one row per title. The NumPy array of titles is sent to the SBERT model (which we load from a pickle file when possible) to be encoded. Pickle is a library that takes a Python object and creates a "byte stream" from it, which is how the model is stored on disk. The result is a whole bunch of floats: one vector of 768 values per title.

The next part is printing some values from the embs variable, just to see what we created.

In [18]:
# Encode the cleaned titles with the sentence-embedding model loaded above;
# the next cell inspects the type and size of the result.
embs = model.encode(df["title_clean"].to_numpy())

In [19]:
# Quick inspection of the embeddings. The labels in the output are Norwegian:
#   "hvor langt er selve embs"      -> number of embedding vectors
#   "hvor langt et er et element"   -> length of a single embedding vector
#   "hva er første element i embs"  -> first 20 values of the first vector
print(f"""
{type(embs)}
hvor langt er selve embs: {len(embs)}
hvor langt et er et element: {len(embs[0])}
hva er første element i embs:
{embs[0][0:20]}
""")


<class 'numpy.ndarray'>
hvor langt er selve embs: 2047
hvor langt et er et element: 768
hva er første element i embs:
[-0.03733551  0.05368667  0.01742675  0.03971022  0.01163898  0.02597906
 -0.02413752 -0.00554852 -0.0416946   0.03564197  0.01334045 -0.00567929
  0.01474648  0.03036969 -0.03649659 -0.05573335 -0.00706238  0.01780203
  0.0015598   0.00298764]



Now we put the embedding vectors from the 2-dimensional ndarray into the df under the column name "embs". We have to wrap the array in "list" so that each row of the array is stored as one separate value in the df.

In [20]:
# list() splits the array along its first axis, so every row of df
# receives one whole embedding vector as a single cell value.
df["embs"] = list(embs)
df.head(3)

Unnamed: 0,title,title_clean,embs
15179,A janitor's son is bullied by rich kids. So his dad stands up for him in the most heartwarming way.,a janitors son is bullied by rich kids so his dad stands up for him in the most heartwarming way,"[-0.03733551, 0.05368667, 0.017426753, 0.03971022, 0.011638979, 0.025979057, -0.024137525, -0.005548518, -0.0416946, 0.03564197, 0.013340446, -0.0056792907, 0.014746477, 0.03036969, -0.03649659, -..."
8040,Mamma Mia! Here We Go Again - Trailer,mamma mia here we go again trailer,"[-0.0010785938, 0.024954392, 0.006947854, -0.036223643, 0.035740424, -0.016962444, -0.07804767, 0.00668523, -0.060773473, -0.005343946, -0.05221765, -0.012986505, 0.0074118064, 0.028774602, 0.0323..."
6493,coin cell challenge,coin cell challenge,"[-0.008217413, -0.0210403, -0.043763883, 0.052465804, -0.06781711, 0.0143045075, 0.01708238, 0.0017869879, -0.074608, 0.069769986, 0.010051132, 0.013755199, -0.031166455, 0.019815156, -0.001627701..."


## Dimensionality Reduction

Here we use the UMAP algorithm to create a 2-dimensional array from the 768-dimensional embeddings in embs. UMAP tries to keep points that are close to each other in the original space close in the 2-dimensional version as well. We do this so that we can visualize the final cluster plot in 2 dimensions. n_neighbors and min_dist are parameters that can be used to fine-tune the algorithm and get the result we want. 
The fit_transform function does the processing itself; the umap variable is just a model object from the UMAP library.

In [21]:
# Configure the reducer; n_neighbors and min_dist are the tuning knobs used here.
# NOTE(review): no random_state is set, so the layout presumably varies between runs - confirm.
umap = UMAP(n_neighbors=20, min_dist=0.1)

# Fit on the full embeddings and return their low-dimensional coordinates
# (the next cell shows 2 values per row).
embs_2d = umap.fit_transform(embs)

In [22]:
# Quick inspection of the reduced embeddings. The labels in the output are Norwegian:
#   "hvor langt er selve embs_2d"      -> number of rows
#   "hvor langt et er et element"      -> number of values per row
#   "hva er første element i embs_2d"  -> the first row
print(f"""
{type(embs_2d)}
hvor langt er selve embs_2d: {len(embs_2d)}
hvor langt et er et element: {len(embs_2d[0])}
hva er første element i embs_2d:
{embs_2d[0]}
""")


<class 'numpy.ndarray'>
hvor langt er selve embs_2d: 2047
hvor langt et er et element: 2
hva er første element i embs_2d:
[8.320494 0.764461]



Here we put the new 2-dimensional values into the df, as well as printing it to see that it worked.

In [23]:
# Store one 2D coordinate pair per row.
# NOTE(review): the column name "embs_2b" looks like a typo for "embs_2d";
# left unchanged because code outside this view may read the column.
df["embs_2b"] = list(embs_2d)

df.head(3)

Unnamed: 0,title,title_clean,embs,embs_2b
15179,A janitor's son is bullied by rich kids. So his dad stands up for him in the most heartwarming way.,a janitors son is bullied by rich kids so his dad stands up for him in the most heartwarming way,"[-0.03733551, 0.05368667, 0.017426753, 0.03971022, 0.011638979, 0.025979057, -0.024137525, -0.005548518, -0.0416946, 0.03564197, 0.013340446, -0.0056792907, 0.014746477, 0.03036969, -0.03649659, -...","[8.320494, 0.764461]"
8040,Mamma Mia! Here We Go Again - Trailer,mamma mia here we go again trailer,"[-0.0010785938, 0.024954392, 0.006947854, -0.036223643, 0.035740424, -0.016962444, -0.07804767, 0.00668523, -0.060773473, -0.005343946, -0.05221765, -0.012986505, 0.0074118064, 0.028774602, 0.0323...","[7.3866825, 3.7584968]"
6493,coin cell challenge,coin cell challenge,"[-0.008217413, -0.0210403, -0.043763883, 0.052465804, -0.06781711, 0.0143045075, 0.01708238, 0.0017869879, -0.074608, 0.069769986, 0.010051132, 0.013755199, -0.031166455, 0.019815156, -0.001627701...","[7.0615354, 1.2680292]"


## Clustering 2D data

This is the plotting of the values we got from UMAP. We set it up so that the x-values are the first number of each pair and the y-values are the second number. Then the figure is updated with the size of the window and some other visual settings. 

Lastly the fig_show_save function is run. This function shows the plot and also saves an HTML file and a PNG file of the same figure.

In [26]:
# Plain scatter of the 2D embedding: first column on x, second on y,
# drawn on an 800x800 canvas with small markers.
fig = (
    px.scatter(x=embs_2d[:, 0], y=embs_2d[:, 1])
    .update_layout(width=800, height=800)
    .update_traces(marker={"size": 2})
)

fig_show_save(fig, "test")

Now we have to cluster the points into categories. We do this with the HDBSCAN algorithm. We create the clusters by running HDBSCAN with the parameters we like; these can be changed to fine-tune the results. We give HDBSCAN the embs_2d values and run the fit function. This gives every title the ID of the cluster it is in. If the algorithm considers a title an outlier, it gives it the ID -1. 

We lastly print out the number of clusters and how many outliers it found. 

In [27]:
clusters_2d = HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf").fit(embs_2d)

# -1 is the outlier label, not a cluster. Remove it explicitly rather than
# subtracting 1, so the count is also right when there are no outliers.
print(f"""
    Number of clusters: {len(set(clusters_2d.labels_).difference([-1]))}
    Number of rows as outliers: {clusters_2d.labels_.tolist().count(-1)}
""")


    Number of clusters: 48
    Number of rows as outliers: 975



We can now put the new cluster label IDs into the df.

In [28]:
# One cluster label per row; -1 marks an outlier.
# The plotting helpers in this file call the same values "cluster_label".
df["cluster_id"] = clusters_2d.labels_
df.head(3)


Unnamed: 0,title,title_clean,embs,embs_2b,cluster_id
15179,A janitor's son is bullied by rich kids. So his dad stands up for him in the most heartwarming way.,a janitors son is bullied by rich kids so his dad stands up for him in the most heartwarming way,"[-0.03733551, 0.05368667, 0.017426753, 0.03971022, 0.011638979, 0.025979057, -0.024137525, -0.005548518, -0.0416946, 0.03564197, 0.013340446, -0.0056792907, 0.014746477, 0.03036969, -0.03649659, -...","[8.320494, 0.764461]",-1
8040,Mamma Mia! Here We Go Again - Trailer,mamma mia here we go again trailer,"[-0.0010785938, 0.024954392, 0.006947854, -0.036223643, 0.035740424, -0.016962444, -0.07804767, 0.00668523, -0.060773473, -0.005343946, -0.05221765, -0.012986505, 0.0074118064, 0.028774602, 0.0323...","[7.3866825, 3.7584968]",-1
6493,coin cell challenge,coin cell challenge,"[-0.008217413, -0.0210403, -0.043763883, 0.052465804, -0.06781711, 0.0143045075, 0.01708238, 0.0017869879, -0.074608, 0.069769986, 0.010051132, 0.013755199, -0.031166455, 0.019815156, -0.001627701...","[7.0615354, 1.2680292]",-1


# Results

## Plotting the results

We can now plot our results. We use the function result_tracer_wrapper to create the traces, where each point is coloured by its cluster. The figure is then built with subplotter and shown and saved with the fig_show_save function.

In [29]:
# Build the clustered / outlier traces from the 2D embedding, the HDBSCAN labels
# and the cleaned titles.
trace_cluster_2d, trace_outlier_2d = result_tracer_wrapper(
    uembs=embs_2d,
    cluster_labels=clusters_2d.labels_,
    titles=df["title_clean"].to_numpy(),
)

# subplotter expects rows -> columns -> traces; here a 1x1 grid with both traces
trace_list = [[[trace_cluster_2d, trace_outlier_2d]]]
row1 = trace_list[0]
col11 = row1[0]

fig = subplotter(trace_list, ["Topics by HDBSCAN Cluster", ])

fig_show_save(fig, "topics-by-hdbscan-clusters")

## Showing topic per cluster

Lastly, to show the titles in each cluster, we group the rows with the groupby function, aggregate every column into a list, store the result in the dfcg DataFrame and show its first 3 rows. Now we see every column per cluster id. 

In [30]:
# One row per cluster id; every other column becomes the list of that cluster's values.
dfcg = df.groupby(["cluster_id"]).agg(list).reset_index()
dfcg.head(3)

Unnamed: 0,cluster_id,title,title_clean,embs,embs_2b
0,-1,"[A janitor's son is bullied by rich kids. So his dad stands up for him in the most heartwarming way., Mamma Mia! Here We Go Again - Trailer, coin cell challenge, Let's Talk About Google Duplex!, W...","[a janitors son is bullied by rich kids so his dad stands up for him in the most heartwarming way, mamma mia here we go again trailer, coin cell challenge, lets talk about google duplex, we tried...","[[-0.03733551, 0.05368667, 0.017426753, 0.03971022, 0.011638979, 0.025979057, -0.024137525, -0.005548518, -0.0416946, 0.03564197, 0.013340446, -0.0056792907, 0.014746477, 0.03036969, -0.03649659, ...","[[8.320494, 0.764461], [7.3866825, 3.7584968], [7.0615354, 1.2680292], [5.705953, 2.7935865], [4.4535456, -1.0022278], [12.223395, 1.4828941], [6.478069, -0.17610644], [11.006031, 1.6196425], [7.3..."
1,0,"[Philippines: Scores dead in landslides, Man risks life to save wild rabbit during SoCal wildfire | ABC7, Philippines: Scores dead in landslides, Hawaiian lava flows ‘faster than a turtle’, Hawaii...","[philippines scores dead in landslides, man risks life to save wild rabbit during socal wildfire abc7, philippines scores dead in landslides, hawaiian lava flows faster than a turtle, hawaiis kil...","[[-0.024978522, 0.000591418, -0.012573435, 0.007992777, 0.0823684, -0.033058073, -0.03200603, -0.014551218, -0.017856661, 0.050326392, 0.0014727511, 0.07415553, 0.012551897, 0.039591864, 0.0041573...","[[3.6368701, 1.1446432], [4.0141892, 1.1972219], [3.6106145, 1.1616036], [3.742626, 1.0310028], [3.6730747, 1.1055219], [3.8798935, 0.9413153], [3.7213113, 1.0486088], [3.6843982, 1.0796411], [3.7..."
2,1,"[Samsung Galaxy S9 Review: The Perfect... Samsung!, Samsung The Wall is a 146-inch modular TV, Samsung Galaxy S9 Review: The Perfect... Samsung!, Curved iPhones Are Coming, iOS 12 Siri Overhaul & ...","[samsung galaxy s9 review the perfect samsung, samsung the wall is a 146inch modular tv, samsung galaxy s9 review the perfect samsung, curved iphones are coming ios 12 siri overhaul apple ditchin...","[[-0.009260378, -0.020816224, -0.018847454, -0.032752242, 0.03726508, 0.007335431, 0.00022816935, -0.04398326, -0.07516435, 0.0027919142, -0.03469381, 0.01803315, 0.008178897, 0.07092998, -0.02970...","[[5.4462166, 3.3923542], [5.402306, 3.419448], [5.446985, 3.4179206], [5.561009, 3.2356954], [5.2049203, 3.4689207], [5.3829136, 3.3019257], [5.3114033, 3.4136183], [5.446185, 3.3241909], [5.27264..."
