In [None]:
# Optional convenience: load the lab_black IPython extension.
# The notebook keeps working when the extension cannot be imported.
try:
    %load_ext lab_black
except ModuleNotFoundError:
    # presumably lab_black is provided by the nb_black package, hence the message
    print("nb_black not installed")

In [None]:
import os
import sys
import networkx as nx
import numpy as np
import functools
import webbrowser
from time import time
from random import shuffle
from IPython.core.display import display, HTML
from IPython.display import Javascript
from scipy.cluster.hierarchy import cut_tree, to_tree, leaves_list
from ipywidgets import Button, HBox, VBox, Output, Layout, Image
from ipyevents import Event

sys.path.append(os.path.abspath(".."))
from yourtube.krakow import krakow
from yourtube.clustering_utils import plot_dendrogram, normalized_dasgupta_cost
from yourtube.scraping import scrape
from yourtube.file_operations import (
    save_graph,
    load_graph,
    id_to_url,
)

# import matplotlib.cm as cm
# import matplotlib.pyplot as plt

# Tunable settings used by the cells below.
# `balance` is forwarded unchanged to krakow(); its meaning is defined there.
balance = 2.1
# number of side-by-side columns, one per displayed cluster
num_of_columns = 3
# width of a single column in pixels, so all columns together span 1200px
column_width = 1200 / num_of_columns

# URL template of a video's thumbnail; filled in with the video id
id_to_thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg"

In [None]:
# Load the video graph using load_graph()'s default arguments.
# The cells below read per-node attributes such as "title" and "time_added"
# and use directed-graph methods (in_degree, out_edges) on it.
G = load_graph()

In [None]:
def window_open(_, url):
    """Open `url` through Python's `webbrowser` module.

    Written as a DOM-event callback: the first positional argument (the event
    payload) is ignored, and `url` is expected to be pre-bound by the caller.
    """
    # A Javascript-based alternative (https://stackoverflow.com/a/61900572/11756613)
    # works even when jupyter is remote, but it is broken for some reason
    # when triggered from an event handler like this one.
    webbrowser.open(url)

In [None]:
def display_video_links(G, ids, prefixes=None, show_first=8):
    """Display clickable thumbnails and titles for the first `show_first` videos.

    For each video an image widget with its thumbnail is displayed and its title
    is printed below it. Clicking the thumbnail calls `window_open` with the
    video's URL.

    Videos whose "title" attribute is missing or None are passed to
    `scrape(id_, G)` first. If the title is still missing or None afterwards
    (scraping failed - probably the video is down), the video is skipped.

    Args:
        G: graph whose `G.nodes[id_]` mappings hold the "title" attribute.
        ids: video ids in the order they should be shown.
        prefixes: optional sequence aligned with `ids`. It only takes part in
            the pairing with `ids` (a shorter sequence truncates the list);
            the prefixes themselves are currently not rendered.
        show_first: maximum number of (id, prefix) pairs to consider. Skipped
            videos count towards this limit.

    Side effects:
        Calls `save_graph(G)` once at the end if at least one video was scraped,
        so that freshly scraped data is persisted. Nothing is saved when the
        graph was not touched.
    """
    if prefixes is None:
        prefixes = [""] * len(ids)

    scraped_any = False

    for id_, _prefix in list(zip(ids, prefixes))[:show_first]:
        if G.nodes[id_].get("title") is None:
            scrape(id_, G)
            scraped_any = True
            # apply the same "missing or None" test as before scraping, so a
            # failed scrape never ends up being printed as "None"
            if G.nodes[id_].get("title") is None:
                # scraping failed - probably video is down
                continue

        url = id_to_url.format(id_)
        title = G.nodes[id_]["title"]

        image_url = id_to_thumbnail.format(id_)
        img = Image.from_url(image_url)
        # the click handler gets the DOM event as its only positional argument,
        # so the target URL is bound up front
        event = Event(source=img, watched_events=["click"])
        func = functools.partial(window_open, url=url)
        event.on_dom_event(func)
        display(img)
        print(title)
        # two empty lines visually separate consecutive videos
        print()
        print()

    if scraped_any:
        save_graph(G)

In [None]:
# ranking functions


def rank_nodes_by_in_degree(G, nodes):
    """Rank `nodes` by their in-degree within the subgraph they induce.

    Only edges between the given nodes are counted, because the in-degree is
    taken from `G.subgraph(nodes)` rather than from `G` itself.

    Args:
        G: graph providing `subgraph(nodes)`, whose result provides
            `in_degree()` yielding `(node, in_degree)` pairs.
        nodes: the nodes to rank.

    Returns:
        A pair `(ids, scores)` of equally long tuples, ordered from the highest
        to the lowest in-degree. Both tuples are empty when `nodes` is empty.
    """
    recs = sorted(G.subgraph(nodes).in_degree(), key=lambda pair: pair[1], reverse=True)
    if not recs:
        # zip(*[]) yields nothing, which would break the two-name unpacking below
        return (), ()
    ids, scores = zip(*recs)
    return ids, scores


# def recommend(SubG, pickiness=0):
#     # this line recommends very normie videos
#     # it's equivalent to pickiness==0
#     # recs = sorted(SubG.in_degree(), key=lambda pair: pair[1], reverse=True)

#     # to be honest, I don't fully understand this part
#     # but it works better than the one on top:
#     # first limit recs only to the best ones
#     # this way, we'll omit most general normie recommendations later
#     best_recs = [node for node, in_degree in SubG.in_degree() if in_degree >= pickiness]
#     #     best_recs = [node for node, degree in SubG.degree() if degree >= pickiness]

#     ids, scores = sort_nodes_by_in_degree(SubG, best_recs)
#     return ids, scores

In [None]:
# filtering functions


def added_in_last_n_years(G, ids, n=5):
    """Keep the ids whose "time_added" lies within the last `n` years.

    A year is counted as 365 days. "time_added" is compared against
    `time()`, i.e. it is treated as seconds since the epoch. Nodes without
    a "time_added" attribute are dropped.

    Returns:
        A list of the matching ids, in the order they appear in `ids`.
    """
    cutoff = time() - n * (60 * 60 * 24 * 365)

    def is_recent(id_):
        attrs = G.nodes[id_]
        return "time_added" in attrs and cutoff < attrs["time_added"]

    return [id_ for id_ in ids if is_recent(id_)]


def neighborhood(G, ids):
    """Return the nodes touched by the edges that leave `ids`.

    The result is the `.nodes` of the subgraph induced by the out-edges of
    `ids`: the edge targets plus every source that has at least one
    outgoing edge. Sources without outgoing edges are not included.
    """
    return G.edge_subgraph(G.out_edges(ids)).nodes

In [None]:
# Seed set: nodes of G whose "time_added" falls within the last 5 years (default n).
sources1 = added_in_last_n_years(G, G.nodes)
# nodes touched by edges leaving the seeds ...
neighborhood1 = neighborhood(G, sources1)
# ... and the same step applied once more, starting from that neighborhood
wide_neighborhood1 = neighborhood(G, neighborhood1)

# report how much each step grows the node set
print("sources: ", len(sources1))
print("neighborhood: ", len(neighborhood1))
print("wide neighborhood: ", len(wide_neighborhood1))

In [None]:
# B = load_graph("basia")

In [None]:
# sources2 = added_in_last_n_years(B, B.nodes)
# neighborhood2 = neighborhood(B, sources2)
# wide_neighborhood2 = neighborhood(G, neighborhood2)

# print("sources: ", len(sources2))
# print("neighborhood: ", len(neighborhood2))
# print("wide neighborhood: ", len(wide_neighborhood2))

In [None]:
# Set of nodes that will be clustered below.
# The commented-out parts would intersect it with the neighborhood of a second
# graph (see the commented-out cells that load graph B), or switch to the
# wider, two-step neighborhood.
to_cluster = neighborhood1  # & neighborhood2
# to_cluster = wide_neighborhood1  # & wide_neighborhood2
# len(to_cluster)

In [None]:
# Restrict G to the chosen nodes. The directed subgraph is kept because the
# in-degree ranking later runs on it; the undirected copy is what gets clustered.
RecentDirected = G.subgraph(to_cluster)
Recent = RecentDirected.to_undirected()
print("number of videos: ", Recent.number_of_nodes())

# choose only the biggest connected component
# (components are sorted by size, largest first)
components = sorted(nx.connected_components(Recent), key=len, reverse=True)
# for el in components[:5]:
#     print(len(el))
main_component = components[0]
Main = Recent.subgraph(main_component)

# Hierarchical clustering of the main component. D is handed to scipy's
# to_tree() below, so it is presumably a scipy-style linkage matrix - confirm in krakow().
D = krakow(Main, balance=balance)

# root node of the cluster hierarchy; the browsing widgets descend from here
tree = to_tree(D)

# plot_dendrogram(D, clusters_limit=200, width=22)
# normalized_dasgupta_cost(Main, D)

In [None]:
# One fixed-width Output widget per column; each column shows one cluster.
layout = Layout(width=f"{column_width}px")
outputs = [Output(layout=layout) for _ in range(num_of_columns)]
# separate output that receives the "already on the lowest cluster" message
message_output = Output()

# the columns laid out side by side
whole_output = HBox(outputs)


def split_into_n_children(tree, n):
    """Split `tree` into `n` subtrees by repeatedly dividing the biggest one.

    Starting from `[tree]`, the subtree with the highest `count` (the first one
    on ties) is replaced, in place, by its `left` and `right` children until
    the list holds `n` subtrees. The left-to-right order is preserved.

    Args:
        tree: root node exposing `count`, `left` and `right`, e.g. the node
            returned by scipy's `to_tree`.
        n: number of subtrees wanted. For `n <= 1` the result is `[tree]`.

    Returns:
        A list of `n` nodes. It never contains None.

    Raises:
        AttributeError: when the biggest remaining subtree is a leaf, i.e. the
            tree has fewer than `n` leaves and cannot be split any further.
            Previously this case inserted None entries into the result and
            surfaced as an incidental AttributeError later on. The exception
            type is kept so that callers catching AttributeError keep working.
    """
    children = [tree]

    while len(children) < n:
        index_of_biggest = max(range(len(children)), key=lambda i: children[i].count)
        to_split = children[index_of_biggest]
        if to_split.left is None or to_split.right is None:
            # the biggest subtree is a leaf, so every other one is a leaf as well
            raise AttributeError("cannot split a leaf: the tree has fewer than n leaves")
        children[index_of_biggest : index_of_biggest + 1] = [to_split.left, to_split.right]
    return children


# Node ids of Main as a numpy array, so that a list of integer positions
# (as returned by a cluster's pre_order()) can select many ids at once.
# NOTE(review): assumes the leaf ids in `tree` are positions in Main.nodes - confirm against krakow().
main_ids_list = np.array(Main.nodes)


def update_displayed_videos():
    """Re-render all columns with videos from the current `tree`.

    The current tree is split into `num_of_columns` subtrees, which are stored
    in the module-level `children` so that `choose_column` can descend into
    one of them. For every column, the videos of its subtree are ranked by
    in-degree within `RecentDirected`, the top 5% are kept (but always at
    least one, so that small clusters do not render as empty columns),
    shuffled, and shown with `display_video_links`.

    When the tree cannot be split any further, the resulting AttributeError is
    caught and "already on the lowest cluster" is printed to `message_output`.
    """
    global children
    try:
        children = split_into_n_children(tree, n=num_of_columns)

        for output, child in zip(outputs, children):
            # wait=True postpones the clearing until new content arrives
            output.clear_output(wait=True)

            cluster = child.pre_order()
            ids = main_ids_list[cluster]
            ranked_ids, _scores = rank_nodes_by_in_degree(RecentDirected, ids)

            # top 5% by in-degree; never fewer than one video
            keep = max(1, len(ranked_ids) // 20)
            ranked_ids = list(ranked_ids[:keep])
            # shuffled so that display_video_links, which shows only the first
            # few entries, does not always pick the very same videos
            shuffle(ranked_ids)

            with output:
                display_video_links(G, ranked_ids)

    except AttributeError:
        # raised when a leaf (or None) would have to be split or displayed
        with message_output:
            print("already on the lowest cluster")


def choose_column(event, i):
    """Make the cluster shown in column `i` the current tree and redraw.

    Used as a DOM-event callback: `event` is the event payload and is not
    inspected, `i` is the index of the column that was clicked.
    """
    global tree
    selected_subtree = children[i]
    tree = selected_subtree
    update_displayed_videos()


# Wire up the columns: an "auxclick" DOM event (a click with a non-primary
# mouse button) on a column makes that column's cluster the new current tree.
for i, output in enumerate(outputs):
    event = Event(source=output, watched_events=["auxclick"])
    # bind the column index now; the handler itself only receives the DOM event
    func = functools.partial(choose_column, i=i)
    event.on_dom_event(func)

# initial render, then show the row of columns as this cell's output
update_displayed_videos()
whole_output

In [None]:
# Shows what update_displayed_videos prints on failure ("already on the lowest cluster").
message_output