In [1]:
import re

import numpy as np
import pandas as pd
import torch
from scipy.cluster.hierarchy import linkage, fcluster
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

In [2]:
# Read the prepared channel dataset from disk and list the columns it provides
df = pd.read_csv(filepath_or_buffer="../data/final_data.csv")
df.columns

Index(['channel_id', 'channel_title', 'video_topics', 'corpus_text'], dtype='object')

In [3]:
# Text cleaning: lowercase the corpus, then strip punctuation and digits.
#
# * The pattern is a raw string: '\w', '\s' and '\d' are invalid escape
#   sequences in a plain string literal (DeprecationWarning, and SyntaxWarning
#   from Python 3.12 on).
# * One alternation removes exactly the characters the original two passes
#   removed: everything that is neither a word character nor whitespace, plus
#   every digit.
# * Missing text is replaced by '' first; the original per-row `re.sub` raised
#   TypeError on NaN. Assumes the column otherwise holds strings - TODO confirm.
_PUNCT_OR_DIGIT = re.compile(r'[^\w\s]|\d')

df['corpus_text'] = (
    df['corpus_text']
    .fillna('')
    .str.lower()
    .apply(lambda text: _PUNCT_OR_DIGIT.sub('', text))
)

In [4]:
# Load the pretrained all-mpnet-base-v2 sentence encoder
# (its files are downloaded when they are not available locally yet)
model = SentenceTransformer(model_name_or_path="sentence-transformers/all-mpnet-base-v2")

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [17]:
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

embedder = SentenceTransformer('all-mpnet-base-v2')
corpus= df['corpus_text'].to_list()
corpus_embeddings = embedder.encode(corpus)

# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5, affinity='euclidean', linkage='ward') #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i)
    print("")

Cluster  19

Cluster  26

Cluster  28

Cluster  20

Cluster  29

Cluster  23

Cluster  21

Cluster  30

Cluster  15

Cluster  11

Cluster  18

Cluster  2

Cluster  5

Cluster  16

Cluster  12

Cluster  32

Cluster  7

Cluster  9

Cluster  10

Cluster  0

Cluster  24

Cluster  14

Cluster  8

Cluster  3

Cluster  13

Cluster  31

Cluster  22

Cluster  1

Cluster  33

Cluster  25

Cluster  4

Cluster  17

Cluster  6

Cluster  27



In [18]:
# Store each row's cluster label, then show the cluster sizes (largest first)
df["cluster"] = cluster_assignment
df["cluster"].value_counts(sort=True)

2     24
0     22
8     21
14    19
30    18
19    17
3     17
1     15
11    14
13    13
9     13
7     12
5     12
18    11
6     11
23    10
29     9
20     9
26     9
33     9
15     8
21     8
16     7
17     7
28     7
10     6
32     5
25     4
4      4
31     4
22     4
24     4
12     3
27     3
Name: cluster, dtype: int64

In [19]:
# Generate a human-readable name for every cluster from its top TF-IDF terms
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['corpus_text'])

# Vocabulary in the column order of `tfidf_matrix`, built once instead of on
# every call. `get_feature_names()` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out()`; fall back to the old name where the new one
# does not exist.
_feature_names_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = np.asarray(_feature_names_getter())


def get_cluster_keywords(cluster_id, top_n=5):
    """Return the highest-weighted TF-IDF terms of one cluster as a string.

    Parameters
    ----------
    cluster_id
        Label to look up in ``df['cluster']``.
    top_n : int, default 5
        Number of terms to include.

    Returns
    -------
    str
        The ``top_n`` terms with the largest summed TF-IDF weight over the
        cluster's rows, joined with ``', '`` in descending order of weight.
    """
    # Select rows by position: `tfidf_matrix` rows follow the row order of
    # `df`, so index *labels* are only valid here for a default RangeIndex.
    row_positions = np.flatnonzero((df['cluster'] == cluster_id).to_numpy())
    cluster_tfidf = tfidf_matrix[row_positions]
    # Summing a sparse matrix yields a 1 x n_terms matrix; flatten it to 1-D.
    sums = np.asarray(cluster_tfidf.sum(axis=0)).ravel()
    sorted_indices = np.argsort(sums)[::-1]
    keywords = feature_names[sorted_indices[:top_n]]
    return ', '.join(keywords)


# Compute the keywords once per distinct cluster and broadcast them to the rows
# (the original recomputed them for every single row).
cluster_names = {cid: get_cluster_keywords(cid) for cid in df['cluster'].unique()}
df['cluster_name'] = df['cluster'].map(cluster_names)



In [20]:
# Preview the generated names next to the channels they were assigned to
df.loc[:, ['cluster_name', 'channel_title', 'channel_id']].head(20)

Unnamed: 0,cluster_name,channel_title,channel_id
0,"job, shorts, career, work, iit",Joshua Fluke,UC-91UA-Xy2Cvb98deRXuggA
1,"subliminal, affirmations, meditation, mind, hy...",30 Minute Mindset,UC-UYJe8O87Y9Qo3XJQRDljg
2,"startup, design, entrepreneurship, startups, s...",The Futur,UC-b3c7kxa5vU-bnmaROgvog
3,"usa, abroad, dallas, student, visa",Yocket,UC-cDK8t9TBFWEnCyoTYIR-Q
4,"healing, attraction, law, shivani, pranic",Clark Kegley,UC-dmJ79518WlKMbsu50eMTQ
5,"echosmith, shuba, ed, music, sheeran",Reneé Dominique,UC-fn57_xqMv-PaMmpm-ZspQ
6,"ukraine, russia, news, tldr, chomsky",TLDR News Global,UC-uhvujip5deVcEtLxnW8qg
7,"kannada, comedy, funny, vines, prank",Dhruv Actoholic,UC-vGXUq70B4p6BD-z-711hQ
8,"echosmith, shuba, ed, music, sheeran",Ed Sheeran,UC0C-w0YjGpqDXGB8IHb662A
9,"documentary, telegraph, bbc, tv, news",eltrece,UC0DM_mHV2u6dj8ig51GkQwg


In [None]:
# Number of distinct cluster labels (missing values, if any, counted as one)
df['cluster'].nunique(dropna=False)

In [21]:
# Distinct cluster labels in order of first appearance
pd.unique(df['cluster'])

array([19, 26, 28, 20, 29, 23, 21, 30, 15, 11, 18,  2,  5, 16, 12, 32,  7,
        9, 10,  0, 24, 14,  8,  3, 13, 31, 22,  1, 33, 25,  4, 17,  6, 27])

In [24]:
# For every cluster (in order of first appearance) print its id, then the
# name stored on the first row that belongs to it
first_rows = df.drop_duplicates(subset='cluster')
for cluster_id, cluster_name in zip(first_rows['cluster'], first_rows['cluster_name']):
    print(cluster_id)
    print(cluster_name)

19
job, shorts, career, work, iit
26
subliminal, affirmations, meditation, mind, hypnosis
28
startup, design, entrepreneurship, startups, sales
20
usa, abroad, dallas, student, visa
29
healing, attraction, law, shivani, pranic
23
echosmith, shuba, ed, music, sheeran
21
ukraine, russia, news, tldr, chomsky
30
kannada, comedy, funny, vines, prank
15
documentary, telegraph, bbc, tv, news
11
prank, shorts, onlyjayus, iphone, vrchat
18
data, science, learning, google, tensorflow
2
songs, song, music, cover, mishra
5
mantra, bhajan, shiva, durga, sai
16
gre, toefl, ielts, test, vocabulary
12
vending, stradman, cars, collection, machine
32
product, management, marketing, sales, carey
7
workout, fitness, healthy, calisthenics, dr
9
dubai, amish, dignity, nas, zaidalit
10
peterson, jordan, personality, psychology, clips
0
english, study, big, harvard, brown
24
sbi, rachana, market, ca, card
14
comedy, trevor, noah, funny, netflix
8
india, sadhguru, google, canada, indian
3
bloomberg, news, bitc

In [25]:
# Show every channel that received this particular cluster name
df.loc[df['cluster_name'].eq("dating, gabblin, hussey, men, relationship")]

Unnamed: 0,channel_id,channel_title,video_topics,corpus_text,cluster,cluster_name
59,UC9HGzFGt7BLmWDqooUbWGBg,Matthew Hussey,", Health, Lifestyle_(sociology), Entertainment...",matthew hussey worlds leading dating advice ex...,22,"dating, gabblin, hussey, men, relationship"
67,UCB_oqEMvs9Xd7RKoG_8sCyA,Gabblin,", Lifestyle_(sociology), Entertainment, Film",tell stories help live authentically broke wed...,22,"dating, gabblin, hussey, men, relationship"
213,UCZC45sBWNdkqSQ9Bwtt5lfA,Anna Akana,", Music, Lifestyle_(sociology), Entertainment",actor writer director management coronel group...,22,"dating, gabblin, hussey, men, relationship"
283,UCnpB6UZ7U5rKPLEjCHKLgbg,LoveQuest With Lisa Concepcion,", Health, Lifestyle_(sociology)",entertaining uplifting real raw unscripted aut...,22,"dating, gabblin, hussey, men, relationship"


In [28]:
# Rows whose cleaned text contains 'sports', to see which clusters they landed in
mentions_sports = df['corpus_text'].str.contains('sports')
df.loc[mentions_sports]

Unnamed: 0,channel_id,channel_title,video_topics,corpus_text,cluster,cluster_name
39,UC6MXE0Px3m1aI4vI0pLWzQg,Goal Guys,", Physical_fitness, Sport, Lifestyle_(sociolog...",two brothers take different goals fitness prod...,10,"peterson, jordan, personality, psychology, clips"
40,UC6n8I1UDTKP1IWjQMg6_TwA,The B1M,", Society, Tourism, Lifestyle_(sociology), Veh...",love construction want whole world love hosted...,13,"vice, documentary, news, history, world"
53,UC8Su5vZCXWRag13H53zWVwA,TIME,", Baseball, Sport, Association_football, Lifes...",time brings unparalleled insight access author...,13,"vice, documentary, news, history, world"
63,UCAkYgW-5XG5uPyaO9PD_I5A,Veer by Discovery,", Military, Society, Entertainment",welcome official youtube channel veer discover...,1,"india, defence, indian, upsc, ias"
87,UCG1_A0jPBGZUpRW7XkaaBkg,Honest Outlaw,", Lifestyle_(sociology)",gun reviews people make sure subscribe new cha...,17,"shooting, gun, glock, review, ak"
137,UCNJUhkpXnF1GcItcztJM-Ww,Karunesh Talwar,", Entertainment, Film, Humour, Television_prog...",channel videos standup swiggy stand comedy kar...,30,"kannada, comedy, funny, vines, prank"
148,UCP6HGa63sBC7-KHtkme-p-g,USA TODAY,", Sport, Politics, Society, Physical_fitness, ...",heartwarming moments latest sports entertainme...,15,"documentary, telegraph, bbc, tv, news"
150,UCPgLNge0xqQHWM5B5EFH9Cg,The Telegraph,", Entertainment, Film, Television_program, Soc...",latest news videos opinion pieces animations s...,15,"documentary, telegraph, bbc, tv, news"
151,UCPxMZIFE856tbTfdkdjzTSQ,BeerBiceps,", Society, Entertainment, Knowledge, Film, Lif...",beerbiceps ranveer allahbadia ultimate selfimp...,8,"india, sadhguru, google, canada, indian"
180,UCUMZ7gohGI9HcU9VNsr2FJQ,Bloomberg Originals,", Society, Food, Lifestyle_(sociology), Techno...",bloomberg originals offers bold takes curious ...,3,"bloomberg, news, bitcoin, economy, economics"


In [29]:
# Display the frame with the new cluster columns (pandas elides the middle rows)
df

Unnamed: 0,channel_id,channel_title,video_topics,corpus_text,cluster,cluster_name
0,UC-91UA-Xy2Cvb98deRXuggA,Joshua Fluke,", Lifestyle_(sociology), Entertainment",make variety content im always wild ideas job ...,19,"job, shorts, career, work, iit"
1,UC-UYJe8O87Y9Qo3XJQRDljg,30 Minute Mindset,", Music, Electronic_music",welcome little corner youtube im glad find rel...,26,"subliminal, affirmations, meditation, mind, hy..."
2,UC-b3c7kxa5vU-bnmaROgvog,The Futur,", Lifestyle_(sociology), Technology, Vehicle, ...",mission teach billion people make living love ...,28,"startup, design, entrepreneurship, startups, s..."
3,UC-cDK8t9TBFWEnCyoTYIR-Q,Yocket,", Lifestyle_(sociology), Technology",yocket indias largest network study abroad asp...,20,"usa, abroad, dallas, student, visa"
4,UC-dmJ79518WlKMbsu50eMTQ,Clark Kegley,", Lifestyle_(sociology), Knowledge, Technology...",helping create version law attraction reprogra...,29,"healing, attraction, law, shivani, pranic"
...,...,...,...,...,...,...
354,UCy436qLXlyLqddExVC64auw,Gajendra Verma,", Music, Music_of_Asia, Pop_music, Electronic_...",welcome gajendra vermas official youtube chann...,2,"songs, song, music, cover, mishra"
355,UCy5mW8fB24ITiiC0etjLI6w,Neelesh Misra,", Entertainment, Film, Television_program, Soc...",neelesh misras work reached biggest audio plat...,2,"songs, song, music, cover, mishra"
356,UCyI4gi0BtlKiUl6KgtuVIUw,CTV Your Morning,", Action-adventure_game, Action_game, Country_...",hosted annemarie mediwake coanchors lindsey de...,9,"dubai, amish, dignity, nas, zaidalit"
357,UCydVjjfO74Vr7Scecn1sNAg,Desi American Professor,", Lifestyle_(sociology), Technology, Society",assistant professor information systems operat...,20,"usa, abroad, dallas, student, visa"


In [30]:
# Print each distinct cluster name once, in order of first appearance
for cluster_name in pd.unique(df['cluster_name']):
    print(cluster_name)

job, shorts, career, work, iit
subliminal, affirmations, meditation, mind, hypnosis
startup, design, entrepreneurship, startups, sales
usa, abroad, dallas, student, visa
healing, attraction, law, shivani, pranic
echosmith, shuba, ed, music, sheeran
ukraine, russia, news, tldr, chomsky
kannada, comedy, funny, vines, prank
documentary, telegraph, bbc, tv, news
prank, shorts, onlyjayus, iphone, vrchat
data, science, learning, google, tensorflow
songs, song, music, cover, mishra
mantra, bhajan, shiva, durga, sai
gre, toefl, ielts, test, vocabulary
vending, stradman, cars, collection, machine
product, management, marketing, sales, carey
workout, fitness, healthy, calisthenics, dr
dubai, amish, dignity, nas, zaidalit
peterson, jordan, personality, psychology, clips
english, study, big, harvard, brown
sbi, rachana, market, ca, card
comedy, trevor, noah, funny, netflix
india, sadhguru, google, canada, indian
bloomberg, news, bitcoin, economy, economics
vice, documentary, news, history, world
h

In [33]:
# Print a clickable channel link for every member of cluster 18
cluster_18_ids = df.loc[df['cluster'] == 18, 'channel_id']
for channel_id in cluster_18_ids:
    print('https://www.youtube.com/channel/' + channel_id)

https://www.youtube.com/channel/UC0patpmwYbhcEUap0bTX3JQ
https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ
https://www.youtube.com/channel/UC8ofcOdHNINiPrBA9D59Vaw
https://www.youtube.com/channel/UCNU_lfiiWBdtULKOw6X0Dig
https://www.youtube.com/channel/UCYwVxWpjeKFWwu8TML-Te9A
https://www.youtube.com/channel/UCiT9RITQ9PW6BhXK0y2jaeg
https://www.youtube.com/channel/UCk6ONJlPzjw3DohAeMSgsng
https://www.youtube.com/channel/UCtYLUTtgS3k1Fg4y5tAhLbw
https://www.youtube.com/channel/UCteRPiisgIoHtMgqHegpWAQ
https://www.youtube.com/channel/UCupUfUvPVDKDIbDrSFCniAg
https://www.youtube.com/channel/UCwH7ZUuRVucq4CYIcoAyE4w


In [35]:
# Build the channel link for every row with one vectorised concatenation
# instead of a per-row Python lambda. A missing channel_id now yields a
# missing URL rather than raising TypeError.
df['channel_url'] = "https://www.youtube.com/channel/" + df['channel_id']

In [37]:
# Peek at the first rows to check the newly added channel_url column
df.head()

Unnamed: 0,channel_id,channel_title,video_topics,corpus_text,cluster,cluster_name,channel_url
0,UC-91UA-Xy2Cvb98deRXuggA,Joshua Fluke,", Lifestyle_(sociology), Entertainment",make variety content im always wild ideas job ...,19,"job, shorts, career, work, iit",https://www.youtube.com/channel/UC-91UA-Xy2Cvb...
1,UC-UYJe8O87Y9Qo3XJQRDljg,30 Minute Mindset,", Music, Electronic_music",welcome little corner youtube im glad find rel...,26,"subliminal, affirmations, meditation, mind, hy...",https://www.youtube.com/channel/UC-UYJe8O87Y9Q...
2,UC-b3c7kxa5vU-bnmaROgvog,The Futur,", Lifestyle_(sociology), Technology, Vehicle, ...",mission teach billion people make living love ...,28,"startup, design, entrepreneurship, startups, s...",https://www.youtube.com/channel/UC-b3c7kxa5vU-...
3,UC-cDK8t9TBFWEnCyoTYIR-Q,Yocket,", Lifestyle_(sociology), Technology",yocket indias largest network study abroad asp...,20,"usa, abroad, dallas, student, visa",https://www.youtube.com/channel/UC-cDK8t9TBFWE...
4,UC-dmJ79518WlKMbsu50eMTQ,Clark Kegley,", Lifestyle_(sociology), Knowledge, Technology...",helping create version law attraction reprogra...,29,"healing, attraction, law, shivani, pranic",https://www.youtube.com/channel/UC-dmJ79518WlK...


In [39]:
# Export the clustering result. The row index is written as well (to_csv's
# default), so it comes back as an unnamed first column when the file is read.
df.loc[:, ['channel_id', 'channel_url', 'cluster', 'cluster_name']].to_csv(
    path_or_buf='../data/clustered_data.csv'
)

In [2]:
import pandas as pd
# Reload the exported clusters and discard the index column that the export
# wrote ('Unnamed: 0'). `errors='ignore'` keeps this cell re-runnable: once the
# file has been saved again without the index, the column no longer exists and
# a plain drop would raise KeyError. Returning a new frame (no `inplace`) keeps
# the step idempotent.
df = pd.read_csv(r"../data/clustered_data.csv").drop(columns=['Unnamed: 0'], errors='ignore')
df

Unnamed: 0,channel_id,channel_url,cluster,cluster_name
0,UC-91UA-Xy2Cvb98deRXuggA,https://www.youtube.com/channel/UC-91UA-Xy2Cvb...,19,"job, shorts, career, work, iit"
1,UC-UYJe8O87Y9Qo3XJQRDljg,https://www.youtube.com/channel/UC-UYJe8O87Y9Q...,26,"subliminal, affirmations, meditation, mind, hy..."
2,UC-b3c7kxa5vU-bnmaROgvog,https://www.youtube.com/channel/UC-b3c7kxa5vU-...,28,"startup, design, entrepreneurship, startups, s..."
3,UC-cDK8t9TBFWEnCyoTYIR-Q,https://www.youtube.com/channel/UC-cDK8t9TBFWE...,20,"usa, abroad, dallas, student, visa"
4,UC-dmJ79518WlKMbsu50eMTQ,https://www.youtube.com/channel/UC-dmJ79518WlK...,29,"healing, attraction, law, shivani, pranic"
...,...,...,...,...
354,UCy436qLXlyLqddExVC64auw,https://www.youtube.com/channel/UCy436qLXlyLqd...,2,"songs, song, music, cover, mishra"
355,UCy5mW8fB24ITiiC0etjLI6w,https://www.youtube.com/channel/UCy5mW8fB24ITi...,2,"songs, song, music, cover, mishra"
356,UCyI4gi0BtlKiUl6KgtuVIUw,https://www.youtube.com/channel/UCyI4gi0BtlKiU...,9,"dubai, amish, dignity, nas, zaidalit"
357,UCydVjjfO74Vr7Scecn1sNAg,https://www.youtube.com/channel/UCydVjjfO74Vr7...,20,"usa, abroad, dallas, student, visa"


In [3]:
# Overwrite the export, this time without writing the row index
df.to_csv(path_or_buf=r"../data/clustered_data.csv", index=False)

In [16]:
# One record per cluster name (sorted by name) holding all of its channel links
[
    {'cluster_name': cluster_name, 'links': list(urls)}
    for cluster_name, urls in df.groupby('cluster_name')['channel_url']
]

[{'cluster_name': 'bloomberg, news, bitcoin, economy, economics',
  'links': ['https://www.youtube.com/channel/UC6-URUKHVmFWSt_KdRsX8UA',
   'https://www.youtube.com/channel/UCGBoWe1mzCvbbqDNncmxm8A',
   'https://www.youtube.com/channel/UCKQvGU-qtjEthINeViNbn6A',
   'https://www.youtube.com/channel/UCL8w_A8p8P1HWI3k6PR5Z6w',
   'https://www.youtube.com/channel/UCLo66QVfEod0nNM_GzKNxmQ',
   'https://www.youtube.com/channel/UCOa104QrplnxBkVnoArabwQ',
   'https://www.youtube.com/channel/UCUMZ7gohGI9HcU9VNsr2FJQ',
   'https://www.youtube.com/channel/UCZ4AMrDcNrfy3X6nsU8-rPg',
   'https://www.youtube.com/channel/UCZ7x7yDBbEFCGztD8BYvRhA',
   'https://www.youtube.com/channel/UCdxi8d8qRsRyUi2ERYjYb-w',
   'https://www.youtube.com/channel/UCi-pkXLbm7sqXFhV1NBLUfQ',
   'https://www.youtube.com/channel/UCneQdPbDLwZ__ZXP0YVwiag',
   'https://www.youtube.com/channel/UCnpekFV93kB1O0rVqEKSumg',
   'https://www.youtube.com/channel/UCo7a6riBFJ3tkeHjvkXPn1g',
   'https://www.youtube.com/channel/UCqvaXJ

In [12]:
# Row-wise view of the frame: one dict per channel
df.to_dict(orient='records')

[{'channel_id': 'UC-91UA-Xy2Cvb98deRXuggA',
  'channel_url': 'https://www.youtube.com/channel/UC-91UA-Xy2Cvb98deRXuggA',
  'cluster': 19,
  'cluster_name': 'job, shorts, career, work, iit'},
 {'channel_id': 'UC-UYJe8O87Y9Qo3XJQRDljg',
  'channel_url': 'https://www.youtube.com/channel/UC-UYJe8O87Y9Qo3XJQRDljg',
  'cluster': 26,
  'cluster_name': 'subliminal, affirmations, meditation, mind, hypnosis'},
 {'channel_id': 'UC-b3c7kxa5vU-bnmaROgvog',
  'channel_url': 'https://www.youtube.com/channel/UC-b3c7kxa5vU-bnmaROgvog',
  'cluster': 28,
  'cluster_name': 'startup, design, entrepreneurship, startups, sales'},
 {'channel_id': 'UC-cDK8t9TBFWEnCyoTYIR-Q',
  'channel_url': 'https://www.youtube.com/channel/UC-cDK8t9TBFWEnCyoTYIR-Q',
  'cluster': 20,
  'cluster_name': 'usa, abroad, dallas, student, visa'},
 {'channel_id': 'UC-dmJ79518WlKMbsu50eMTQ',
  'channel_url': 'https://www.youtube.com/channel/UC-dmJ79518WlKMbsu50eMTQ',
  'cluster': 29,
  'cluster_name': 'healing, attraction, law, shivani