# LLM-based qualitative and quantitative analysis of social media comments

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import os
from datetime import datetime
import seaborn as sns

# libraries for getting data from tiktok and youtube
from TikTokApi import TikTokApi # https://github.com/davidteather/TikTok-Api
from googleapiclient.discovery import build
import asyncio

# libraries for topic analysis
from bertopic import BERTopic
from deep_translator import GoogleTranslator
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
import plotly.io as pio
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import langdetect
import regex

# LLM support for analysing the comments
from mlx_lm import load, generate
from transformers import pipeline
from IPython.display import display
import platform

# Set Plotly's default renderer to 'iframe'.
pio.renderers.default='iframe'

# Get Data from TikTok and YouTube

In [3]:
# Analysis parameters read by the cells below: the hashtag / search term and the
# upper bounds on the number of videos and of comments per video.
hashtag = 'menstruation'
num_videos = 100
num_comments = 100
date_threshold = 2023 # earliest year to include; only year granularity is supported for now

In [4]:
# Access tokens. They are read from the environment so that secrets do not have
# to be pasted into (and accidentally committed with) the notebook; both fall
# back to an empty string, which is the previous placeholder value.
ms_token_tiktok = os.environ.get('TIKTOK_MS_TOKEN', '')
my_api_key_youtube = os.environ.get('YOUTUBE_API_KEY', '')

## TikTok

### Get the videos for a hashtag and the comments for each video

In [6]:
async def get_hashtag_videos(api, hashtag, num_videos):
    """Collect up to ``num_videos`` videos for a TikTok hashtag.

    Opens one session on ``api`` with the module-level ``ms_token_tiktok`` and
    iterates the hashtag feed, keeping only videos whose creation year (UTC)
    is at least the module-level ``date_threshold``.

    Args:
        api: Object exposing ``create_sessions`` and ``hashtag`` (a TikTokApi
            instance in this notebook).
        hashtag: Hashtag name passed to ``api.hashtag``.
        num_videos: Maximum number of videos to return.

    Returns:
        Tuple ``(results, dates)``: the raw video dicts (``video.as_dict``)
        and their ``createTime`` values, in feed order.
    """
    # Local import: the notebook only imports the `datetime` class itself.
    from datetime import timezone

    await api.create_sessions(headless=False, ms_tokens=[ms_token_tiktok], num_sessions=1, sleep_after=3)
    tag = api.hashtag(name=hashtag)
    results = []
    dates = []
    if num_videos <= 0:
        return results, dates
    # Over-request on purpose: the `count` parameter is not quite working and
    # many videos may be dropped by the year filter.
    async for video in tag.videos(count=num_videos*1000):
        info = video.as_dict
        date = info['createTime']
        year = datetime.fromtimestamp(int(date), tz=timezone.utc).year
        if year < date_threshold:
            continue
        results.append(info)
        dates.append(date)
        if len(results) >= num_videos:
            # Stop paging as soon as enough videos were found instead of
            # walking through the remainder of the feed.
            break
    return results, dates
    
async def get_comments(api, ms_token_tiktok, video_id, num_comments):
    """Fetch up to ``num_comments`` comments of one TikTok video.

    Args:
        api: Object exposing ``create_sessions`` and ``video`` (a TikTokApi
            instance in this notebook).
        ms_token_tiktok: Token handed to ``create_sessions`` when a session
            has to be opened.
        video_id: Id passed to ``api.video``.
        num_comments: Maximum number of comments to return.

    Returns:
        List of raw comment dicts (``comment.as_dict``).
    """
    # Reuse an already open session: opening a new one for every video piles
    # up sessions until close_sessions() is called.
    # NOTE(review): relies on the API object listing its open sessions in
    # `api.sessions`; if that attribute is missing a session is created as before.
    if not getattr(api, 'sessions', None):
        await api.create_sessions(headless=False, ms_tokens=[ms_token_tiktok], num_sessions=1, sleep_after=3)
    video = api.video(id=video_id)
    comments = []
    if num_comments <= 0:
        return comments
    async for comment in video.comments(count=num_comments): # this count parameter is not quite working
        comments.append(comment.as_dict)
        if len(comments) >= num_comments:
            # Enforce the limit ourselves and stop requesting further pages.
            break
    return comments

async def tiktok_get_comments_to_hashtag(api, hashtag, num_videos, num_comments):
    """Collect the videos of a hashtag and the comments of each video.

    Args:
        api: TikTokApi instance used for all requests; its sessions are
            closed before this coroutine returns or raises.
        hashtag: Hashtag name to search for.
        num_videos: Maximum number of videos.
        num_comments: Maximum number of comments per video.

    Returns:
        Tuple ``(results, comments_all, dates)``. ``comments_all[i]`` holds
        the comments of ``results[i]``; a video without an ``'id'`` key gets
        an empty list so that both lists stay index-aligned.
    """
    # NOTE(review): short pause kept from the original; its purpose is not
    # evident from this file.
    await asyncio.sleep(2)
    comments_all = []
    try:
        results, dates = await get_hashtag_videos(api, hashtag, num_videos)
        for video in results:
            if 'id' in video:
                comments = await get_comments(api, ms_token_tiktok, video['id'], num_comments)
            else:
                comments = []
            comments_all.append(comments)
    finally:
        # Always release the sessions, even when a request failed.
        await api.close_sessions()

    return results, comments_all, dates

def parse_tiktok_output(results, comments_all):
    """Turn raw TikTok video dicts and their comments into flat records.

    Args:
        results: List of raw video dicts. Entries without a ``'contents'``
            key are skipped.
        comments_all: List parallel to ``results``; ``comments_all[i]`` is the
            list of raw comment dicts of ``results[i]``.

    Returns:
        List of dicts with the keys ``'id'`` (running number of the parsed
        video), ``'stats'``, ``'desc'`` (the ``'desc'`` of the last entry in
        ``'contents'``, or ``''`` if there is none) and ``'comments'`` (list
        of ``[comment_language, stripped_text]`` pairs).
    """
    videos = []
    for position, result in enumerate(results):
        if 'contents' not in result:
            continue
        # Reset per video so an empty 'contents' list neither raises nor
        # inherits the description of the previous video.
        desc = ''
        for item in result['contents']:
            desc = item['desc']
        # Index by the position in `results`, not by the number of parsed
        # videos, so skipped videos do not shift the comments.
        raw_comments = comments_all[position] if position < len(comments_all) else []
        comments = [[comment['comment_language'], comment['text'].strip()] for comment in raw_comments]
        videos.append({'id': len(videos), 'stats': result['stats'], 'desc': desc, 'comments': comments})
    return videos

### Get comments from TikTok

In [7]:
# Collect the TikTok videos and comments, then reshape them into one record per
# video. This cell uses a top-level `await`, so it needs an environment that
# allows it (e.g. a notebook).
api = TikTokApi()
results_tiktok, comments_all_tiktok, dates_tt = await tiktok_get_comments_to_hashtag(api, hashtag, num_videos, num_comments)

videos_tiktok = parse_tiktok_output(results_tiktok, comments_all_tiktok)

## YouTube

In [8]:
def access_video_yt(video_id, max_results):
    """Fetch the metadata and the comment threads of one YouTube video.

    Args:
        video_id: Id of the video.
        max_results: Value passed as ``maxResults`` to the comment request.

    Returns:
        Tuple ``(stats, comments)``: the ``videos.list`` response and the
        ``commentThreads.list`` response. ``comments`` is an empty list when
        the comment request failed.
    """
    youtube = build('youtube', 'v3', developerKey=my_api_key_youtube)
    request = youtube.videos().list(part='snippet,statistics', id=video_id)
    # https://developers.google.com/youtube/v3/docs/commentThreads/list
    comments_request = youtube.commentThreads().list(part='id,snippet,replies', videoId=video_id, maxResults=max_results, order='relevance')

    stats = request.execute()
    try:
        comments = comments_request.execute()
    except Exception as err:
        # Best effort: a video whose comments cannot be fetched is kept with
        # no comments. `Exception` (not a bare except) lets KeyboardInterrupt
        # through, and the reason is reported instead of being hidden.
        comments = []
        print(f'Video with ID {video_id} cannot be scraped for comments.')
        print(f'  Reason: {err}')

    return stats, comments

def extract_infos_yt(details):
    """Return ``(title, view_count)`` of the first item in ``details['items']``."""
    first_item = details['items'][0]
    return first_item['snippet']['title'], first_item['statistics']['viewCount']

def search_videos_yt(query, max_results):
    """Search YouTube for short videos matching ``query``.

    Only videos published from 1 January of the module-level
    ``date_threshold`` year onwards are requested. Result pages are followed
    until ``max_results`` ids are collected or the API reports no further page.

    Args:
        query: Search term.
        max_results: Maximum number of video ids to return.

    Returns:
        Tuple ``(results, response)``: at most ``max_results`` video ids and
        the last raw API response.
    """
    results = []

    youtube = build('youtube', 'v3', developerKey=my_api_key_youtube)
    published_after = str(date_threshold)+'-01-01T00:00:00Z'
    # search.list documents 0-50 as the accepted range for maxResults.
    page_size = max(0, min(max_results, 50))
    page_token = None

    while True:
        # https://developers.google.com/youtube/v3/docs/search/list
        params = dict(part='id', type='video', videoDuration='short', q=query,
                      maxResults=page_size, order='relevance', publishedAfter=published_after)
        if page_token:
            params['pageToken'] = page_token
        response = youtube.search().list(**params).execute()

        items = response.get('items', [])
        results.extend(video['id']['videoId'] for video in items)

        # Stop when enough ids were collected, when there is no further page
        # (the original raised KeyError here), or when a page came back empty
        # (which would otherwise loop forever).
        page_token = response.get('nextPageToken')
        if len(results) >= max_results or not page_token or not items:
            break

    return results[:max(max_results, 0)], response

def extract_comments_yt(comments):
    """Return the stripped top-level comment texts of a comment-thread response.

    An empty ``comments`` container yields an empty list.
    """
    if len(comments) == 0:
        return []

    return [
        thread['snippet']['topLevelComment']['snippet']['textDisplay'].strip()
        for thread in comments['items']
    ]

In [None]:
# Search YouTube for the configured term; results_yt holds the video IDs and is
# written to disk at the end of the notebook.
results_yt, response = search_videos_yt(hashtag, num_videos)

# One record per video: running id, raw statistics response, title, comment texts.
videos_yt = []
for result in results_yt:
    stats, comments = access_video_yt(result, num_comments)
    title, view_count = extract_infos_yt(stats)
    videos_yt.append({
        'id': len(videos_yt),
        'stats': stats,
        'desc': title,
        'comments': extract_comments_yt(comments),
    })
count = len(videos_yt)

## Translate the comments
Using https://pypi.org/project/deep-translator/

In [None]:
# Translate the TikTok comments into English.
#   comments_tt           - all usable comments across videos
#   comments_tt_per_video - per video: its comments concatenated, each followed by ';'
#   list_per_video_tt     - per video: list of its usable comments
translator_tt = GoogleTranslator(source='auto', target='en')  # one client for all comments
comments_tt = []
comments_tt_per_video = []
list_per_video_tt = []
for video in videos_tiktok:
    per_video = []
    for language, text in video['comments']:
        comm = (text or '').strip()
        if not comm:
            # nothing to analyse in an empty comment
            continue
        if language == 'un':
            # presumably 'un' marks an undetermined language - TODO confirm;
            # such comments are only logged, not analysed
            print('un: ', comm)
            continue
        if language != 'en':
            # Use Google translator to translate everything that is not in English into English
            try:
                comm = translator_tt.translate(comm)
            except Exception as err:
                # one untranslatable comment must not abort the whole run
                print(f'Could not translate TikTok comment ({err}): {comm}')
                continue
            if not comm:
                continue
        per_video.append(comm)
    comments_tt.extend(per_video)
    comments_tt_per_video.append(''.join(comm + ';' for comm in per_video))
    list_per_video_tt.append(per_video)


# Translate the YouTube comments into English.
#   comments_yt           - all usable comments across videos
#   comments_yt_per_video - per video: its comments concatenated, each followed by ';'
#   list_per_video_yt     - per video: list of its usable comments
translator_yt = GoogleTranslator(source='auto', target='en')  # one client for all comments
comments_yt = []
comments_yt_per_video = []
list_per_video_yt = []
print('youtube')
for video in videos_yt:
    per_video = []
    for comment in video['comments']:
        comment = (comment or '').strip()
        if not comment:
            continue
        # YouTube does not deliver a language tag, so detect it first
        try:
            language = langdetect.detect(comment)
        except Exception:
            # comments whose language cannot be detected are logged and skipped
            print(comment)
            continue
        if language == 'en':
            translated = comment
        else:
            # Use Google translator to translate everything that is not in English into English
            try:
                translated = translator_yt.translate(comment)
            except Exception:
                # log and skip instead of adding an empty string to the corpus
                print(comment)
                continue
        if translated:
            per_video.append(translated)
    comments_yt.extend(per_video)
    comments_yt_per_video.append(''.join(text + ';' for text in per_video))
    list_per_video_yt.append(per_video)

In [None]:
# Drop entries that are None from both comment lists.
comments_yt = [x for x in comments_yt if x is not None]
comments_tt = [x for x in comments_tt if x is not None]

# Analyse the comments
## BERTopic together with Llama Chatbot

In [None]:
# Initialise the Llama chatbot: load model and tokenizer via mlx_lm.

model, tokenizer = load("mlx-community/Meta-Llama-3-8B-Instruct-4bit")

# System prompt sent with every request. It follows the configured hashtag
# instead of a hard-coded topic, so changing `hashtag` also changes the prompt.
SYSTEM_MSG = f"You are a helpful chatbot assistant that cares a lot about {hashtag} topics."

def generateFromPrompt(promptStr,maxTokens=100):
    """Send ``promptStr`` to the loaded chat model and return its reply.

    The request consists of the module-level ``SYSTEM_MSG`` as system message
    and ``promptStr`` as user message; ``maxTokens`` is passed on as
    ``max_tokens`` to ``generate``.
    """
    chat = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": promptStr},
    ]
    # Render the chat through the tokenizer's template and decode it again,
    # because the prompt is handed to `generate` in decoded form.
    token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True)
    rendered_prompt = tokenizer.decode(token_ids)
    return generate(model, tokenizer, prompt=rendered_prompt, max_tokens=maxTokens)


# Quick check that the model is loaded and answers.
response = generateFromPrompt("Please introduce yourself")

print(response)

In [153]:
# define BERTopic model
# Extra stop words that are added to scikit-learn's English list in run_BERTopic
# (the duplicate 'href' entry was removed).
custom_stopwords = [
    'br', 'https', 'fp', 'ft', 'bro', 'lol', 'wsh', 'fr', 'rn', '39',
    'href', 'youtube', 'www', 'com', 'quot',
]

def run_BERTopic(comments, random_state=None):
    """Fit a BERTopic model on ``comments`` and print the topics found.

    Args:
        comments: List of comment strings.
        random_state: Optional seed passed to UMAP so that runs can be
            repeated. ``None`` (the default) leaves UMAP unseeded, as before.

    Returns:
        Tuple ``(topic_model, topics_info)``: the fitted model and the result
        of ``topic_model.get_topic_info()``.
    """
    # remove stop words
    all_stopwords = list(ENGLISH_STOP_WORDS.union(custom_stopwords))
    vectorizer_model = CountVectorizer(
        stop_words=all_stopwords,
        min_df=5
    )
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    umap_model = UMAP(n_neighbors=10, n_components=10, metric='cosine', random_state=random_state)
    hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=10, metric='euclidean')

    # Initialize BERTopic with more words per topic
    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        top_n_words=15,  # Increase the number of words per topic
        verbose=True
    )
    # Fit the model
    topic_model.fit_transform(comments)

    # Get topic information
    topics_info = topic_model.get_topic_info()

    # Count the real topics explicitly: subtracting 1 from the row count is
    # wrong whenever the model produced no outlier topic (-1).
    topic_ids = [topic_num for topic_num in topics_info.Topic if topic_num != -1]
    print("Number of topics identified:", len(topic_ids))

    # Print each topic with only the words (no scores)
    for topic_num in topic_ids:
        topic_words = [word for word, _ in topic_model.get_topic(topic_num)]
        print(f"Topic {topic_num}: {', '.join(topic_words)}")

    return topic_model, topics_info

def generate_topic_names(topic_model, topics_info):
    """Ask the chatbot for a short name for every non-outlier topic.

    Args:
        topic_model: Fitted model exposing ``get_topic(topic_num)``, which
            yields ``(word, score)`` pairs.
        topics_info: Topic table with a ``Topic`` column; the outlier topic
            ``-1`` is skipped.

    Returns:
        DataFrame with the columns ``Topic``, ``Name`` (chatbot answer) and
        ``Words`` (comma-separated topic words).
    """
    topic_nums = [topic_num for topic_num in topics_info.Topic if topic_num != -1]
    topic_names = []
    topic_wordlists = []

    for topic_num in topic_nums:
        topic_words = [word for word, _ in topic_model.get_topic(topic_num)]  # Extract only words
        # Prompt wording fixed: "concise" was misspelled as "conscience".
        prompt = (
            f"The main topic is {hashtag}. Please give a not too general but concise "
            f"subtheme that unifies these words: {topic_words}. Only state the topic:"
        )
        topic_names.append(generateFromPrompt(prompt))
        topic_wordlists.append(",".join(topic_words))

    df_topics = pd.DataFrame({"Topic":topic_nums,"Name":topic_names,"Words":topic_wordlists})
    return df_topics

def count_occurance(df_topics, topic_model, comments):
    """Count in how many comments each named topic is mentioned.

    A topic counts as mentioned in a comment when its approximate topic
    probability exceeds 0.2. Topics that received the same name are merged:
    a comment counts once for the name if any of the merged topics is
    mentioned in it.

    Args:
        df_topics: DataFrame with the columns ``Name`` and ``Words``; row ``i``
            must describe topic ``i``. Double quotes are stripped from the
            ``Name`` column in place.
        topic_model: Model exposing ``approximate_distribution(comments)``.
        comments: List of comment strings.

    Returns:
        Tuple ``(topic_distr, df_topics_merged)``. ``df_topics_merged`` has
        the columns ``0`` (topic name), ``1`` (words of the first topic with
        that name) and ``'counter'`` (number of comments mentioning it).
    """
    topic_distr, _ = topic_model.approximate_distribution(comments)

    THRESHOLD = 0.2 # Probability threshold to 'count' a topic as 'mentioned' in a document

    # Clean the names with one column assignment instead of chained indexing
    # (`df['Name'][i] = ...`), which pandas does not guarantee to write through.
    df_topics['Name'] = [name.replace('"', '') for name in df_topics['Name'].tolist()]
    names = df_topics['Name'].tolist()
    words = df_topics['Words'].tolist()

    mentioned = {}    # topic name -> boolean array, one entry per comment
    merged_rows = []  # [name, words] in order of first appearance

    for topic in range(topic_distr.shape[1]):
        docs = topic_distr[:, topic] > THRESHOLD
        topic_name = names[topic]
        if topic_name in mentioned:
            # Same name as an earlier topic: merge with an element-wise OR
            # (the original called an undefined helper here).
            print(topic_name)
            mentioned[topic_name] = mentioned[topic_name] | docs
        else:
            mentioned[topic_name] = docs
            merged_rows.append([topic_name, words[topic]])

    df_topics_merged = pd.DataFrame(merged_rows)
    # count how often topic occurs in comments
    df_topics_merged['counter'] = [int(mentioned[name].sum()) for name, _ in merged_rows]

    return topic_distr, df_topics_merged

In [None]:
# Fit one topic model per platform on the English comments.
topic_model_yt, topics_info_yt = run_BERTopic(comments_yt)
topic_model_tt, topics_info_tt = run_BERTopic(comments_tt)

In [101]:
# Let the chatbot name the topics of each platform.
df_topics_yt = generate_topic_names(topic_model_yt, topics_info_yt)
df_topics_tt = generate_topic_names(topic_model_tt, topics_info_tt)

In [None]:
# Merge topics that received the same name and count their occurrences in the comments.
topic_distr_yt, df_topics_yt_merged = count_occurance(df_topics_yt, topic_model_yt, comments_yt)
topic_distr_tt, df_topics_tt_merged = count_occurance(df_topics_tt, topic_model_tt, comments_tt)

# Llama Chatbot

In [27]:
# Join all comments of each platform into one '|'-separated string.
comment_yt_con = '|'.join(comments_yt)
comment_tt_con = '|'.join(comments_tt)

## Identify Needs

In [None]:
# NOTE: if the text is too long, further iterations on split-up text may be needed
yt_need_answers = []
for video in comments_yt_per_video:
    # only videos with at least 100 characters of comment text are sent to the model
    if len(video) >= 100:
        video_prompt = f"Please identify some needs of people interacting with {hashtag} videos in the following list of comments : ' {video}'"
        yt_need_answers.append(generateFromPrompt(video_prompt, maxTokens=200))
# every per-video answer is followed by the ';NEXT;' separator
needs_yt = ''.join(answer + ';NEXT;' for answer in yt_need_answers)

response = generateFromPrompt(f"Please summarise the summaries in the following text and identify 5 needs :' {needs_yt}'", maxTokens=500)

print(f"Needs for YouTube\n {response}...")

In [None]:
# NOTE: if the text is too long, further iterations on split-up text may be needed
tt_need_answers = []
for video in comments_tt_per_video:
    # only videos with at least 100 characters of comment text are sent to the model
    if len(video) >= 100:
        video_prompt = f"Please identify some needs of people interacting with {hashtag} videos in the following list of comments: ' {video}'"
        tt_need_answers.append(generateFromPrompt(video_prompt, maxTokens=200))
# every per-video answer is followed by the ';NEXT;' separator
needs_tt = ''.join(answer + ';NEXT;' for answer in tt_need_answers)

response = generateFromPrompt(f"Please summarise the summaries in the following text and identify 5 needs : ' {needs_tt}'", maxTokens=500)

print(f"Needs for TikTok\n {response}...")

## Identify Experiences

In [None]:
# NOTE: if the text is too long, further iterations on split-up text may be needed
yt_experience_answers = []
for video in comments_yt_per_video:
    # only videos with at least 100 characters of comment text are sent to the model
    if len(video) >= 100:
        video_prompt = f"Please identify some experiences people interacting with {hashtag} videos have in the following list of comments: ' {video}'"
        yt_experience_answers.append(generateFromPrompt(video_prompt, maxTokens=200))
# every per-video answer is followed by the ';NEXT;' separator
exp_yt = ''.join(answer + ';NEXT;' for answer in yt_experience_answers)

response = generateFromPrompt(f"Please summarise the following text and identify 5 experiences: ' {exp_yt}'", maxTokens=500)

print(f"Experiences for YouTube\n {response}")

In [None]:
# NOTE: if the text is too long, further iterations on split-up text may be needed
tt_experience_answers = []
for video in comments_tt_per_video:
    # only videos with at least 100 characters of comment text are sent to the model
    if len(video) >= 100:
        video_prompt = f"Please identify some experiences people interacting with {hashtag} videos have in the following list of comments: ' {video}'"
        tt_experience_answers.append(generateFromPrompt(video_prompt, maxTokens=200))
# every per-video answer is followed by the ';NEXT;' separator
exp_tt = ''.join(answer + ';NEXT;' for answer in tt_experience_answers)

response = generateFromPrompt(f"Please summarise the summaries in the following text and identify 5 experiences : ' {exp_tt}'", maxTokens=500)

print(f"Experiences for TikTok\n {response}")

# Plots

In [None]:
def _wrap_label(label):
    """Break a label longer than 20 characters into two lines near its middle word."""
    if len(label) <= 20:
        return label
    words = label.split(' ')
    words[int(len(words)/2) - 1] += '\n'
    return ' '.join(words)


def _plot_topic_shares(ax, df_merged, n_comments, platform_name, top_n=5):
    """Draw the ``top_n`` most frequent topics as horizontal bars on ``ax``.

    ``df_merged`` needs the columns ``0`` (topic name) and ``'counter'``
    (number of comments mentioning the topic). Bar length is the share of
    ``n_comments``; each bar is labelled with the count and the percentage.
    """
    ax.set_title(f"Relative occurrence per topic\n in comments for {platform_name}", fontweight='bold')
    if df_merged.empty:
        # no topics were found, leave the axes empty
        return

    df_sorted = df_merged.sort_values(by=['counter'], ascending=False)
    counts = df_sorted['counter'].to_list()[:top_n]
    # guard against an empty comment list (division by zero)
    shares = [round(count/n_comments, 2) if n_comments else 0 for count in counts]
    labels = [_wrap_label(name) for name in df_sorted[0].to_list()[:top_n]]

    hbars = ax.barh(labels, shares, align='center')
    ax.set_yticks(labels)
    ax.invert_yaxis()  # labels read top-to-bottom, most frequent topic first
    ax.bar_label(hbars, labels=[f'{n} \n{round(100*n_pc,2)}%' for n, n_pc in zip(counts, shares)],
                 padding=-28, color='black', fontsize=8, fontweight='bold')


fig = plt.figure(layout="constrained", figsize=(12, 3))
gs = GridSpec(1, 2, figure=fig)
ax4 = fig.add_subplot(gs[1])
ax5 = fig.add_subplot(gs[0])

# 4: show topics and their count as identified by BERTopic + Llama for YouTube
# (the y-axis is now inverted here as well, so both panels list the most
# frequent topic at the top)
_plot_topic_shares(ax4, df_topics_yt_merged, len(comments_yt), "YouTube")

# 5: show topics and their count as identified by BERTopic + Llama for TikTok
_plot_topic_shares(ax5, df_topics_tt_merged, len(comments_tt), "TikTok")

fig.show()

## Save the video IDs, the date and the input parameters of the analysis
- IDs of TikTok videos
- IDs of YouTube videos

In [14]:
# Append a record of this run (timestamp, input parameters, video IDs) to the
# log file. The context manager closes the file even if a write fails.
with open("../social_analysis_videos.txt", "a", encoding="utf-8") as f:
    f.write('Analysis conducted on ' + str(datetime.now())  + '.\n')
    f.write(f'Using the hashtag/ searchterm {hashtag}, searching for maximum {num_videos} videos with maximum {num_comments} comments each.\n')

    f.write('YouTube\n')
    for result in results_yt:
        f.write(f'{result}\n')
    f.write('TikTok\n')
    for result in results_tiktok:
        # format instead of concatenating so a non-string id cannot raise TypeError
        f.write(f"{result['id']}\n")