<a href="https://colab.research.google.com/github/rivalsolmons/BTP/blob/main/TopicModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the required packages:

In [None]:
!pip install requests
## The below are for LDA and viz of the LDA
!pip install pyLDAvis
!pip install gensim
!pip install pyLDAvis.gensim_models

All of the imports required for loading videos to the API and retrieving the conversations

In [None]:
import pprint
from pprint import pprint #pretty-printing of Python data structures, used to print data in a readable format.
import requests #for making HTTP requests in Python
import glob
import os
from google.colab import files ## upload a video
import time

Create an index with the "conversation" option, since the conversation (speech) data is what we'll use for topic modeling. The cell builds two dictionaries (the request headers and the request body), sends a POST request to INDEXES_URL, stores the result in the `response` variable, and saves the new index's `_id` as `INDEX_ID`.

In [None]:
from getpass import getpass  # only this cell needs it

# Root of the Twelve Labs REST API; every endpoint in the notebook is built from it.
API_URL = "https://api.twelvelabs.io/v1.1"

# Do not hardcode the key in the notebook: read it from the environment and
# fall back to a hidden prompt so the secret never lands in the saved file.
API_KEY = os.environ.get("TWELVE_LABS_API_KEY") or getpass("Twelve Labs API key: ")

INDEXES_URL = f"{API_URL}/indexes"

INDEX_NAME = "Mr. Beast's Demo" # Use a descriptive name for your index

# Authentication header sent with every request.
headers = { "x-api-key": API_KEY }

# Request body for the new index: only the "conversation" option is enabled,
# because the transcripts are what the topic model consumes.
data = {
    "engine_id": "marengo2",
    "index_options": ["conversation"],
    "index_name": INDEX_NAME,
}

response = requests.post(INDEXES_URL, headers=headers, json=data)
print(f'Status code: {response.status_code}')
pprint(response.json())

# Stop here on an HTTP error (bad key, duplicate index name, ...) instead of
# silently continuing with INDEX_ID = None and failing in a later cell.
response.raise_for_status()
INDEX_ID = response.json().get('_id')

Uploading 4 video files for comparison

In [None]:
# Upload prompt 1 of 4 (google.colab `files`); the result is stored in
# `uploaded`. Presumably the file itself is saved to the working directory,
# since a later cell opens the videos by file name — confirm.
uploaded = files.upload()

In [None]:
# Upload prompt 2 of 4. Rebinds `uploaded`, so the value returned by the
# previous upload cell is no longer reachable through this name.
uploaded = files.upload()

In [None]:
# Upload prompt 3 of 4. Rebinds `uploaded` again; only the most recent
# return value stays bound to the name.
uploaded = files.upload()

In [None]:
# Upload prompt 4 of 4. After this cell `uploaded` refers only to the result
# of this last upload call.
uploaded = files.upload()

Loop through the uploaded files and create an indexing task (with its own task_id) for each one. (Mr. Beast, you will need to add the names of your own files here.)

In [None]:
# Endpoint that accepts video uploads and creates one indexing task per video.
TASKS_URL = f"{API_URL}/tasks"
# Names of the video files to index; replace with your own files.
file_names = ["TCPDS_Mohamed.mp4", "TCPDS_nicola2.mp4", "TCPDS_Timothy_datacollection.mp4", "yolov5_integration_announcement.mp4"]
# Collects the task id returned for each upload, in upload order.
tasks_list = []

# Form fields sent with every upload: the target index and the video language.
data = {
    "index_id": INDEX_ID,
    "language": "en"
}

for file_name in file_names:
  # `with` guarantees the file handle is closed once the upload finishes,
  # even if the request raises; the original left every handle open.
  with open(file_name, "rb") as file_stream:
    file_param = [
        ("video_file", (file_name, file_stream, "application/octet-stream")),
    ]
    response = requests.post(TASKS_URL, headers=headers, data=data, files=file_param)

  TASK_ID = response.json().get("_id")
  tasks_list.append(TASK_ID)
  print(f"Status code: {response.status_code}")
  pprint(response.json())


Polling to understand when all 4 videos are "ready"


In [None]:
# Seconds to wait between polling rounds (10 minutes, as in the original cell).
POLL_INTERVAL_SECONDS = 600

# Poll every task until all of them report "ready".
while True:
  all_ready = True
  failed_tasks = []
  for task in tasks_list:
    TASK_STATUS_URL = f"{API_URL}/tasks/{task}"
    response = requests.get(TASK_STATUS_URL, headers=headers)
    STATUS = response.json().get("status")
    print(response.json())
    if STATUS == "ready":
      print("ready")
    else:
      all_ready = False
      # NOTE(review): "failed" is assumed to be the terminal error status of
      # the task API — confirm against the Twelve Labs v1.1 reference.
      if STATUS == "failed":
        failed_tasks.append(task)
  # A failed task never becomes "ready"; without this check the loop would
  # keep sleeping and polling forever.
  if failed_tasks:
    raise RuntimeError(f"Indexing failed for task(s): {failed_tasks}")
  if all_ready:
    break
  time.sleep(POLL_INTERVAL_SECONDS)

Getting the conversation results so that we can use the text to find topics in these videos.

In [None]:
# Endpoint used to run a search against an index.
SEARCH_URL = f"{API_URL}/search"

# Search the index for the word "data", restricted to the "conversation"
# search option.
data = dict(
  query="data",
  index_id=INDEX_ID,
  search_options=["conversation"],
)

response = requests.post(SEARCH_URL, json=data, headers=headers)
print(f"Status code: {response.status_code}")
pprint(response.json())

Starting analysis of topics.  Placing conversation data in a dataframe then we'll start preprocessing.

In [None]:
import requests
import pandas as pd
import re # Load the regular expression library

In [None]:
# Authentication header used by the helper functions below; they read this
# module-level name rather than taking it as a parameter.
headers = { "x-api-key": API_KEY }

def list_videos(INDEX_ID):
  """Fetch the video listing of an index.

  Args:
    INDEX_ID: identifier of the index whose videos are requested.

  Returns:
    The JSON body of the GET response, decoded with `response.json()`.
  """
  videos_url = f'{API_URL}/indexes/{INDEX_ID}/videos'
  return requests.get(videos_url, headers=headers).json()

def retrieve_transcription(INDEX_ID, video_id):
  """Fetch the transcription of one video in an index.

  Args:
    INDEX_ID: identifier of the index that holds the video.
    video_id: identifier of the video whose transcription is requested.

  Returns:
    The JSON body of the GET response, decoded with `response.json()`.
  """
  transcription_url = f'{API_URL}/indexes/{INDEX_ID}/videos/{video_id}/transcription'
  return requests.get(transcription_url, headers=headers).json()


# Collect one row per transcription entry: [video file name, transcribed text].
videos_response = list_videos(INDEX_ID)
videos = videos_response['data']
rows = []
for video in videos:
  name = video['metadata']['filename']
  transcription_response = retrieve_transcription(INDEX_ID, video['_id'])
  for blurb in transcription_response['data']:
    rows.append([name, blurb['value']])

# Name the columns at construction time. Renaming positional columns 0/1
# afterwards produces a frame without "video"/"text" columns when `rows` is
# empty, which makes the following cell fail with a KeyError.
df = pd.DataFrame(rows, columns=["video", "text"])
df.head()



In [None]:
# Build the cleaned text column in one step: strip basic punctuation
# (, . ! ?) and then lowercase. The original computed the lowercase version
# but discarded the result, so the column kept its original casing.
df['text_processed'] = (
    df['text']
    .map(lambda x: re.sub(r'[,.!?]', '', x))  # raw string: no invalid "\." escape
    .map(lambda x: x.lower())
)
# Show the first rows, including the new column.
df.head()

Importing the libraries required for the modeling piece of this analysis

In [None]:
import gensim #NLP library for topic modeling (LDA)
from gensim import corpora, models, similarities
from gensim.models import hdpmodel, ldamodel
from gensim.utils import simple_preprocess #basic text preprocessing
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords #we'll use for removing stopwords
import gensim.corpora as corpora
import pyLDAvis #Gensim tool for LDA model visualization
import pyLDAvis.gensim_models as lda
from pprint import pprint
import pickle
import random # to set a random seed


Removing the words that aren't helpful in the analysis of topics

In [None]:
# NLTK's English stop words plus filler and conversational words that add
# nothing to the topics of these transcripts.
stop_words = stopwords.words('english') + [
    'hey', 'um', 'would', 'one', 'really', 'still', 'every', 'get', 'also', 'says',
    'like', 'need', 'make', 'go', 'uh', 'yeah', 'using', 'use', 'want', 'usually',
    'see', 'something', 'helped', 'used', 'find', 'think', 'nothing', 'related',
    'comment', 'give', 'know', 'first', 'different', 'nice', 'new', 'hub',
    'actually', 'bunch', 'going', 'say', 'quite', 'anytime', 'high', 'field',
    'works', 'things', 'always', 'set', 'bit', 'trying', 'got', 'lot', 'okay',
    'way', 'ever', 'reason', 'able', 'wondering',
]

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's `simple_preprocess`.

    Args:
        sentences: iterable of sentences; each item is converted with `str()`
            before tokenization.

    Yields:
        The token list produced by `simple_preprocess` for each sentence.
    """
    for raw_sentence in sentences:
        # deacc=True asks gensim to strip accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens


def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens listed in `stop_words`.

    Args:
        texts: iterable of documents; each one is passed through `str()` and
            `simple_preprocess` before filtering.

    Returns:
        A list with one list of remaining tokens per document.
    """
    filtered_docs = []
    for doc in texts:
        # str(doc) means a token list is re-tokenized from its string form.
        kept = [token for token in simple_preprocess(str(doc))
                if token not in stop_words]
        filtered_docs.append(kept)
    return filtered_docs

# Pull the cleaned transcript snippets out of the dataframe as a plain list.
data = df['text_processed'].values.tolist()

# Tokenize every snippet, then filter out the stop words.
data_words = remove_stopwords(list(sent_to_words(data)))
print(data_words)


In [None]:
# Tokenized documents that feed both the dictionary and the corpus.
texts = data_words
# Dictionary built from the tokens of all documents.
id2word = corpora.Dictionary(texts)
# Bag-of-words corpus: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))
# Inspect the result.
print(corpus)

In [None]:
# number of topics
num_topics = 6

# Build LDA model

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
## Print the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('.content'+str(num_topics))

if 1 == 1:
    LDAvis_prepared = lda.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './content'+ str(num_topics) +'.html')
LDAvis_prepared