In [12]:
from  youtube_transcript_api import YouTubeTranscriptApi as yt

import string
import re

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, PorterStemmer

In [3]:
video_ids = ['Dywaz0H7U84', 'pCmJ8wsAS_w', 'TQMayZS9o1U', 'rf_EQvubKlk', 'hh3BKTFE1dc']

In [4]:
video_details = {'video_id': [], 'transcript': []}

for i in video_ids:
    transcript = yt.get_transcript(i)
    video_details['video_id'].append(i)
    final_script = ''

    for t in transcript:
        final_script += f"{t['text']} "

    video_details['transcript'].append(final_script)

In [5]:
transcript_df = pd.DataFrame(data=video_details)

In [None]:
transcript_df.to_csv("S:/Dissertation 2023/Stock market analysis/stock_market_strategy_analysis/data_files/transcript.csv", header=True)

In [6]:
transcript_df.head()

Unnamed: 0,video_id,transcript
0,Dywaz0H7U84,hello everyone welcome to tradeline in today's...
1,pCmJ8wsAS_w,"Bollinger Bands\nIn this video, I m going to s..."
2,TQMayZS9o1U,if you want to learn how to use the RSI indica...
3,rf_EQvubKlk,MACD Strategy\nThis MACD strategy I m about ...
4,hh3BKTFE1dc,in this video i'm revealing a simple and profi...


In [7]:
def clean_transcript(text):
    text = re.sub(r'(\xa0|\n)', ' ', text)
    return text

In [9]:
transformed_scripts = []
for script in transcript_df['transcript']:
    transformed_scripts.append(clean_transcript(script))

transcript_df['transcript'] = transformed_scripts

In [8]:
transcript_df

Unnamed: 0,video_id,transcript
0,Dywaz0H7U84,hello everyone welcome to tradeline in today's...
1,pCmJ8wsAS_w,"Bollinger Bands In this video, I m going to sh..."
2,TQMayZS9o1U,if you want to learn how to use the RSI indica...
3,rf_EQvubKlk,MACD Strategy This MACD strategy I m about t...
4,hh3BKTFE1dc,in this video i'm revealing a simple and profi...


In [24]:
def create_freq_table(text, upper_thresh, lower_thresh):
    # get the stop words for english language
    stop_words = set(stopwords.words("english"))

    # separate the text into individual words
    words = word_tokenize(text)

    # initialize the object for stemmer class
    stemmer = PorterStemmer()

    # empty dictionary to store the word count of every word in the text
    frequency_table = {}

    """Stem the words and check if they are in the stop words. If not, increase the count of that word in the frequency table"""
    for w in words:
        w = stemmer.stem(w)

        if w in stop_words:
            continue
        if w in frequency_table:
            frequency_table[w] += 1
        else:
            frequency_table[w] = 1

    # get the maximum frequency value from the entire text
    max_frequency = float(max(frequency_table.values()))

    freq_copy = frequency_table.copy()

    for key in freq_copy.keys():
        frequency_table[key] = frequency_table[key]/max_frequency

        if frequency_table[key] >= upper_thresh or frequency_table[key] <= lower_thresh:
            del frequency_table[key]

    return frequency_table

In [15]:
print(create_freq_table(transformed_scripts[0]))

{'hello': 1, 'everyon': 1, 'welcom': 1, 'tradelin': 1, 'today': 2, "'s": 12, 'video': 8, 'want': 3, 'share': 1, 'simpl': 2, 'veri': 1, 'profit': 2, 'strategi': 4, 'use': 7, 'combin': 1, 'three': 2, 'popular': 1, 'free': 1, 'indic': 17, 'order': 4, 'success': 2, 'must': 8, 'consid': 1, 'sever': 1, 'import': 1, 'factor': 1, 'explain': 1, 'thi': 15, 'sure': 4, 'watch': 2, 'full': 1, 'make': 3, "n't": 2, 'miss': 2, 'anyth': 1, 'befor': 1, 'go': 8, 'ani': 4, 'show': 1, 'us': 3, 'support': 1, 'help': 2, 'produc': 1, 'lot': 2, 'new': 2, 'content': 1, 'hit': 1, 'subscrib': 2, 'button': 1, 'notif': 1, 'bell': 1, 'onli': 2, 'two': 2, 'click': 3, 'mean': 1, 'addit': 1, 'would': 3, 'like': 3, 'point': 1, 'appli': 1, 'wide': 1, 'rang': 2, 'asset': 1, 'class': 1, 'includ': 1, 'fork': 1, 'stock': 1, 'crypto': 1, 'time': 1, 'frame': 1, 'without': 1, 'ado': 1, 'let': 7, 'get': 1, 'start': 2, 'usual': 1, 'tradingu.com': 1, 'technic': 1, 'analysi': 1, "'re": 2, 'set': 10, 'alright': 1, 'add': 6, 'chart':

In [25]:
print(create_freq_table(transformed_scripts[0], 0.7, 0.1))

{'today': 0.11764705882352941, 'video': 0.47058823529411764, 'want': 0.17647058823529413, 'simpl': 0.11764705882352941, 'profit': 0.11764705882352941, 'strategi': 0.23529411764705882, 'use': 0.4117647058823529, 'three': 0.11764705882352941, 'order': 0.23529411764705882, 'success': 0.11764705882352941, 'must': 0.47058823529411764, 'sure': 0.23529411764705882, 'watch': 0.11764705882352941, 'make': 0.17647058823529413, "n't": 0.11764705882352941, 'miss': 0.11764705882352941, 'go': 0.47058823529411764, 'ani': 0.23529411764705882, 'us': 0.17647058823529413, 'help': 0.11764705882352941, 'lot': 0.11764705882352941, 'new': 0.11764705882352941, 'subscrib': 0.11764705882352941, 'onli': 0.11764705882352941, 'two': 0.11764705882352941, 'click': 0.17647058823529413, 'would': 0.17647058823529413, 'like': 0.17647058823529413, 'rang': 0.11764705882352941, 'let': 0.4117647058823529, 'start': 0.11764705882352941, "'re": 0.11764705882352941, 'set': 0.5882352941176471, 'add': 0.35294117647058826, 'chart':

In [30]:
from heapq import nlargest

def create_summary(text, n):
    sentences = sent_tokenize(text)

    words = word_tokenize(sentences[0])

    frequency = create_freq_table(text, 0.7, 0.1)
    print(frequency)
    rank = {}

    for word in words:
        if word in frequency:
            rank[word] += frequency[word]
        else:
            rank[word] = frequency[word]

    word_ids = get_ranking(rank, n)
    print(word_ids)

def get_ranking(text_rank, num):
    return nlargest(num, text_rank, key=text_rank.get)

In [31]:
create_summary(transformed_scripts[0], 50)

{'today': 0.11764705882352941, 'video': 0.47058823529411764, 'want': 0.17647058823529413, 'simpl': 0.11764705882352941, 'profit': 0.11764705882352941, 'strategi': 0.23529411764705882, 'use': 0.4117647058823529, 'three': 0.11764705882352941, 'order': 0.23529411764705882, 'success': 0.11764705882352941, 'must': 0.47058823529411764, 'sure': 0.23529411764705882, 'watch': 0.11764705882352941, 'make': 0.17647058823529413, "n't": 0.11764705882352941, 'miss': 0.11764705882352941, 'go': 0.47058823529411764, 'ani': 0.23529411764705882, 'us': 0.17647058823529413, 'help': 0.11764705882352941, 'lot': 0.11764705882352941, 'new': 0.11764705882352941, 'subscrib': 0.11764705882352941, 'onli': 0.11764705882352941, 'two': 0.11764705882352941, 'click': 0.17647058823529413, 'would': 0.17647058823529413, 'like': 0.17647058823529413, 'rang': 0.11764705882352941, 'let': 0.4117647058823529, 'start': 0.11764705882352941, "'re": 0.11764705882352941, 'set': 0.5882352941176471, 'add': 0.35294117647058826, 'chart':

KeyError: 'hello'