In [21]:
#importing libraries
from youtube_transcript_api import YouTubeTranscriptApi
import urllib.parse
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raaidtanveer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
#fetch the transcript of the video named by the URL's "v" query parameter
url = "https://www.youtube.com/watch?v=EKvHQc3QEow&list=PL19E79A0638C8D449"
url_data = urllib.parse.urlparse(url)
query = urllib.parse.parse_qs(url_data.query)
video_id = query["v"][0]
# NOTE(review): newer youtube-transcript-api releases appear to favour an
# instance-based fetch() call over this static helper - confirm against the
# installed version.
transcript = YouTubeTranscriptApi.get_transcript(video_id)
#flatten the captions into one string: newlines inside a caption become spaces
#and every caption (including the last one) is followed by a single space
text = "".join(
    caption['text'].replace('\n', ' ') + ' '
    for caption in transcript
)

In [23]:
def generate_tfidf_vectors(text):
    """Score the words of ``text`` with TF-IDF and drop English stopwords.

    The text is fitted as a single document, so the resulting weights rank
    words relative to each other within this one text only.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``value`` (the TF-IDF weight), sorted by
        ``value`` in descending order, with NLTK's English stopwords removed
        and a fresh 0..n-1 index.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back so older installs keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    # Only one document was fitted, so row 0 holds every weight.
    weights = vectors.toarray()[0]
    scored = pd.DataFrame({'word': feature_names, 'value': weights})
    ranked = scored.sort_values(by=['value'], ascending=False)
    without_stopwords = ranked[~ranked['word'].isin(stopwords.words('english'))]
    # Return a new frame instead of resetting the index in place on a filtered
    # slice, which can raise SettingWithCopyWarning.
    return without_stopwords.reset_index(drop=True)
#function to search for sentences containing a particular word. Takes the text and word as parameters. Returns a list of sentences containing the word from the text.
def _find_sentences_with_word(raw_text, word):
    text = re.split('(?<=[.!?])',raw_text)
    neo_sentences = []
    for sent in text:
        if word in sent:
            neo_sentences.append(sent)
    return neo_sentences
#creates fill in the blank question. Takes the sentence and the word as parameter to create question. Returns fill int he blank question.
def _create_fill_in_blank(text, word):
    return text.lower().replace(word, '_____')
def question_gen(text, word):
    """Generate fill-in-the-blank questions for ``word`` from ``text``.

    Every sentence of ``text`` reported by ``_find_sentences_with_word`` is
    passed to ``_create_fill_in_blank``.

    Parameters
    ----------
    text : str
        The text to draw sentences from.
    word : str
        The word to blank out.

    Returns
    -------
    list of str
        One question per matching sentence; empty when no sentence matches.
    """
    return [
        _create_fill_in_blank(match, word)
        for match in _find_sentences_with_word(text, word)
    ]

#generate questions that blank out the highest-weighted word of the transcript
questions = question_gen(text, generate_tfidf_vectors(text)['word'].iloc[0])

In [24]:
def ranked_question_df(text, search_depth):
    """Build a ranked table of fill-in-the-blank questions for ``text``.

    For each of the ``search_depth`` highest-weighted words, every sentence
    containing the word becomes a question with that word blanked out. A
    question's score is the average TF-IDF weight per word of its sentence
    multiplied by the TF-IDF weight of the blanked word.

    Parameters
    ----------
    text : str
        The text to generate questions from.
    search_depth : int
        How many of the top-weighted words to use as answers.

    Returns
    -------
    pandas.DataFrame
        Columns ``sentences`` (the question), ``avg_tfidf_per_word``,
        ``answer`` and ``score``, with duplicate rows removed and sorted by
        ``score`` in descending order.
    """
    columns = ['sentences', 'avg_tfidf_per_word', 'answer', 'score']
    tfidf_df = generate_tfidf_vectors(text)
    top_terms = tfidf_df.iloc[:search_depth]
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the frame on every call.
    records = []
    for word, word_value in zip(top_terms['word'], top_terms['value']):
        for sentence in _find_sentences_with_word(text, word):
            # NOTE(review): tokens are split on single spaces only, so tokens
            # carrying punctuation or capitals (and the empty token produced
            # by a leading space) are counted in the denominator but never
            # match a vocabulary word. Kept as-is to preserve existing scores.
            words = set(sentence.split(' '))
            in_sentence = tfidf_df['word'].isin(words)
            avg = tfidf_df.loc[in_sentence, 'value'].sum() / len(words)
            records.append({
                'sentences': _create_fill_in_blank(sentence, word),
                'avg_tfidf_per_word': avg,
                'answer': word,
                'score': avg * float(word_value),
            })
    # Passing columns keeps the expected schema even when nothing was found.
    ranked = pd.DataFrame(records, columns=columns)
    return ranked.drop_duplicates().sort_values(by=['score'], ascending=False)

#rank questions built from the 20 highest-weighted words and keep the top five
ranked_questions = ranked_question_df(text, search_depth=20).head(n=5)

In [25]:
#map each of the first five ranked questions to its answer
{
    question: answer
    for question, answer in zip(
        ranked_questions.iloc[:5]['sentences'].tolist(),
        ranked_questions.iloc[:5]['answer'].tolist(),
    )
}

{' your _____ of y is going to get smaller and smaller and smaller.': 'change',
 ' the _____ in distance is this right over here.': 'change',
 ' your change of y is _____ to get smaller and smaller and smaller.': 'going',
 ' so our _____ in time is equal to 9.': 'change',
 ' let me write it this way, his average speed is just going to be his _____ in distance over his _____ in time.': 'change'}