In [1]:
#importing libraries
from youtube_transcript_api import YouTubeTranscriptApi
import urllib.parse
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords
import requests
import json
import random
from pywsd.similarity import max_similarity
from pywsd.lesk import adapted_lesk
from pywsd.lesk import simple_lesk
from pywsd.lesk import cosine_lesk
from nltk.corpus import wordnet as wn
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raaidtanveer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Warming up PyWSD (takes ~10 secs)...took 12.80709719657898 secs.


In [2]:
#fetching youtube video transcript
url = "https://www.youtube.com/watch?v=2ujmYuSYzSk"
url_data = urllib.parse.urlparse(url)
query = urllib.parse.parse_qs(url_data.query)
video_id = query["v"][0]
transcript = YouTubeTranscriptApi.get_transcript(video_id)
text = ""
for d in transcript:
    txt = d['text'].replace('\n', ' ')
    text += txt + ' '

In [3]:
#function to generate tfidf vectors for words in the text excluding stopwords. Takes only text parameter. Returns a pandas dataframe with the words and their respective tfidf vector values.
def generate_tfidf_vectors(text):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    df = pd.DataFrame({'word':feature_names,'value':denselist[0]})
    sorted_df = df.sort_values(by=['value'], ascending=False)
    final_df = sorted_df[~sorted_df['word'].isin(stopwords.words('english'))]
    final_df.reset_index(inplace=True,drop=True)
    return final_df
#function to return a dictionary of format {word:[sentences]}. Takes text and word to search as parameters. 
def _find_sentences_with_words(raw_text, words):
    sentences = []
    for w in words:
        text = re.split('(?<=[.!?])',raw_text)
        neo_sentences = []
        for sent in text:
            if w in sent:
                neo_sentences.append(sent)
        sentences.append(neo_sentences)
    dictionary = dict.fromkeys(words, sentences)
    for key in dictionary:
        flat_list = []
        for sublist in dictionary[key]:
            for item in sublist:
                flat_list.append(item)
        dictionary[key] = flat_list
    return dictionary
    
d = _find_sentences_with_words(text, generate_tfidf_vectors(text)['word'])

In [4]:
# Distractors from Wordnet
def get_distractors_wordnet(syn,word):
    distractors=[]
    word= word.lower()
    orig_word = word
    if len(word.split())>0:
        word = word.replace(" ","_")
    hypernym = syn.hypernyms()
    if len(hypernym) == 0: 
        return distractors
    for item in hypernym[0].hyponyms():
        name = item.lemmas()[0].name()
        #print ("name ",name, " word",orig_word)
        if name == orig_word:
            continue
        name = name.replace("_"," ")
        name = " ".join(w.capitalize() for w in name.split())
        if name is not None and name not in distractors:
            distractors.append(name)
    return distractors

def get_wordsense(sent,word):
    word= word.lower()
    
    if len(word.split())>0:
        word = word.replace(" ","_")
    synsets = wn.synsets(word,'n')
    if synsets:
        #wup = max_similarity(sent, word, 'wup', pos='n')
        #adapted_lesk_output =  adapted_lesk(sent, word, pos='n')
        #lowest_index = min(synsets.index(wup),synsets.index(adapted_lesk_output))
        return synsets[0]
    else:
        return None

# Distractors from http://conceptnet.io/
def get_distractors_conceptnet(word):
    word = word.lower()
    original_word= word
    if (len(word.split())>0):
        word = word.replace(" ","_")
    distractor_list = [] 
    url = "http://api.conceptnet.io/query?node=/c/en/%s/n&rel=/r/PartOf&start=/c/en/%s&limit=5"%(word,word)
    obj = requests.get(url).json()

    for edge in obj['edges']:
        link = edge['end']['term'] 

        url2 = "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10"%(link,link)
        obj2 = requests.get(url2).json()
        for edge in obj2['edges']:
            word2 = edge['start']['label']
            if word2 not in distractor_list and original_word.lower() not in word2.lower():
                distractor_list.append(word2)
                   
    return distractor_list
#getting a dictionary of format {keyword:[distractors]}
key_distractor_list = {}
for keyword in d:
    wordsence = get_wordsense
for keyword in d:
    wordsense = get_wordsense(d[keyword][0], keyword)
    if wordsense:
        distractors = get_distractors_wordnet(wordsense,keyword)
        if len(distractors) ==0:
            distractors = get_distractors_conceptnet(keyword)
        if len(distractors) != 0:
            key_distractor_list[keyword] = distractors
    else:
        distractors = get_distractors_conceptnet(keyword)
        if len(distractors) != 0:
            key_distractor_list[keyword] = distractors

In [5]:
#finds a list of sentences from raw_text containing a particular word. Function takes raw text and word to look for as parameters. Returns a list of sentences containing word.
def _find_sentences_with_word(raw_text, word):
    text = re.split('(?<=[.!?])',raw_text)
    neo_sentences = []
    for sent in text:
        if word in sent:
            neo_sentences.append(sent)
    return neo_sentences

#creating distractor words
def _create_distractor(text, word):
    replace_word = ""
    for w in key_distractor_list[word]:
        if (w.lower() != word.lower()):
            replace_word = w
            break
    tf = round(np.random.random())
    if tf==0:
        answer = True
        replace_word = word.lower()
    else:
        answer = False
    return (text.replace(word, replace_word.lower()), answer)

#creates a ranked pandas dataframe of questions. Ranks based on the average tfidf vector value per word multiplied by the tfidf vector value for the substituted word. Takes the text and search depth as parameter, search depth being the number of words with the highest tfidf vector values to use to generate questions. Returns a ranked dataframe of questions with their respective answers, average tfidf vector values per word in the sentence and scores(average tfidf vector value per word multiplied by the tfidf vector value for the substituted word). 
def ranked_question_df(text):
    sentences = pd.DataFrame(columns=['sentences','avg_tfidf_per_word', 'answer', 'score'])
    tfidf_df = generate_tfidf_vectors(text)
    for word in key_distractor_list:
        for sentence in _find_sentences_with_word(text, word):
            words = set(sentence.split(' '))
            avg = tfidf_df[tfidf_df['word'].isin(words)]['value'].sum()/len(words)
            sentences = sentences.append(pd.Series({'sentences':_create_distractor(sentence, word)[0], 'avg_tfidf_per_word':avg, 'answer':_create_distractor(sentence, word)[1], 'score': avg * float(tfidf_df[tfidf_df['word']==word]['value'])}), ignore_index=True)
    return sentences.drop_duplicates().sort_values(by=['score'], ascending=False)

questions = ranked_question_df(text)

In [6]:
questions.head()

Unnamed: 0,sentences,avg_tfidf_per_word,answer,score
64,I think a big one here would be kind of an in...,0.015525,False,0.001087
130,I really like this memorial.,0.01549,True,0.000939
194,I think a big binary digit here would be kind...,0.015525,True,0.000899
89,And then the tone I would say is pigeonhole o...,0.012543,False,0.000879
28,"All absolute, so that is one short answer que...",0.011988,False,0.000856


In [7]:
dict(zip(questions.iloc[:5]['sentences'].tolist(),questions.iloc[:5]['answer'].tolist() ))

{' I think a big one here would be kind of an increase in anti-immigrant sentiment or nativist sentiment.': False,
 ' I really like this memorial.': True,
 ' I think a big binary digit here would be kind of an increase in anti-immigrant sentiment or nativist sentiment.': True,
 " And then the tone I would say is pigeonhole of somber, respectful, it's even pigeonhole of quiet.": False,
 ' All absolute, so that is one short answer question.': False}