# RAKE (RAPID AUTOMATIC KEYWORD EXTRACTION ALGORITHM) Implementation

In [None]:

"""Implementation of Rapid Automatic Keyword Extraction algorithm.
As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
"""
import re
import string
from collections import Counter, defaultdict
from itertools import chain, groupby, product

import nltk
from enum import Enum
from nltk.tokenize import wordpunct_tokenize


class Metric(Enum):
    """Word-scoring schemes that the ranking step can choose from."""

    # Score a word by d(w)/f(w), its degree divided by its frequency.
    DEGREE_TO_FREQUENCY_RATIO = 0
    # Score a word by its degree d(w) only.
    WORD_DEGREE = 1
    # Score a word by its frequency f(w) only.
    WORD_FREQUENCY = 2


class Rake(object):
    """Rapid Automatic Keyword Extraction Algorithm.

    Candidate phrases are the runs of consecutive words that remain after the
    text is cut at delimiter characters, stopwords and punctuation. Each word
    is scored from its frequency and its co-occurrence degree, and a phrase's
    score is the sum of the scores of its words.
    """

    # Stoplist that is read when the constructor receives no ``stopwords``.
    DEFAULT_STOPWORD_FILE = '/kaggle/input/smartstoplists/SmartStoplist.txt'

    # Characters at which input text is cut into fragments before tokenizing.
    # Compiled once here instead of on every extraction call.
    SENTENCE_DELIMITERS = re.compile(
        u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]'
    )

    def __init__(
        self,
        stopwords=None,
        punctuations=None,
        language="english",
        ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
        max_length=4,
        min_length=1,
    ):
        """Constructor.
        :param stopwords: List of Words to be ignored for keyword extraction.
                          When None, the words are read from
                          ``DEFAULT_STOPWORD_FILE``; if that file cannot be
                          opened, NLTK's stopwords for ``language`` are used.
        :param punctuations: Punctuations to be ignored for keyword extraction.
                             Defaults to ``string.punctuation``.
        :param language: Language of the NLTK stopword list used as fallback.
        :param ranking_metric: ``Metric`` member used to score words; any
                               other value selects the degree to frequency
                               ratio.
        :param max_length: Maximum limit on the number of words in a phrase
        :param min_length: Minimum limit on the number of words in a phrase
        """
        # By default use degree to frequency ratio as the metric.
        if isinstance(ranking_metric, Metric):
            self.metric = ranking_metric
        else:
            self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO

        # If stopwords are not provided, load the default stoplist.
        if stopwords is None:
            stopwords = self._load_default_stopwords(language)
        self.stopwords = stopwords

        # If punctuations are not provided we ignore all punctuation symbols.
        self.punctuations = punctuations
        if self.punctuations is None:
            self.punctuations = string.punctuation

        # All things which act as sentence breaks during keyword extraction.
        self.to_ignore = set(chain(self.stopwords, self.punctuations))

        # Assign min or max length to the attributes
        self.min_length = min_length
        self.max_length = max_length

        # Stuff to be extracted from the provided text.
        self.frequency_dist = None
        self.degree = None
        self.rank_list = None
        self.ranked_phrases = None

    @classmethod
    def _load_default_stopwords(cls, language):
        """Read the default stoplist, falling back to NLTK's list.
        :param language: Language name passed to NLTK when the stoplist file
                         cannot be opened.
        :return: List of stopword strings.
        """
        try:
            # ``with`` closes the file; the original loop leaked the handle.
            with open(cls.DEFAULT_STOPWORD_FILE) as stoplist:
                return [
                    word
                    for line in stoplist
                    if line.strip()[0:1] != "#"  # skip comment lines
                    for word in line.split()  # in case more than one per line
                ]
        except OSError:
            # The stoplist path only exists in one environment; keep the class
            # usable elsewhere by honouring the ``language`` argument.
            return nltk.corpus.stopwords.words(language)

    def extract_keywords_from_text(self, text):
        """Method to extract keywords from the text provided.
        :param text: Text to extract keywords from, provided as a string.
        """
        sentences = nltk.tokenize.sent_tokenize(text)
        self.extract_keywords_from_sentences(sentences)

    def extract_keywords_from_sentences(self, sentences):
        """Method to extract keywords from the list of sentences provided.
        :param sentences: Text to extract keywords from, provided as a list
                          of strings, where each string is a sentence. A
                          single string is also accepted and treated as one
                          sentence.
        """
        # Existing callers pass a raw string; iterating it directly would
        # walk over single characters.
        if isinstance(sentences, str):
            sentences = [sentences]

        # Cut every sentence at the delimiter characters. This works on the
        # argument itself, not on a global that happens to be named ``text``.
        fragments = []
        for sentence in sentences:
            fragments.extend(self.SENTENCE_DELIMITERS.split(sentence))

        phrase_list = self._generate_phrases(fragments)
        self._build_frequency_dist(phrase_list)
        self._build_word_co_occurance_graph(phrase_list)
        self._build_ranklist(phrase_list)

    def get_ranked_phrases(self):
        """Method to fetch ranked keyword strings.
        :return: List of strings where each string represents an extracted
                 keyword string, or None before any extraction has run.
        """
        return self.ranked_phrases

    def get_ranked_phrases_with_scores(self):
        """Method to fetch ranked keyword strings along with their scores.
        :return: List of ``(score, phrase)`` tuples sorted in descending
                 order, or None before any extraction has run.
        """
        return self.rank_list

    def get_word_frequency_distribution(self):
        """Method to fetch the word frequency distribution in the given text.
        :return: Counter mapping each word to its number of occurrences in
                 the contender phrases, or None before any extraction.
        """
        return self.frequency_dist

    def get_word_degrees(self):
        """Method to fetch the degree of words in the given text. Degree can be
        defined as sum of co-occurances of the word with other words in the
        given text.
        :return: Mapping from word to degree, or None before any extraction.
        """
        return self.degree

    def _build_frequency_dist(self, phrase_list):
        """Builds frequency distribution of the words in the given body of text.
        :param phrase_list: Collection of string tuples where each tuple is a
                            collection of words which form a contender phrase.
        """
        self.frequency_dist = Counter(chain.from_iterable(phrase_list))

    def _build_word_co_occurance_graph(self, phrase_list):
        """Builds the co-occurance graph of words in the given body of text to
        compute degree of each word.
        :param phrase_list: Collection of string tuples where each tuple is a
                            collection of words which form a contender phrase.
        """
        co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
        for phrase in phrase_list:
            # For each phrase in the phrase list, count co-occurances of the
            # word with other words in the phrase (the word itself included,
            # because the product pairs every word with every word).
            #
            # Note: Keep the co-occurances graph as is, to help facilitate its
            # use in other creative ways if required later.
            for (word, coword) in product(phrase, phrase):
                co_occurance_graph[word][coword] += 1
        self.degree = defaultdict(lambda: 0)
        for key in co_occurance_graph:
            self.degree[key] = sum(co_occurance_graph[key].values())

    def _build_ranklist(self, phrase_list):
        """Method to rank each contender phrase using the formula
              phrase_score = sum of scores of words in the phrase.
              word_score = d(w)/f(w) where d is degree and f is frequency
                           (or d(w) / f(w) alone, depending on the metric).
        :param phrase_list: Collection of string tuples where each tuple is a
                            collection of words which form a contender phrase.
        """
        self.rank_list = []
        for phrase in phrase_list:
            rank = 0.0
            for word in phrase:
                if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
                    rank += 1.0 * self.degree[word] / self.frequency_dist[word]
                elif self.metric == Metric.WORD_DEGREE:
                    rank += 1.0 * self.degree[word]
                else:
                    rank += 1.0 * self.frequency_dist[word]
            self.rank_list.append((rank, " ".join(phrase)))
        # Highest score first; equal scores are ordered by phrase text.
        self.rank_list.sort(reverse=True)
        self.ranked_phrases = [ph[1] for ph in self.rank_list]

    def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.
        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase. Being a set, a phrase
                 that occurs several times is kept only once.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list

    def _get_phrase_list_from_words(self, word_list):
        """Method to create contender phrases from the list of words that form
        a sentence by dropping stopwords and punctuations and grouping the left
        words into phrases. Only phrases in the given length range (both limits
        inclusive) would be considered to build co-occurrence matrix.
        :param word_list: List of words which form a sentence when joined in
                          the same order.
        :return: List of contender phrases that are formed after dropping
                 stopwords and punctuations.
        """
        # groupby yields alternating runs of kept / ignored words; only the
        # runs of kept words (key True) are candidate phrases.
        groups = groupby(word_list, lambda x: x not in self.to_ignore)
        phrases = [tuple(group[1]) for group in groups if group[0]]
        return [
            phrase
            for phrase in phrases
            if self.min_length <= len(phrase) <= self.max_length
        ]

In [None]:
# Build a RAKE extractor with its default settings.
r = Rake()
# Sample paragraph used for the demo; the TextRank demo cell reuses `text`.
text='A compiler is a computer program that translates computer code written in one programming language (the source language) into another language (the target language). The name compiler is primarily used for programs that translate source code from a high-level programming language to a lower level language (e.g., assembly language, object code, or machine code) to create an executable program.A compiler is likely to perform many or all of the following operations: preprocessing, lexical analysis, parsing, semantic analysis (syntax-directed translation), conversion of input programs to an intermediate representation, code optimization and code generation. Compilers implement these operations in phases that promote efficient design and correct transformations of source input to target output. Program faults caused by incorrect compiler behavior can be very difficult to track down and work around; therefore, compiler implementers invest significant effort to ensure compiler correctness.'
# Sentence-tokenize the paragraph and rank its candidate phrases.
r.extract_keywords_from_text(text)
# Last expression of the cell: the (score, phrase) list, highest score first.
r.get_ranked_phrases_with_scores()

# TextRank Algorithm Implementation

In [None]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy pipeline once at module level; TextRank4Keyword below uses
# this shared `nlp` object both to parse text and to flag stop words.
nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text by ranking words on a co-occurrence graph.

    ``analyze`` builds an undirected graph whose nodes are the selected words
    and whose edges join words that appear within a sliding window of each
    other, then iterates a damped PageRank-style update over it. The final
    weights are stored in ``node_weight``.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # word -> weight, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag spaCy's STOP_WORDS plus ``stopwords`` as stop words.

        The flags are set on the vocabulary of the shared ``nlp`` pipeline.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of ``doc``, the words worth ranking.

        A token is kept when its part-of-speech tag is in ``candidate_pos``
        and it is not a stop word. Words are lower-cased only when ``lower``
        is exactly True.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with a candidate POS tag.
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index in first-seen order."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Return the distinct ordered word pairs that share a window.

        Each word is paired with the up to ``window_size - 1`` words that
        follow it in the same sentence. Pairs are listed in first-seen order.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        for sentence in sentences:
            for i, word in enumerate(sentence):
                last = min(i + window_size, len(sentence))
                for j in range(i + 1, last):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Return the symmetric adjacency matrix normalized by column sum.

        Columns whose sum is zero (words with no neighbours) stay all zero.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be given: with `where` alone
        # the entries of the skipped (zero-sum) columns are uninitialized.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Print the ``number`` highest-weighted keywords, one per line.

        ``analyze`` must have been called first so that ``node_weight`` is set.
        """
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        # Slice to exactly `number` entries; the previous `i > number` check
        # ran after printing and let two extra keywords through.
        for key, value in ranked[:max(number, 0)]:
            print(key + ' - ' + str(value))

    def analyze(self, text,
                candidate_pos=None,
                window_size=4, lower=False, stopwords=None):
        """Rank the words of ``text`` and store the result in ``node_weight``.

        :param text: String to analyze.
        :param candidate_pos: POS tags of the words to rank; None means
                              ``['NOUN', 'PROPN']``.
        :param window_size: Size of the co-occurrence window, in words.
        :param lower: When True, words are lower-cased before ranking.
        :param stopwords: Extra stop words on top of spaCy's; None means none.
        """
        # None sentinels replace the former mutable default arguments.
        if candidate_pos is None:
            candidate_pos = ['NOUN', 'PROPN']
        if stopwords is None:
            stopwords = []

        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or `steps` is reached.
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = sum(pr)
            if abs(previous_pr - total) < self.min_diff:
                break
            previous_pr = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [None]:
# Rank the nouns and proper nouns of the sample paragraph `text` (defined in
# the RAKE demo cell) and print the top-weighted words.
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)



# Keyword Extraction from pdfs

    Keywords are extracted from the pdfs in the directory /kaggle/input/fileuploads



In [None]:
# NOTE: the PyPI package named "fitz" is not PyMuPDF. Installing it places an
# unrelated module under the same `fitz` name and breaks `import fitz`, so it
# is deliberately not installed. The `fitz` module used in this notebook comes
# from the PyMuPDF install in the next cell.

In [None]:
!pip install PyMuPDF

In [None]:
import sys
import fitz
import os
import re 
import csv
import json

     
# Start a fresh keywords.csv holding only the header row. One row per input
# file is appended below.
with open('keywords.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["filename", "filetype", "keyword"])

# A single extractor is enough: every extract_* call rebuilds its state, and
# constructing Rake re-reads the stoplist file.
r = Rake()

for root, dirs, files in os.walk("/kaggle/input/fileuploads"):
    for filename in files:
        # Join with `root` so files inside sub-directories resolve too.
        path = os.path.join(root, filename)

        # Open outside the main try block: when opening fails there is no
        # document to close, so the file is simply reported and skipped.
        try:
            doc = fitz.open(path)
        except Exception as e:
            print(e)
            continue

        keywordList = []
        print(filename+" : ")
        try:
            for pgno, page in enumerate(doc, start=1):
                print("page :", pgno)
                # `get_text` is the current PyMuPDF name; fall back to the
                # older camelCase spelling on installations that lack it.
                text = page.get_text() if hasattr(page, "get_text") else page.getText()

                # RAKE: keep the phrase (index 1) of the 15 best-scored
                # (score, phrase) tuples, or of all of them if fewer.
                r.extract_keywords_from_sentences(text)
                keywords = r.get_ranked_phrases_with_scores()
                for score, phrase in keywords[:15]:
                    keywordList.append(re.sub(r'[^\w]', ' ', str(phrase)))

                print(keywordList)
                print(len(keywordList))

                # TextRank: add the 10 highest-weighted words not yet listed.
                tr4w = TextRank4Keyword()
                tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
                ranked_words = sorted(tr4w.node_weight.items(), key=lambda t: t[1], reverse=True)
                for key, value in ranked_words[:10]:
                    if key not in keywordList:
                        keywordList.append(key)
        except Exception as e:
            # Best effort: a file that fails part-way is reported and gets no
            # CSV row, as before.
            print(e)
            continue
        finally:
            doc.close()

        print(keywordList)
        print(len(keywordList))
        # The keyword list is stored as a JSON string in a single CSV cell.
        stringlist = json.dumps(keywordList)

        with open('keywords.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([filename, "pdf", stringlist])
                        

        

In [None]:
import os
import cv2
import tempfile
import subprocess
import json
def ocr(path):
    """Run the `tesseract` command-line tool on an image and return its text.

    :param path: Path of the image file handed to tesseract.
    :return: Contents of the text file tesseract produced.
    :raises FileNotFoundError: If the `tesseract` executable is missing or it
                               produced no output file.
    """
    # Only a unique base name is needed, so the handle is closed right away.
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        output_base = temp.name
    # tesseract appends ".txt" to the output base name it is given.
    output_path = output_base + '.txt'

    try:
        # Output is captured and discarded, so nothing reaches the notebook.
        subprocess.run(
            ['tesseract', path, output_base],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        with open(output_path, 'r') as handle:
            return handle.read()
    finally:
        # Remove both temp files even when tesseract failed part-way.
        for leftover in (output_path, output_base):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
# `re` and `csv` are used below; import them here so this cell does not depend
# on the PDF cell having been run first.
import csv
import re

# One extractor is reused for every frame: each extract_* call rebuilds its
# state, and constructing Rake re-reads the stoplist file.
r = Rake()

for root, dirs, files in os.walk("/kaggle/input/abcdefg"):
    for filename in files:
        # Process the file actually found by the walk. The previous version
        # overwrote both names with a fixed 'test.mp4', so the same video was
        # processed once per directory entry.
        path = os.path.join(root, filename)
        cam = cv2.VideoCapture(path)
        if not cam.isOpened():
            # Not a readable video: skip it instead of writing an empty row.
            print("Could not open video: " + path)
            cam.release()
            continue

        currentframe = 0
        ar = []  # distinct keywords of this video, in first-seen order

        try:
            while True:
                # reading from frame
                ret, frame = cam.read()
                if not ret:
                    # no frames left
                    break

                # Only every 100th frame is OCR-ed.
                if currentframe % 100 == 0:
                    name = 'frame.jpg'
                    # writing the extracted image so tesseract can read it
                    cv2.imwrite(name, frame)
                    text = ocr(name)
                    print("Extracting frame: "+str(currentframe))
                    r.extract_keywords_from_sentences(text)
                    keywords = r.get_ranked_phrases_with_scores()
                    # Keep the 5 best-scored phrases, or all if fewer.
                    for ranked in keywords[:5]:
                        print(str(ranked))
                        key = re.sub(r'[^\w]', ' ', str(ranked[1]))
                        if key not in ar:
                            ar.append(key)

                currentframe = currentframe + 1
        finally:
            # Free the capture handle even if OCR or extraction raised.
            cam.release()

        # The keyword list is stored as a JSON string in a single CSV cell.
        stringlist = json.dumps(ar)
        with open('keywords.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([filename, "video", stringlist])
        print(ar)


In [None]:
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Install bert-as-service
!pip install bert-serving-server
!pip install bert-serving-client

In [None]:
# Download and unzip the pre-trained model
!wget http://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

In [None]:
# Start the BERT server as a background child process, pointing it at the
# model directory unzipped in the previous cell.
# NOTE(review): stdout is piped but never read in the visible cells; if the
# server writes much to stdout the pipe could fill up and stall it - confirm.
bert_command = 'bert-serving-start -model_dir /kaggle/working/uncased_L-12_H-768_A-12'
process = subprocess.Popen(bert_command.split(), stdout=subprocess.PIPE)

In [None]:
# Start the BERT client. `bc.encode(...)` is used by the cells below to turn
# lists of strings into embedding vectors.
# NOTE(review): presumably this needs the server started above to be up and
# listening on the client's default address - confirm.
from bert_serving.client import BertClient
bc = BertClient()

In [None]:

# Load the stop-word list (lines starting with "#" are comments). `with`
# closes the file; the bare `open()` in the loop header leaked the handle.
stop_words = []
stop_word_file = '/kaggle/input/smartstoplists/SmartStoplist.txt'
with open(stop_word_file) as stoplist:
    for line in stoplist:
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(str(word))

# Small demo: find which sample keyword is closest to the query word under
# cosine similarity of the BERT embeddings.
keywords_extracted = ["princess", "chess", "onam", "mahabali", "football"]
qn = "fifa"

# Embed the query; encode() takes a list and returns one vector per item.
query_vec = bc.encode([qn])[0]
qnvector = np.array(query_vec)

m = -1  # best similarity so far; cosine similarity is never below -1
mkey = keywords_extracted[0]
embeddings = bc.encode(keywords_extracted)
for keyword, keyvector in zip(keywords_extracted, embeddings):
    cosinesim = np.sum(qnvector*keyvector)/(np.sqrt(np.sum(qnvector**2))*np.sqrt(np.sum(keyvector**2)))
    if cosinesim > m:
        m = cosinesim
        mkey = keyword
print(mkey)

In [None]:
import pandas as pd

# Load the stop-word list (lines starting with "#" are comments). `with`
# closes the file; the bare `open()` in the loop header leaked the handle.
stop_words = []
stop_word_file = '/kaggle/input/smartstoplists/SmartStoplist.txt'
with open(stop_word_file) as stoplist:
    for line in stoplist:
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(str(word))

# NOTE: keywords.csv is written with a header row, and passing `names=` makes
# pandas read that header as data row 0. The cells below depend on this: they
# skip index 0 and put a placeholder string at position 0 of the vector lists.
col_names = ['filename', 'filetype', 'keyword']
df = pd.read_csv("keywords.csv", names=col_names)
filenames = (df.filename).tolist()
filetypes = (df.filetype).tolist()
# Each entry after the header row is a JSON-encoded list of keyphrases.
keywords_extracted = df['keyword']

print (keywords_extracted)

In [None]:
import pickle

# Load the pre-pickled GloVe table; the cells below look words up in it with
# `.get(word)` and use the returned value as the word's vector.
# SECURITY NOTE: unpickling can execute arbitrary code. Only load this file
# when the dataset it comes from is trusted.
with open("../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl","rb") as f:
    embeddings_dict_glove = pickle.load(f)
    
# Number of entries in the loaded table.
print(len(embeddings_dict_glove))

In [None]:
import pandas as pd
import json
# Collect, for every file, the GloVe vectors of the words in its keyphrases.
# Position 0 holds a placeholder string so the list lines up with `df`, whose
# row 0 is the CSV header row.
keyvector1 = np.zeros((300,))
keyvectors = ['glove_vectors']
docNum = 0
for doc in keywords_extracted:
    is_header_row = docNum == 0
    docNum += 1
    if is_header_row:
        continue

    # `doc` is the JSON-encoded list of keyphrases of one file.
    keyvectorsofdoc = []
    lst = json.loads(doc)
    for phrase in lst:
        for word in phrase.split():
            if word in stop_words:
                continue
            vector = embeddings_dict_glove.get(word)
            if vector is None:
                # word has no GloVe entry
                continue
            keyvector1 = vector
            print(keyvector1)
            keyvectorsofdoc.append(keyvector1)

    keyvectors.append(keyvectorsofdoc)
    print(keyvectorsofdoc)

           

In [None]:
# Sanity check: this must equal the number of rows of `df` (header row
# included) for the column assignment in the next cell to succeed.
len(keyvectors)

In [None]:
# Attach the per-file GloVe vectors as a new column; entry 0 is the
# placeholder string that lines up with the header row.
df['glove_vectors']=keyvectors

In [None]:
# Display the DataFrame with the new column.
df

In [None]:
# BERT keyphrase vectors: encode every file's keyphrase list in one call.
# Position 0 holds a placeholder string so the list lines up with `df`, whose
# row 0 is the CSV header row.
bertkeyvectors = ['bertvectors']
docNum = 0
for doc in keywords_extracted:
    is_header_row = docNum == 0
    if not is_header_row:
        # `doc` is the JSON-encoded list of keyphrases of one file.
        lst = json.loads(doc)
        keyembeddings = bc.encode(lst)
        bertkeyvectors.append(keyembeddings)
    docNum = docNum + 1
        
        
        
        
    

In [None]:
# Sanity check: this must equal the number of rows of `df` (header row
# included) for the column assignment in the next cell to succeed.
len(bertkeyvectors)

In [None]:
# Attach the per-file BERT vectors as a new column; entry 0 is the
# placeholder string that lines up with the header row.
df['bertvectors']=bertkeyvectors

In [None]:
# Display the DataFrame with both vector columns.
df

In [None]:

# Persist the table, without the index column.
# NOTE(review): the two vector columns hold Python lists / arrays, which CSV
# can only store as their text representation; presumably they cannot be read
# back as numbers without extra parsing - confirm before relying on this file.
df.to_csv('keywords_with_vectors.csv',index=False)

In [None]:
#col_names = ['filename','filetype','keyword','glovevectors','bervectors']
#df=pd.read_csv("keywords_with_vectors.csv", names=col_names)

In [None]:
import json
# Questions to be mapped to the file whose keywords match them best.
qns= ["what are autotrophs",
      "How many planets in solar system", 
      "Which all are the animal species in forest",
      "a game with bat and ball",
      "J K Rowling's novel based film series",
      "Famous monuments",
      "What is Greek Mythology",
      "Who is Princess of Wales",
      "Russian Nuclear disaster",
      "One Kerala festival",
      "Virus infection 2019",
      "A 8 8 indoor board game",
      "Translates computer code written",
      "First black president of United State ",
      "Nothern lights",
      "Quit india movement"]

filenames = (df.filename).tolist()
filetypes = (df.filetype).tolist()
glovevectors = df['glove_vectors']
bertvectors = df['bertvectors']


def _cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``."""
    return np.dot(u, v) / (np.sqrt(np.dot(u, u)) * np.sqrt(np.dot(v, v)))


# Question vectors using BERT: one vector per question, in question order.
qn_vectors = bc.encode(qns)

# Question vectors using GloVe: the mean of the vectors of the question's
# non-stop words. The word list is rebuilt for every question (the previous
# accumulator was never reset, so each question inherited its predecessors).
# A question without any known word gets None, so this list always stays
# aligned with `qns`.
qn_vectors1 = []
for qn in qns:
    word_vectors = [
        embeddings_dict_glove.get(word)
        for word in qn.split()
        if word not in stop_words and embeddings_dict_glove.get(word) is not None
    ]
    qn_vectors1.append(np.mean(word_vectors, axis=0) if word_vectors else None)

with open('qnToPdfMap.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Question", "FileName", "FileType"])

    for question, qnvector, qnvector1 in zip(qns, qn_vectors, qn_vectors1):
        print("Q: ", question)

        # Best score and file found by each of the three methods:
        #   m  - BERT, mean cosine similarity over a file's keyphrases
        #   m1 - BERT, single best keyphrase
        #   m2 - GloVe, mean cosine similarity over a file's keyword vectors
        m = m1 = m2 = -1
        mfilename = mfiletype = ''
        mfilename1 = mfiletype1 = ''
        mfilename2 = mfiletype2 = ''

        # Row 0 of `df` is the CSV header row, so real files start at 1.
        for docNum in range(1, len(df)):
            # GloVe (skipped when the question has no GloVe vector)
            if qnvector1 is not None:
                glove_sims = [_cosine_similarity(qnvector1, keyvector1)
                              for keyvector1 in glovevectors[docNum]]
                if glove_sims and (sum(glove_sims) / len(glove_sims)) > m2:
                    m2 = sum(glove_sims) / len(glove_sims)
                    mfilename2 = filenames[docNum]
                    mfiletype2 = filetypes[docNum]

            # BERT
            bert_sims = [_cosine_similarity(qnvector, keyvector)
                         for keyvector in bertvectors[docNum]]
            if bert_sims:
                best_sim = max(bert_sims)
                if best_sim > m1:
                    m1 = best_sim
                    mfilename1 = filenames[docNum]
                    mfiletype1 = filetypes[docNum]
                mean_sim = sum(bert_sims) / len(bert_sims)
                if mean_sim > m:
                    m = mean_sim
                    mfilename = filenames[docNum]
                    mfiletype = filetypes[docNum]

        filetypesdict = {mfilename: mfiletype,
                         mfilename1: mfiletype1,
                         mfilename2: mfiletype2}
        cosinesimlist = [m, m1, m2]
        fileslist = [mfilename, mfilename1, mfilename2]

        # Majority vote between the three methods; when all three disagree,
        # the method with the highest similarity wins.
        votes = {}
        for mappedfile in fileslist:
            votes[mappedfile] = votes.get(mappedfile, 0) + 1
        maxcnt = max(votes.values())
        if maxcnt >= 2:
            winners = [name for name, count in votes.items() if count == maxcnt]
        else:
            maxcosinesim = max(cosinesimlist)
            winners = [name for name, sim in zip(fileslist, cosinesimlist)
                       if sim == maxcosinesim]

        filenamewritten = ''
        filetypewritten = ''
        # On a tie every winner is printed and the last one is written.
        for winner in winners:
            print("File: ", winner)
            filenamewritten = winner
            filetypewritten = filetypesdict[filenamewritten]
            print("FileType: ", filetypewritten)

        print(" ")

        writer.writerow([question, filenamewritten, filetypewritten])


In [None]:
'''
import json

qns= ["How many planets in solar system", 
      "Which all are the animal species in forest",
      "a game with bat and ball",
      "J K Rowling's novel based film series",
      "Famous monuments",
      "What is Greek Mythology",
      "Who is Princess of Wales",
      "Russian Nuclear disaster",
      "One Kerala festival",
      "Virus infection 2019",
      "A 8 8 indoor board game",
      "Translates computer code written",
      "First black president of United State ",
      "Nothern lights",
      "Quit india movement"]

with open('qnToPdfMap.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Question", "FileName", "FileType"])
#BERT 
query_vec = np.zeros((768,))
keyvector = np.zeros((768,))
#get question vectors using bert
qn_vectors= bc.encode(qns)
                             
    

qNum = 0
docNum=0


#get question vectors using GloVe
query_vec1 = np.zeros((300,))
keyvector1 = np.zeros((300,))
qn_vectors1 = []

for qn in qns:
    number = 0
    for word in qn.split():
        if word not in stop_words:
            if embeddings_dict_glove.get(word) is not None:
                query_vec1= query_vec1 + embeddings_dict_glove.get(word)
                number = number + 1
    if number!=0:
        query_vec1 = query_vec1/number
        #qnvector=np.array(query_vec)
        qn_vectors1.append(query_vec1)

    

    

gloveiterator=0
    

qNum = 0
docNum=0




for qnvector in qn_vectors:
    qnvector1=qn_vectors1[gloveiterator]
    
    print("Q: ",qns[qNum])
    m=-1
    m1=-1
    m2=-1
    docNum=0
    #store mapped file using BERT avg cosine sim
    mfilename='' 
    mfiletype=''
    #store mapped file using BERT without avg cosine sim
    mfilename1=''
    mfiletype1=''
    #store mapped file using GLOVE avg cosine sim
    mfilename2=''
    mfiletype2=''
    docNum=0
    
   
    for doc in keywords_extracted:
        #doc now hast the list of keywords of File 
        cosineSimSum2=0
        numWords2=0
        if(docNum!=0):
            #convert the set of keyphrases stored as string back to list
            lst = json.loads(doc)
            #glove
            #go through all the keyphrases corresponding to docNUM 
            for phrase in lst:
                for word in phrase.split():
                            if word not in stop_words:
                                if embeddings_dict_glove.get(word) is not None:
                                    #print("word: ", word)
                                    keyvector1 = embeddings_dict_glove.get(word)     
                                    cosinesim = np.sum(qnvector1*keyvector1)/(np.sqrt(np.dot(qnvector1,qnvector1)) * np.sqrt(np.dot(keyvector1,keyvector1)))
                                    cosineSimSum2 += cosinesim
                                    numWords2 +=1
                                    
            if (numWords2!=0) and ((cosineSimSum2/numWords2) > m2):
                m2 = cosineSimSum2/numWords2
                mfilename2 = filenames[docNum]
                mfiletype2 = filetypes[docNum]
            #bert
            keyembeddings=bc.encode(lst)
            numWords=0
            cosineSimSum=0
            for keyvector in keyembeddings:               
                #cosinesim = np.dot(qnvector, keyvector) / (np.sqrt(np.dot(qnvector,qnvector)) * np.sqrt(np.dot(keyvector,keyvector)))
                cosinesim = np.sum(qnvector*keyvector)/(np.sqrt(np.sum(qnvector**2))*np.sqrt(np.sum(keyvector**2)))
                if cosinesim > m1:
                    m1=cosinesim
                    #print(m)
                    mfilename1 = filenames[docNum]
                    #print(docNum)
                    mfiletype1 = filetypes[docNum]

                cosineSimSum += cosinesim
                numWords +=1
            if (numWords!=0) and ((cosineSimSum/numWords) > m):
                m = cosineSimSum/numWords
                mfilename = filenames[docNum]
                mfiletype = filetypes[docNum]
        docNum+=1
        
    mappedict={}
    cosinesimlist=[m,m1,m2]
    fileslist=[mfilename,mfilename1,mfilename2]
    for mappedfile in fileslist:
        if mappedfile in mappedict.keys():
            mappedict[mappedfile]=mappedict[mappedfile]+1
        else:
            mappedict[mappedfile]=1
    maxcnt=max(mappedict.values())
    if maxcnt>=2:
        for k,v in mappedict.items():
            if v==maxcnt:
                print("File: ",k)
    else:
        maxcosinesim=max(cosinesimlist)
        for i in range(3):
            if cosinesimlist[i]==maxcosinesim:
                print("File: ",fileslist[i])
                
                
    
    
    print(m,m1,m2)
    
    print("File: ",mfilename)
    print("File: ",mfilename1)
    print("File: ",mfilename2)
  
    
    print(" ")
    
    
    with open('qnToPdfMap.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([qns[qNum], mfilename, mfiletype])
    qNum = qNum + 1 
    gloveiterator=gloveiterator+1
'''

In [None]:
'''
filename = "keywords_squad.csv"
# opening the file with w+ mode truncates the file
f = open(filename, "w+")
f.close()
'''

# Testing

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE(review): because this loop runs at notebook top level it rebinds the
# global names `dirname`, `filenames` and `filename`. The disabled mapping
# cell earlier in this file indexes `filenames` per document, so confirm that
# running this cell does not clobber a list that other cells still rely on.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the QA dataset CSV into a DataFrame. The extraction cell further down
# reads the value at column position 1 of each row as the text to process.
test=pd.read_csv('/kaggle/input/squad-csv-format/QA Dataset.csv')

In [None]:
# Preview the first rows of the loaded DataFrame to check its columns.
test.head()

In [None]:
# Show how many rows the loaded DataFrame contains.
len(test)

In [None]:
import csv
import json
import re

# Build one keyword list per dataset row and append it to a CSV file.
#
# For every row, the value in column position 1 is run through RAKE (top
# ranked phrases) and then through TextRank (top weighted nouns / proper
# nouns that RAKE did not already produce). The combined list is serialised
# with json.dumps and written as one CSV row: [row index, "text", keywords].
#
# Names used here but defined in other cells of this notebook:
# `test` (DataFrame), `Rake`, `TextRank4Keyword`, `OrderedDict`.

MAX_RAKE_KEYWORDS = 15      # keep at most this many top-ranked RAKE phrases
MAX_TEXTRANK_KEYWORDS = 10  # add at most this many new TextRank keywords

squad_questions = []

# Iterate the second column by position; `.iloc` makes the positional access
# explicit instead of relying on integer-key fallback of Series indexing.
for i, text in enumerate(test.iloc[:, 1]):
    print(i)

    # NOTE(review): the original line was the incomplete `append[]` (a syntax
    # error). Collecting `text` is an assumption -- confirm which column this
    # list is meant to hold.
    squad_questions.append(text)

    # --- RAKE ---------------------------------------------------------------
    r = Rake()
    # NOTE(review): `text` is passed unchanged to a method whose name suggests
    # it takes a list of sentences; confirm against the Rake class defined
    # earlier in this file.
    r.extract_keywords_from_sentences(text)
    keywords = r.get_ranked_phrases_with_scores()

    # Each entry is indexed as (score, phrase): take the phrase (index 1) for
    # every kept entry. The original read index 0 (the score) whenever there
    # were 15 or fewer results, which stored numbers instead of phrases.
    # Non-word characters in a phrase are replaced by spaces.
    keywordList = [
        re.sub(r'[^\w]', ' ', str(entry[1]))
        for entry in keywords[:MAX_RAKE_KEYWORDS]
    ]

    print(keywordList)
    print(len(keywordList))

    # --- TextRank -----------------------------------------------------------
    tr4w = TextRank4Keyword()
    tr4w.analyze(text, candidate_pos=['NOUN', 'PROPN'], window_size=4, lower=False)
    # Highest-weighted nodes first.
    node_weight = OrderedDict(
        sorted(tr4w.node_weight.items(), key=lambda t: t[1], reverse=True)
    )
    added = 0
    for key in node_weight:
        if added >= MAX_TEXTRANK_KEYWORDS:
            break
        # Only count keywords that RAKE has not already contributed.
        if key not in keywordList:
            keywordList.append(key)
            added += 1

    # --- Persist ------------------------------------------------------------
    stringlist = json.dumps(keywordList)
    # The file is re-opened in append mode for every row, so rows already
    # processed are on disk if the loop is interrupted.
    # NOTE(review): 'keywords_squaad.csv' differs from the 'keywords_squad.csv'
    # named in the disabled truncate cell earlier in this file -- confirm
    # which spelling is intended before changing either.
    with open('keywords_squaad.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([str(i), "text", stringlist])
        
    
    
    