In [10]:
import os
from docx import Document
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
import time
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Load every Word document found in `path` as one plain-text string per file.
docs = []
path = "docs"
# sorted() keeps docs[i] pointing at the same file on every run; os.listdir
# returns entries in arbitrary order.
for fileName in sorted(os.listdir(path)):
    # Only .docx files can be opened by Document(); "~$" entries are the lock
    # files Word leaves next to a document that is currently open.
    if fileName.startswith("~$") or not fileName.lower().endswith(".docx"):
        continue
    document = Document(os.path.join(path, fileName))
    # Each paragraph is prefixed with one space, exactly as the incremental
    # concatenation did, so downstream text is unchanged.
    data = "".join(" " + para.text for para in document.paragraphs)
    docs.append(data)

In [8]:
class WordDOI:
    """Rank the words and adjacent word pairs of a single document.

    The document is split into sentences on ".", every sentence is reduced to
    a list of lemmas (textCleaner) and two rankings are derived from them:

    * DOI   - for each retained word, the summed number of sentences it shares
              with every other retained word (findDOI).
    * pairs - for each adjacent (previous, next) lemma pair, a score that
              combines how often the two follow each other with the DOI of
              both words (findPairs).

    getDOI() runs the whole pipeline and returns both rankings.
    """

    # A token is kept only when it consists purely of ASCII letters.
    _ALPHA_TOKEN = re.compile("[a-zA-Z]+")
    # Fine-grained tags and coarse parts of speech a token must carry to be kept.
    _TAGS = frozenset(['RB', 'RBR', 'RBS', 'JJR', 'JJ', 'JJS', 'NN', 'NNS', 'VB', 'VBG', 'VBP', 'VBN'])
    _POS = frozenset(['ADJ', 'ADV', 'NOUN', 'VERB'])

    def __init__(self, doc):
        """Store the raw sentences of `doc` and load the language resources.

        Args:
            doc: full document text; it is split into sentences on ".".
        """
        self.doc_list = doc.split(".")
        self.cleanDocList = []      # one list of lemmas per sentence
        self.dataMap = {}           # word -> set of sentence indices containing it
        self.allWords = set()       # words that passed the frequency filter
        self.deg = {}               # word -> DOI value
        self.degList = []           # (word, DOI) sorted by DOI, highest first
        self.pairList = []          # ((prev, next), score) sorted by score, highest first
        self.nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])
        self.stop = stopwords.words('english')

    # function for text cleaning
    def textCleaner(self, doc):
        """Turn one sentence into a list of lower-cased lemmas.

        Commas, periods and apostrophes are removed, tokens containing anything
        but ASCII letters are dropped, and only tokens whose tag and part of
        speech are in _TAGS / _POS survive. Stop words and lemmas of one or two
        characters are removed last.

        Args:
            doc: text of a single sentence.

        Returns:
            list of lemma strings in sentence order.
        """
        doc = re.sub("[,.']", "", doc)
        # split() breaks on any whitespace; splitting on ' ' alone left words
        # glued to a tab or newline, and those were then rejected as non-alphabetic.
        words = [token for token in doc.split() if self._ALPHA_TOKEN.fullmatch(token)]
        parsed = self.nlp(" ".join(words))
        lemmas = [
            token.lemma_.lower()
            for token in parsed
            if token.tag_ in self._TAGS and token.pos_ in self._POS
        ]
        # Built per call so later changes to self.stop are still honoured, while
        # each membership test becomes a hash lookup instead of a list scan.
        stopWords = set(self.stop)
        return [lemma for lemma in lemmas if lemma not in stopWords and len(lemma) > 2]

    # function to clean the data and store it in the object
    def cleanData(self):
        """Clean every sentence of doc_list and store the result in cleanDocList."""
        self.cleanDocList = [self.textCleaner(row) for row in self.doc_list]

    # function to create data logs like data map, all words list and store it in the object
    def createDataLog(self, freqLmt = 1):
        """Build dataMap and allWords from cleanDocList.

        Args:
            freqLmt: a word is kept in allWords only when it occurs in more
                than this many sentences.
        """
        self.dataMap = {}
        for i, sen in enumerate(self.cleanDocList):
            for word in sen:
                self.dataMap.setdefault(word, set()).add(i)

        # dataMap keeps every word; allWords keeps only the frequent ones.
        self.allWords = {
            word for word, rows in self.dataMap.items() if len(rows) > freqLmt
        }

    # function to find DOI list and store it in the object
    def findDOI(self):
        """Compute the DOI of every word in allWords and the sorted degList.

        The DOI of a word is the sum, over every other retained word, of the
        number of sentences both appear in. A sentence holding k retained words
        contributes k - 1 to each of them, so counting retained words per
        sentence yields the same totals as intersecting every pair of sets.
        """
        retainedPerSentence = {}
        for word in self.allWords:
            for idx in self.dataMap[word]:
                retainedPerSentence[idx] = retainedPerSentence.get(idx, 0) + 1

        self.deg = {
            word: sum(retainedPerSentence[idx] - 1 for idx in self.dataMap[word])
            for word in self.allWords
        }

        # Highest DOI first; equal values are ordered alphabetically so the
        # ranking does not depend on set iteration order.
        self.degList = sorted(self.deg.items(), key=lambda item: (-item[1], item[0]))

    # function to find word pairs and store it in the object
    def findPairs(self):
        """Score every adjacent (previous, next) lemma pair and sort pairList.

        For a pair (k1, k2) the score is

            P(k2 follows k1) * deg[k1] / denom + P(k1 precedes k2) * deg[k2] / denom

        with denom = len(allWords) / (deg[k1] + deg[k2]). Both probabilities are
        the pair count divided by the total occurrences of the conditioning
        word. Pairs whose two words both have DOI 0 are skipped.
        """
        nextCount = {}      # prev -> {next: times next directly follows prev}
        prevCount = {}      # next -> {prev: times prev directly precedes next}
        wordCount = {}      # word -> total occurrences

        for sen in self.cleanDocList:
            for word in sen:
                wordCount[word] = wordCount.get(word, 0) + 1
            for prev, word in zip(sen, sen[1:]):
                followers = nextCount.setdefault(prev, {})
                followers[word] = followers.get(word, 0) + 1
                leaders = prevCount.setdefault(word, {})
                leaders[prev] = leaders.get(prev, 0) + 1

        self.pairList = []
        vocabSize = len(self.allWords)
        for k1, followers in nextCount.items():
            for k2, count in followers.items():
                deg1 = self.deg.get(k1, 0)
                deg2 = self.deg.get(k2, 0)
                degSum = deg1 + deg2
                if degSum == 0:
                    continue

                denom = vocabSize/degSum
                k1Val = (count/wordCount[k1])*(deg1/denom)
                k2Val = (prevCount[k2][k1]/wordCount[k2])*(deg2/denom)

                self.pairList.append(((k1, k2), (k1Val+k2Val)))

        # Stable sort: pairs with equal scores stay in first-seen order.
        self.pairList.sort(key=lambda x : x[1], reverse=True)

    @staticmethod
    def _topN(ranked, N):
        """Return the first N entries of `ranked`.

        None or any negative N means "everything"; an N larger than the list
        prints a notice and returns the whole list.
        """
        if N is None or N < 0:
            return ranked
        if N > len(ranked):
            print(f'{N} exceeds list size (Top {len(ranked)} returned)')
            return ranked
        return ranked[:N]

    def getTopNDOI(self, N=-1):
        """Return the N highest (word, DOI) entries; N=-1 returns all of them."""
        return self._topN(self.degList, N)

    def getTopNPairs(self, N=-1):
        """Return the N highest ((prev, next), score) entries; N=-1 returns all."""
        return self._topN(self.pairList, N)

    # run all the required processes and then return top N DOI and top N pairs
    def getDOI(self, NDOI=-1, NPairs=-1):
        """Run cleaning, logging, DOI and pair scoring, then return the rankings.

        Each stage prints how long it took. If a stage raises, the error is
        printed, the remaining stages are skipped and ([], []) is returned.

        Args:
            NDOI: number of DOI entries to return (-1 for all).
            NPairs: number of pair entries to return (-1 for all).

        Returns:
            tuple (top DOI list, top pair list).
        """
        # (stage, prefix of the timing message, wording used in the error message)
        stages = (
            (self.cleanData, "Time taken in cleaning", "cleaning"),
            (self.createDataLog, "Time taken in logging", "logging"),
            (self.findDOI, "Time taken in DOI", "DOI"),
            (self.findPairs, "time taken in finding pairs", "finding pairs"),
        )

        for stage, timingLabel, errorLabel in stages:
            try:
                start_time = time.time()
                stage()
                print(f"{timingLabel} {int((time.time() - start_time))} seconds ")
            except Exception as e:
                print(f"Error while {errorLabel} the data")
                print("Exception occured - {}".format(e))
                print("Stoping the process")
                return [], []

        return self.getTopNDOI(NDOI), self.getTopNPairs(NPairs)

In [12]:
# Analyse the first loaded document. Only the single top DOI word is requested;
# NPairs keeps its default of -1, so the complete ranked pair list is returned
# and displayed as the cell output.
ww = WordDOI(docs[0])
ww.getDOI(NDOI=1)

Time taken in cleaning 1 seconds 
Time taken in logging 0 seconds 
Time taken in DOI 0 seconds 
time taken in finding pairs 0 seconds 


([('interview', 365)],
 [(('code', 'interview'), 1190.6107142857143),
  (('job', 'interview'), 495.6140172101449),
  (('prepare', 'code'), 274.61680194805194),
  (('interview', 'course'), 262.2160218253968),
  (('course', 'prepare'), 258.66747835497836),
  (('dynamic', 'programming'), 256.8863636363636),
  (('course', 'learn'), 189.88095238095238),
  (('good', 'course'), 146.7049486461251),
  (('online', 'course'), 145.30158730158732),
  (('prepare', 'interview'), 137.11122159090908),
  (('programming', 'job'), 127.74471343873518),
  (('link', 'join'), 121.76618303571428),
  (('structure', 'algorithm'), 118.12500000000001),
  (('join', 'course'), 115.72256944444445),
  (('interview', 'question'), 108.5966517857143),
  (('course', 'also'), 104.36344461697723),
  (('datum', 'structure'), 97.03125),
  (('access', 'course'), 96.79577380952381),
  (('course', 'code'), 93.53083900226757),
  (('programming', 'interview'), 91.88238636363637),
  (('code', 'question'), 91.73405612244898),
  (('c