# Special Sentence Extraction

Now that we can generate the concept map, calculate the cognitive load per sentence, and display text blurbs in order of increasing cognitive load as we traverse the generated learning path, let's look at extracting special types of sentences from the text. When a new concept is encountered, it must be introduced to the student in some way. Based on the student's input about which concepts they are already familiar with, further concepts may then be introduced in terms of those known concepts.


In [1]:
import nltk
from nltk.corpus import stopwords 
# English stopwords; used below to skip function words when scoring noun phrases.
stop_words = set(stopwords.words('english'))

# Base name of the processed document to analyse (alternatives kept for quick switching).
#filename = 'A Mind For Numbers_ How to Excel at Math and Science (Even If You Flunked Algebra)'
#filename = 'animal_kingdom_wiki'
filename = 'wiki_human_digestive_system'

# Placeholder; replaced by the unpickled dictionary just below.
concepts = {}
import pickle
# Loading extracted concepts from file (see concept_extraction.ipynb)
#concepts = {'sents':sents,'rawtxt':rawtxt,'sent_to_npflat':sent_to_npflat,'sent_to_tags':sent_to_tags,'sent_to_ltags':sent_to_ltags,'np_to_sent':np_to_sent,'Conceptdata':Conceptdata}
# NOTE: the file name is filename + 'concepts.pickle' with no separator in between.
# SECURITY: pickle.load can execute arbitrary code; only load pickles produced by
# your own pipeline, never files from an untrusted source.
with open('../processed_data/'+filename +'concepts.pickle', 'rb') as f:
    concepts = pickle.load(f)

# Loading idf dictionary (see Build_IDF_dictionary.ipynb)
# Looked up by lower-cased word in get_idf().
with open('../processed_data/'+'idf_dict.pickle','rb') as f1:
    idf_dict =pickle.load(f1)

# Unpack the pieces used in this notebook into module-level names.
# sents: sentence strings, indexed and sliced by sentence number below.
sents = concepts['sents']
# rawtxt: not used by the cells visible here.
rawtxt = concepts['rawtxt']
# sent_to_npflat: per sentence number, the noun-phrase strings found in that sentence.
sent_to_npflat = concepts['sent_to_npflat']
# sent_to_tags: not used by the cells visible here.
sent_to_tags= concepts['sent_to_tags']
# sent_to_ltags: per sentence number, a sequence of pairs; item[0] is compared against
# lemmas such as 'be' and item[1] against POS tags such as 'PRP' below.
sent_to_ltags = concepts['sent_to_ltags']
# np_to_sent: concept string -> sentence numbers containing it
# (presumably in ascending order; get_sentence_indices relies on that - verify).
np_to_sent = concepts['np_to_sent']
# Conceptdata: table with sort_values()/head(); has 'Concept' and 'Frequency' columns.
Conceptdata = concepts['Conceptdata']

import math



def get_idf(wrd, totaldocs=10788):
    """Return the IDF weight of a word.

    The word is lower-cased before it is looked up in ``idf_dict``. Words
    that are missing from the dictionary receive ``log(totaldocs)``.
    """
    key = wrd.lower()
    unseen_weight = math.log(totaldocs)
    return idf_dict.get(key, unseen_weight)


In [2]:
def calc_cl_per_sentence(sent_to_npflat,maxidf=9.1,include_pronouns=True,ltags=None):
    """Estimate the cognitive load each sentence adds.

    A noun phrase contributes one unit of load when its score reaches
    ``maxidf``. The score is the sum of the IDF of its non-stopword tokens,
    each divided by the total number of tokens in the phrase. When
    ``include_pronouns`` is set, every token tagged ``'PRP'`` in the same
    sentence adds one more unit.

    Args:
        sent_to_npflat: per sentence number (0..n-1), the noun-phrase strings
            of that sentence.
        maxidf: score threshold at or above which a noun phrase counts.
        include_pronouns: whether personal pronouns add to the load.
        ltags: per sentence number, a sequence of (lemma, tag) pairs used for
            the pronoun count. Defaults to the notebook-level
            ``sent_to_ltags``.

    Returns:
        A list with one integer load value per sentence, in sentence order.
    """
    if include_pronouns and ltags is None:
        ltags = sent_to_ltags
    sent_to_clt = []
    # Index by position rather than iterating directly, so that both a list
    # and a dict keyed 0..n-1 are accepted.
    for i in range(len(sent_to_npflat)):
        clt = 0
        for noun_phrase in sent_to_npflat[i]:
            tokens = noun_phrase.split(' ')
            # Stopwords add nothing but still count in the denominator.
            idf = sum(get_idf(t) / len(tokens) for t in tokens if t not in stop_words)
            if idf >= maxidf:
                clt += 1
        if include_pronouns:
            # Count pronouns of the CURRENT sentence (this used to read a
            # fixed sentence, index 24, for every sentence).
            clt += sum(1 for tok in ltags[i] if tok[1] == 'PRP')
        sent_to_clt.append(clt)
    return sent_to_clt

def plot_clt():
    """Plot the cognitive load of every sentence as a step chart.

    The x axis is the 1-based sentence number and the y axis is the load
    returned by ``calc_cl_per_sentence`` for the notebook's
    ``sent_to_npflat``. The figure is written to ``cltfig1.png``.
    """
    import matplotlib.pyplot as plt

    plt.xlabel('document sentence #')
    plt.ylabel('Load added to working memory by sentence')
    plt.title('Cognitive Load for '+filename)

    sentence_numbers = list(range(1, len(sent_to_npflat) + 1))
    loads = calc_cl_per_sentence(sent_to_npflat)
    plt.plot(sentence_numbers, loads, drawstyle='steps')

    plt.savefig('cltfig1.png')

In [3]:
# Score every sentence once; later cells read sent_to_clt by sentence number.
sent_to_clt = calc_cl_per_sentence(sent_to_npflat)
# list.index returns the first match, so ties resolve to the earliest sentence.
print('Mininum cognitive load sentence: ',sents[sent_to_clt.index(min(sent_to_clt))])
print('Maximum cognitive load sentence: ',sents[sent_to_clt.index(max(sent_to_clt))])

Mininum cognitive load sentence:  Digestion involves the breakdown of food into smaller and smaller components, until they can be absorbed and assimilated into the body.
Maximum cognitive load sentence:  This is achieved in the duodenum by the addition of bile from the gall bladder combined with the bicarbonate secretions from the pancreatic duct and also from secretions of bicarbonate-rich mucus from duodenal glands known as Brunner's glands.


In [4]:
# Draw the per-sentence cognitive-load step plot; the figure is also saved to cltfig1.png.
plot_clt()

Functions to get blurbs for two concepts


In [5]:
import pandas as pd
def calc_clt_blurb_order(tuplist, clt=None):
    """Order sentence spans by total cognitive load, lightest first.

    Args:
        tuplist: iterable of ``(first, last)`` sentence-number pairs; both
            ends are inclusive.
        clt: per-sentence load values indexed by sentence number. Defaults to
            the notebook-level ``sent_to_clt``.

    Returns:
        A list of the distinct spans from ``tuplist`` sorted by the sum of
        the load of the sentences they cover. Spans with equal load keep the
        order in which they first appeared in ``tuplist``.
    """
    if clt is None:
        clt = sent_to_clt
    # A dict removes duplicate spans while remembering first-seen order.
    tup_to_clt = {}
    for tup in tuplist:
        tup_to_clt[tup] = sum(clt[i] for i in range(tup[0], tup[1] + 1))
    # sorted() is stable, so ties are ordered deterministically.
    return sorted(tup_to_clt, key=tup_to_clt.get)
            
        

In [6]:
def get_sentence_indices(np1,np2,max_distance=3):
    """Pair up nearby sentences that mention two concepts.

    Walks the sentence-number lists of ``np1`` and ``np2`` (taken from
    ``np_to_sent``) with one cursor each. Whenever the two current sentence
    numbers are equal or at most ``max_distance`` apart, the pair is recorded
    as ``(smaller, larger)`` and both cursors advance. Otherwise only the
    cursor pointing at the smaller sentence number advances.

    Returns:
        A list of ``(first, last)`` sentence-number tuples in the order found.
    """
    first_sents = np_to_sent[np1]
    second_sents = np_to_sent[np2]
    total_first = len(first_sents)
    total_second = len(second_sents)

    pairs = []
    i = j = 0
    while i < total_first and j < total_second:
        a = first_sents[i]
        b = second_sents[j]
        if a == b or abs(a - b) <= max_distance:
            # Equal sentence numbers always pair, whatever max_distance is.
            pairs.append((a, b) if a <= b else (b, a))
            i += 1
            j += 1
        elif a < b:
            # np1's sentence is too far behind: move it forward.
            i += 1
        else:
            # np2's sentence is too far behind: move it forward.
            j += 1
    return pairs

def get_blurbs(np1,np2,max_distance=3):
    """Build text blurbs covering two concepts, lightest cognitive load first.

    Sentence spans come from ``get_sentence_indices`` and are ordered by
    ``calc_clt_blurb_order``. Each span is turned into one string by joining
    its sentences with spaces, replacing newlines with spaces and dropping
    carriage returns. The ordered spans, each span and each blurb are printed
    along the way.

    Returns:
        ``(tuplist, blurblist)``: the ordered spans and their blurb strings.
    """
    spans = get_sentence_indices(np1, np2, max_distance)
    tuplist = calc_clt_blurb_order(spans)
    print(tuplist)

    blurblist = []
    for span in tuplist:
        print(span)
        first, last = span[0], span[1]
        joined = ' '.join(sents[first:last + 1])
        text = joined.replace('\n', ' ').replace('\r', '')
        print(text)
        blurblist.append(text)
    return tuplist, blurblist

In [7]:
# Show the ten concepts with the highest 'Frequency' value.
Conceptdata.sort_values(by=['Frequency'], ascending = False).head(10)

Unnamed: 0,Concept,Occurence,Frequency,Mean,Median,Sdev
11,food,"[1, 3, 4, 6, 7, 10, 11, 15, 18, 25, 44, 46, 49...",41,0.254933,0.233146,0.202711
34,stomach,"[8, 9, 12, 13, 26, 77, 108, 127, 129, 139, 141...",38,0.505914,0.473315,0.259822
52,small intestine,"[14, 15, 25, 68, 161, 163, 205, 220, 240, 243,...",30,0.672285,0.740169,0.265817
21,mouth,"[4, 6, 20, 22, 32, 34, 38, 41, 42, 43, 44, 53,...",27,0.25,0.185393,0.251661
33,esophagus,"[8, 13, 66, 89, 115, 119, 122, 125, 126, 127, ...",26,0.395527,0.376404,0.205135
37,duodenum,"[10, 78, 145, 150, 159, 161, 203, 207, 208, 22...",23,0.617978,0.648876,0.20897
8,digestion,"[0, 1, 2, 3, 7, 9, 10, 11, 12, 15, 17, 18, 25,...",23,0.210674,0.050562,0.23568
463,bile,"[176, 192, 194, 195, 200, 201, 203, 205, 206, ...",21,0.590556,0.595506,0.043195
7,liver,"[0, 19, 147, 176, 179, 180, 181, 183, 188, 189...",19,0.516115,0.530899,0.198811
3,tongue,"[0, 5, 11, 20, 33, 48, 49, 58, 59, 61, 64, 79,...",18,0.159176,0.168539,0.08811


In [140]:
# Blurbs where 'bile' and 'newton' occur at most one sentence apart.
# NOTE(review): both concepts must be keys of np_to_sent or this raises KeyError; the
# captured output looks like it came from a different corpus than `filename` - re-run to confirm.
tuplist, blurblist = get_blurbs('bile','newton',1)

[(62418, 62419), (62540, 62541), (65774, 65774), (65881, 65881), (68995, 68996), (89267, 89267), (101567, 101568), (114753, 114754)]
(62418, 62419)
His book Philosophiæ Naturalis Principia Mathematica ("Mathematical Principles of Natural Philosophy"), first published in 1687, laid the foundations of classical mechanics. Newton also made seminal contributions to optics, and shares credit with Gottfried Wilhelm Leibniz for developing the infinitesimal calculus.
(62540, 62541)
In this work, Newton stated the three universal laws of motion. Together, these laws describe the relationship between any object, the forces acting upon it and the resulting motion, laying the foundation for classical mechanics.
(65774, 65774)
In classical mechanics, Newton's third law implies that active and passive gravitational mass must always be identical (or at least proportional), but the classical theory offers no compelling reason why the gravitational mass has to equal the inertial mass.
(65881, 65881)
In

In [6]:
# Collect the sentences of one concept whose lemma list contains 'be' anywhere in the sentence.
# NOTE: no word-window or tense filter is applied in this cell; get_def_sentence_indices does that refinement.
# Also print two ratios: be-sentences / the concept's sentences, and be-sentences / all sentences.
# np_to_sent[...] raises KeyError when the concept is not in the loaded corpus.

be_sents = [s for s in np_to_sent['classical mechanic'] if 'be' in [ltag[0] for ltag in sent_to_ltags[s]]]
print(be_sents,len(be_sents)/len(np_to_sent['classical mechanic']),len(be_sents)/len(sents))

# Cognitive load of each be-sentence, indexed by sentence number and sorted lightest first.
be_sents_clt = pd.Series([sent_to_clt[s] for s in be_sents],be_sents)
be_sents_clt.sort_values( ascending = True,inplace = True)
# Bare expression: not the last statement of the cell, so nothing is displayed for it.
be_sents_clt

# Print the be-sentences in order of increasing cognitive load.
for s in be_sents_clt.index:
    print(s,': ',sents[s])


KeyError: 'classical mechanic'

In [9]:
# find the place of the concept in the sentence (assuming only one occurrence, or working with only the first occurrence)
# get the window of ltags to consider
# get only VBP,VBZ,VBD forms of be in that window
# explore ones that only contain WDT, WP, WP$


def get_def_sentence_indices(cncpt,window=2,be_tagforms=('VBP','VBZ','VBD'),that_tagforms=('WDT','WP','WP$','TO'),concept_sents=None,ltags_by_sent=None):
    """Find sentences that look like a definition of a concept.

    A sentence qualifies when
      * the first word of ``cncpt`` occurs in the sentence's lemma list,
      * a lemma ``'be'`` tagged with one of ``be_tagforms`` lies within
        ``window`` tokens before the first occurrence of that word or
        ``window`` tokens after the span the concept would occupy, and
      * at least one tag from ``that_tagforms`` occurs anywhere in the
        sentence.

    Args:
        cncpt: concept string; words are separated by single spaces.
        window: number of tokens inspected on each side of the concept.
        be_tagforms: POS tags accepted for the ``'be'`` lemma.
        that_tagforms: POS tags of which at least one must be present.
        concept_sents: mapping from concept to sentence numbers. Defaults to
            the notebook-level ``np_to_sent``.
        ltags_by_sent: per sentence number, a sequence of (lemma, tag) pairs.
            Defaults to the notebook-level ``sent_to_ltags``.

    Returns:
        The qualifying sentence numbers, in the order they are listed for the
        concept.

    Raises:
        KeyError: if ``cncpt`` is not a key of the concept mapping.
    """
    if concept_sents is None:
        concept_sents = np_to_sent
    if ltags_by_sent is None:
        ltags_by_sent = sent_to_ltags

    cncpt_words = cncpt.split(' ')
    cncpt_first = cncpt_words[0]
    that_tags = set(that_tagforms)

    def_sents_index = []
    for s in concept_sents[cncpt]:
        lemmlist = [ltag[0] for ltag in ltags_by_sent[s]]
        taglist = [ltag[1] for ltag in ltags_by_sent[s]]
        try:
            cncptindex = lemmlist.index(cncpt_first)
        except ValueError:
            # The concept word does not appear among this sentence's lemmas
            # (for example a differing surface form); skip instead of crashing.
            continue
        lo = max(0, cncptindex - window)
        hi = min(len(lemmlist), cncptindex + window + len(cncpt_words))
        # Accept any suitably tagged 'be' in the window, not only the first one.
        has_be = any(
            lemma == 'be' and tag in be_tagforms
            for lemma, tag in zip(lemmlist[lo:hi], taglist[lo:hi])
        )
        if has_be and that_tags.intersection(taglist):
            def_sents_index.append(s)
    return def_sents_index

# Note: list.index() raises ValueError if the item is not in the list.

In [12]:
# Print the candidate definition sentences found for the concept 'mouth'.
slist = get_def_sentence_indices('mouth')
for s in slist:
    print(sents[s])

[2]
The mouth is the first part of the upper gastrointestinal tract and is equipped with several structures that begin the first processes of digestion.
Underlying the mucous membrane in the mouth is a thin layer of smooth muscle tissue and the loose connection to the membrane gives it its great elasticity.
A common gum disease in the mouth is gingivitis which is caused by bacteria in plaque.


In [104]:
# get instructing sentences that take the form of do this, then do that. (method)

# get sentences with qualifiers (rules, assumptions)

# consider extracting lists from np extractor



In [161]:
# List concept names from most to least frequent (head(100000) is just a large cap on the row count).
Conceptdata.sort_values(by='Frequency',ascending=False).head(100000)['Concept']

167                                               fabric
15                                               example
380                                                 time
27                                                number
483                                                 year
16                                                    cm
20                                                  inch
271                                                place
288                                                 edge
283                                                  end
0                                                   side
122                                                 work
1336                                                term
4644                                               woman
1360                                              result
1017                                              people
1761                                                part
458                            

In [119]:
# Scratch check: a single-word concept splits into exactly one token.
len('newton'.split(' '))

1

In [171]:
# Scratch check: list.index raises ValueError when the value is absent.
[1,2,3,4].index(9)

ValueError: 9 is not in list