# Building a concept map
### Generating a graph of linked concepts
### Recommending learning paths based on student background

In [1]:
filename = 'Cognitive_Load_Theory'

concepts = {}
import pickle
# Loading extracted concepts from file (see concept_extraction.ipynb)
#concepts = {'sents':sents,'rawtxt':rawtxt,'sent_to_npflat':sent_to_npflat,'sent_to_tags':sent_to_tags,'sent_to_ltags':sent_to_ltags,'np_to_sent':np_to_sent,'Conceptdata':Conceptdata}
with open(filename +'concepts.pickle', 'rb') as f:
    concepts = pickle.load(f)
f.close()

# Loading idf dictionary (see Build_IDF_dictionary.ipynb)
with open('idf_dict.pickle','rb') as f1:
    idf_dict =pickle.load(f1)
f1.close()

sents = concepts['sents']
rawtxt = concepts['rawtxt']
sent_to_npflat = concepts['sent_to_npflat']
sent_to_tags= concepts['sent_to_tags']
sent_to_ltags = concepts['sent_to_ltags']
np_to_sent = concepts['np_to_sent']
Conceptdata = concepts['Conceptdata']

Function to get undirected graph between two concepts, weight value is inversely proportional to number of sentences between the two concepts. 

Bond strength = summation of 1/(1+sentence distance) for every time they occur within (max - min) sentence distance

1. max_sent_dist : the maximum sentence between first concept and second concept to consider as linked
2. min_sent_dist : (typically zero) minimum sentence distance between first and second concept to consider as linked
3. sent_to_npflat: dictionary : key: (sentence number in text), value: (list of concepts in the sentence) 


In [2]:
def build_graph(sent_to_npflat,max_sent_dist, min_sent_dist):
    npnp_bondstrengthdir = {}
    for i in range(len(sent_to_npflat)-max_sent_dist):
        for np1 in sent_to_npflat[i]:
            npnp_bondstrengthdir[np1] = {}
            for j in range(min_sent_dist, max_sent_dist):
                np2list = [np2 for np2 in sent_to_npflat[i+j] if np2!=np1]
                for np2 in np2list:
                    npnp_bondstrengthdir[np1][np2] =npnp_bondstrengthdir[np1].get(np2,0) + 1/(j+1)
    return npnp_bondstrengthdir

In [3]:
# Function for term frequency
def tf(np,rawtxt):
   p = re.compile(np)
   return len(p.findall(rawtxt.lower()))        

# Function for IDF
import math
totaldocs = 10788
def get_idf(wrd,totaldocs):
    wrd = wrd.lower()
    return idf_dict.get(wrd,math.log(totaldocs))


In [4]:
# function to find shortest distance (used for graph relationship weight calculations)
import bisect as bs

def find_shortest_distance(search_list, value):
    ins_point = bs.bisect_right(search_list,value)
    if ins_point < len(search_list):
        return min(abs(search_list[ins_point] - value), abs(search_list[ins_point - 1] - value))
    return abs(search_list[ins_point - 1] - value)


In [5]:
npnp_bondstrengthdir = build_graph(sent_to_npflat,1,0)

### Building a data frame with every relationship and metrics about individual concepts and their relationship

Metrics to calculate the importance of the individual concepts, importance of the relationship between them, and directionality.

Directionality: Does knowing ConceptA aid in understanding ConceptB? 

In [6]:
import itertools
from itertools import chain
import numpy as num
Concept1 = [[np1]*len(npnp_bondstrengthdir[np1]) for np1 in npnp_bondstrengthdir.keys()]
Concept1 = list(chain.from_iterable(Concept1))
Concept2 = [np2 for np1 in npnp_bondstrengthdir.keys() for np2 in npnp_bondstrengthdir[np1].keys()]

Bondstrength = [npnp_bondstrengthdir[Concept1[i]][Concept2[i]] for i in range(len(Concept1))]

# Number of sentences in which concept occurs
FA = [len(np_to_sent[np1]) for np1 in Concept1]
FB = [len(np_to_sent[np2]) for np2 in Concept2]

# std deviation of occurence of concept: the spread - does it occur all over the document or just in one section. 
SdevA = [num.std(np_to_sent[np1]) for np1 in Concept1]
SdevB = [num.std(np_to_sent[np2]) for np2 in Concept2]

## Computing the mean bond strength of concept X to other concepts (compared with Bondstrength, gives an idea of how 
## strongly X is related to Y in comparison with other concepts)
meanBSA = [num.mean(list(npnp_bondstrengthdir[np1].values())) for np1 in Concept1]
meanBSB = [num.mean(list(npnp_bondstrengthdir.get(np2,{}).values())) for np2 in Concept2]


## Computing average shortest distance of each A to a B and vice versa. A metric for co-occurence of the two concepts
OcA = [np_to_sent[np1] for np1 in Concept1]
OcB = [np_to_sent[np2] for np2 in Concept2]

dAB=[]
dBA=[]

for i in range(len(Concept1)):
    dAB.append(num.mean([abs(find_shortest_distance(OcB[i],o)) for o in OcA[i]]))
    dBA.append(num.mean([abs(find_shortest_distance(OcA[i],o)) for o in OcB[i]]))
    

## Computing number of mappings for Concept1, Concept2 respectively and how many of those concepts intersect
Amap = [len(npnp_bondstrengthdir[np1]) for np1 in Concept1]
Bmap = [len(npnp_bondstrengthdir.get(np2,{})) for np2 in Concept2]
AmapintersectBmap = [len(set(npnp_bondstrengthdir[Concept1[i]].keys()) & set(npnp_bondstrengthdir.get(Concept2[i],{}).keys())) for i in range(len(Concept1))]

## How many concepts is A linked to that B is NOT linked to, and vice versa
AminusB = [Amap[i]-AmapintersectBmap[i] for i in range(len(Concept1))]
BminusA = [Bmap[i]-AmapintersectBmap[i] for i in range(len(Concept1))]


lensents = len(sents)
lennp = len(np_to_sent)

## First occurence of A and B wrt length of text
AfirstOc = [np_to_sent[np1][0]/lensents for np1 in Concept1]
BfirstOc = [np_to_sent[np2][0]/lensents for np2 in Concept2]


## IDF value for each concept (1-gram IDF dictionary. using average for multi word concept phrases)
nptoWtkeys = list(np_to_sent.keys())
nptoWtvals = [np.split() for np in nptoWtkeys]
nptoWt = dict(zip(nptoWtkeys,nptoWtvals))
nptoIDFvals = [num.mean([get_idf(t,totaldocs) for t in nptoWt[np1]]) for np1 in nptoWt.keys()]
nptoIDF = dict(zip(nptoWtkeys,nptoIDFvals))
IDFA = [nptoIDF[np1] for np1 in Concept1]
IDFB = [nptoIDF[np2] for np2 in Concept2]

## Seeing if one concept phrase is present in the other: example: long-term memory, memory. 
AinB = [1 if Concept1 in Concept2 else 0 for i in range(len(Concept1))]
BinA = [1 if Concept2 in Concept1 else 0 for i in range(len(Concept1))]

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [10]:
import pandas as pd
print('making into dataframe')

df = pd.DataFrame({'Concept1':Concept1,'Concept2': Concept2,'FA':FA,'FB':FB,'SdevA':SdevA,'SdevB':SdevB,'Bondstrength':Bondstrength, 'meanBSA':meanBSA, 'meanBSB':meanBSB,'dAB':dAB,'dBA':dBA,'Amap':Amap,'Bmap':Bmap,'AmapintersectBmap':AmapintersectBmap, 'AminusB':AminusB, 'BminusA':BminusA,'AfirstOc':AfirstOc,'BfirstOc':BfirstOc,'IDFA':IDFA,'IDFB':IDFB, 'AinB':AinB, 'BinA':BinA })

making into dataframe


In [16]:
# Looking at concepts with high co-occurence, and descending order of frequency

df[(df['dAB']<10) | (df['dBA']<10)].loc[:,['Concept1','Concept2','FA','FB','dAB','dBA']].sort_values(by=['FA','FB'],ascending=[False,False]).head(20)

Unnamed: 0,Concept1,Concept2,FA,FB,dAB,dBA
336,cognitive load,element interactivity,433,161,37.475751,2.913043
338,cognitive load,transaction cost,433,13,1756.86836,1.461538
339,cognitive load,memory limit,433,12,159.191686,1.833333
340,cognitive load,benefit,433,10,582.371824,4.6
341,cognitive load,off-loading information,433,1,1912.769053,0.0
774,learner,effect,420,128,22.371429,7.226562
775,learner,high element interactivity information,420,10,365.442857,6.5
373,information,element interactivity,407,161,211.302211,8.521739
380,information,modality effect,407,77,815.36855,4.064935
372,information,animation,407,71,197.53317,4.28169
