# Building a concept map
### Generating a graph of linked concepts
### Recommending learning paths based on student background

First we load extracted concepts from file. 

In [1]:
# Base name of the processed text; swap with the commented alternative to analyse another book.
#filename = 'A Mind For Numbers_ How to Excel at Math and Science (Even If You Flunked Algebra)'
filename = 'CPTSD'
import pickle

# Directory holding the pickles written by the upstream notebooks.
DATA_DIR = '../processed_data/'


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The ``with`` block closes the file on exit, so no explicit ``close()`` is needed.
    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use this on
    files produced by this project's own notebooks.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Extracted concepts (see concept_extraction.ipynb). Layout when it was saved:
#concepts = {'sents':sents,'rawtxt':rawtxt,'sent_to_npflat':sent_to_npflat,'sent_to_tags':sent_to_tags,'sent_to_ltags':sent_to_ltags,'np_to_sent':np_to_sent,'Conceptdata':Conceptdata}
# (an 'inv_eqn_dict' entry is also read below)
concepts = _load_pickle(DATA_DIR + filename + 'concepts.pickle')

# IDF dictionary (see Build_IDF_dictionary.ipynb)
idf_dict = _load_pickle(DATA_DIR + 'idf_dict.pickle')

# Per-text concept map and concept importance tables
dfconceptmap = _load_pickle(DATA_DIR + filename + 'conceptmap.pickle')
dfConceptimp = _load_pickle(DATA_DIR + filename + 'conceptimp.pickle')

# Unpack the concept dictionary into the module-level names used by later cells.
sents = concepts['sents']
rawtxt = concepts['rawtxt']
sent_to_npflat = concepts['sent_to_npflat']
sent_to_tags = concepts['sent_to_tags']
sent_to_ltags = concepts['sent_to_ltags']
np_to_sent = concepts['np_to_sent']
Conceptdata = concepts['Conceptdata']
inv_eqn_dict = concepts['inv_eqn_dict']

In [2]:
# function to find shortest distance (used for graph relationship weight calculations)
import bisect as bs

def find_shortest_distance(search_list, value):
    """Return the smallest absolute difference between ``value`` and an element of ``search_list``.

    ``search_list`` is searched with ``bisect``, so it is assumed to be sorted in
    ascending order. Only the two elements on either side of the insertion point
    are compared.
    """
    right = bs.bisect_right(search_list, value)
    # Distance to the element just left of the insertion point.
    left_gap = abs(search_list[right - 1] - value)
    if right >= len(search_list):
        # value is at or beyond the last element: only the left neighbour exists.
        return left_gap
    return min(abs(search_list[right] - value), left_gap)

Function to build an undirected graph between two concepts. The edge weight is inversely proportional to the number of sentences separating the two concepts.

Bond strength = $\sum \frac{1}{1 + d}$, summed over every co-occurrence of the two concepts whose sentence distance $d$ lies between `min_sent_dist` and `max_sent_dist`.

1. max_sent_dist : the maximum sentence distance between the first and second concept for them to be considered linked
2. min_sent_dist : (typically zero) the minimum sentence distance between the first and second concept for them to be considered linked
3. sent_to_npflat: dictionary : key: (sentence number in text), value: (list of concepts in the sentence) 


In [3]:
# function to find shortest distance (used for graph relationship weight calculations)
import bisect as bs

def find_shortest_distance(search_list, value):
    """Return the smallest absolute difference between ``value`` and an element of ``search_list``.

    Parameters
    ----------
    search_list : sequence of numbers, assumed sorted ascending (required by ``bisect``).
    value : number to locate in ``search_list``.

    Returns
    -------
    ``abs(nearest - value)`` where ``nearest`` is the element of ``search_list``
    closest to ``value``.

    Raises
    ------
    IndexError
        If ``search_list`` is empty.
    """
    # This function is also defined in the preceding cell; this later definition
    # is the one in effect after Run All.
    if len(search_list) == 0:
        raise IndexError("find_shortest_distance: search_list is empty")
    ins_point = bs.bisect_right(search_list, value)
    if ins_point == 0:
        # value precedes every element: the first element is the nearest one.
        # Handled explicitly instead of relying on index -1 wrapping to the end.
        return abs(search_list[0] - value)
    if ins_point == len(search_list):
        # value is at or beyond the last element.
        return abs(search_list[-1] - value)
    # value lies between two neighbours; pick the closer one.
    return min(abs(search_list[ins_point] - value), abs(search_list[ins_point - 1] - value))


In [4]:
import pandas as pd
import numpy as num

# Select the concepts to cluster: FA above its 99th percentile and IDFA greater than 4.
fa_cutoff = num.quantile(dfConceptimp['FA'], 0.99)
is_selected = (dfConceptimp['FA'] > fa_cutoff) & (dfConceptimp['IDFA'] > 4)
nplist = sorted(dfConceptimp.loc[is_selected, 'Concept1'])

# Enumerate every ordered (Concept1, Concept2) pair: Concept2 cycles through the
# sorted list, while Concept1 holds each concept repeated in a contiguous run.
Concept2 = nplist * len(nplist)
Concept1 = list(Concept2)
Concept1.sort()
print(len(Concept1), len(Concept2))

## Average shortest distance from each occurrence of A to an occurrence of B, and
## vice versa: a metric for co-occurrence of the two concepts.
OcA = [np_to_sent[first] for first in Concept1]
OcB = [np_to_sent[second] for second in Concept2]

# find_shortest_distance already returns an absolute value, so no extra abs() is needed.
# If dAB is small, then for most occurrences of A there is an occurrence of B nearby.
dAB = [
    num.mean([find_shortest_distance(occ_b, pos) for pos in occ_a])
    for occ_a, occ_b in zip(OcA, OcB)
]
# If dBA is small, then for most occurrences of B there is an occurrence of A nearby.
dBA = [
    num.mean([find_shortest_distance(occ_a, pos) for pos in occ_b])
    for occ_a, occ_b in zip(OcA, OcB)
]

# One row per ordered pair, plus the smaller of the two directional distances.
df = pd.DataFrame(
    {'Concept1': Concept1, 'Concept2': Concept2, 'dAB': dAB, 'dBA': dBA}
).assign(min_dAB_dBA=lambda pairs: pairs[['dAB', 'dBA']].min(axis=1))


4900 4900


In [5]:
# Work on a copy so the long-format pair table `df` is left untouched.
pair_table = df.copy()
print(pair_table.shape)

# Series of min_dAB_dBA indexed by the (Concept1, Concept2) pair.
df2 = pair_table.set_index(['Concept1', 'Concept2'])['min_dAB_dBA']
print(len(df2))

# Pivot the inner index level (Concept2) into columns to get a square matrix.
df3 = df2.unstack()
print(df3.shape)

(4900, 5)
4900
(70, 70)


In [6]:
from sklearn.cluster import AgglomerativeClustering
#n_clust = int(len(nplist)/)
max_cardinality = 8
# Merge clusters until the average-linkage distance exceeds the median of
# min_dAB_dBA taken over all concept pairs.
dist_thresh = num.quantile(df['min_dAB_dBA'], 0.5)

# df3 is passed as a precomputed distance matrix. scikit-learn renamed the
# `affinity` argument to `metric` (deprecated in 1.2, removed in 1.4), so try the
# new keyword first and fall back to the old one on older installations.
_cluster_kwargs = dict(n_clusters=None, distance_threshold=dist_thresh, linkage='average')
try:
    _cluster_model = AgglomerativeClustering(metric='precomputed', **_cluster_kwargs)
except TypeError:
    _cluster_model = AgglomerativeClustering(affinity='precomputed', **_cluster_kwargs)

# One integer cluster label per row of df3.
clustering = _cluster_model.fit_predict(num.array(df3))

In [7]:
# Pair each concept in nplist with the cluster label produced by the clustering step.
# NOTE(review): assumes the rows of the clustered matrix are in the same order as nplist — confirm.
cldf = pd.DataFrame({'Concept':nplist,'Cluster':clustering})

In [8]:
# Number of concepts assigned to each cluster label
cldf.groupby('Cluster')['Concept'].size()

Cluster
0     21
1      2
2     10
3      4
4      6
5      2
6     12
7      1
8      2
9      1
10     1
11     1
12     3
13     2
14     2
Name: Concept, dtype: int64

In [9]:
# List every concept that shares a cluster with the chosen central concept.
centralconcept = 'flashback'
# Cluster label of the first row whose Concept matches the central concept.
central_cluster = cldf.loc[cldf['Concept'] == centralconcept, 'Cluster'].iloc[0]
# All concepts carrying that same cluster label.
same_cluster = cldf['Cluster'] == central_cluster
conceptlist = cldf.loc[same_cluster, 'Concept']
print(list(conceptlist))

['abandonment', 'abandonment depression', 'abuse', 'awareness', 'client', 'contempt', 'decade', 'depression', 'emotional flashback', 'fear', 'feeling', 'flashback', 'home', 'matter', 'mother', 'parent', 'safety', 'sensation', 'shame', 'someone', 'toxic shame']


In [10]:
# Show the concept/cluster table ordered by cluster label
cldf.sort_values(by='Cluster')

Unnamed: 0,Concept,Cluster
0,abandonment,0
65,toxic shame,0
59,someone,0
58,shame,0
55,sensation,0
53,safety,0
49,parent,0
45,mother,0
43,matter,0
29,flashback,0


In [11]:
# Histogram of the 'FA' column of dfConceptimp (120 bins, 50% opacity)
ax = dfConceptimp['FA'].plot.hist(bins=120, alpha=0.5)

In [12]:
# Rows whose 'FA' value is at or above the 99.5th percentile of the column
dfConceptimp[dfConceptimp['FA'] >= num.quantile(dfConceptimp['FA'],0.995)]

Unnamed: 0,Concept1,TFIDFA,FA,SdevA,Amap,AfirstOc,IDFA
76,survivor,0.327011,133,906.538361,19,0.0,8.187577
355,flashback,0.387622,139,806.154125,17,0.0003,9.28619
432,life,0.113586,85,970.437752,12,0.001201,4.449908
570,emotional flashback,0.118703,46,900.393132,13,0.002703,8.593043
637,abandonment,0.128106,54,985.640902,16,0.003303,7.899895
666,therapist,0.161741,58,1063.2036,7,0.003604,9.28619
673,year,0.017677,57,1014.643637,9,0.003604,1.032702
696,cptsd,0.200782,72,977.133751,5,0.004505,9.28619
736,client,0.169768,101,916.494161,10,0.005105,5.59731
933,mother,0.086448,31,1079.302149,6,0.006607,9.28619


In [13]:
# Importance rows for the concepts in the selected cluster, ranked in descending
# order by FA, then SdevA, Amap and IDFA as tie-breakers.
in_cluster = dfConceptimp['Concept1'].isin(conceptlist)
rank_columns = ['FA', 'SdevA', 'Amap', 'IDFA']
dfConceptimp.loc[in_cluster, :].sort_values(by=rank_columns, ascending=[False] * len(rank_columns))

Unnamed: 0,Concept1,TFIDFA,FA,SdevA,Amap,AfirstOc,IDFA
2141,parent,0.247609,178,893.921803,6,0.025526,4.632229
355,flashback,0.387622,139,806.154125,17,0.0003,9.28619
21249,fear,0.172479,125,912.501355,8,0.026126,4.594842
1113,feeling,0.194935,104,1013.344868,4,0.009009,6.241667
736,client,0.169768,101,916.494161,10,0.005105,5.59731
29608,shame,0.188376,73,907.225171,9,0.044444,8.593043
637,abandonment,0.128106,54,985.640902,16,0.003303,7.899895
28515,depression,0.110374,51,809.911326,6,0.036336,7.206748
570,emotional flashback,0.118703,46,900.393132,13,0.002703,8.593043
29473,toxic shame,0.083147,33,983.713258,1,0.043844,8.39031


In [329]:
from nltk.corpus import wordnet as wn
# Look up every WordNet synset for the word 'dog' (no part-of-speech filter)
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [330]:
# Same lookup restricted to verb synsets
wn.synsets('dog', pos=wn.VERB)

[Synset('chase.v.01')]

In [331]:
# Print the definition text of the synset 'dog.n.01'
print(wn.synset('dog.n.01').definition())

a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds


In [361]:
# Definition text of the synset 'arrangement.n.01'
wn.synset('arrangement.n.01').definition()

'the thing arranged or agreed to'