# Concept extraction from text
  
    

## 1. Loading text file into string 

### Option 1. Downloading a Wikipedia article's text

In [None]:
from bs4 import BeautifulSoup
import requests

# Article to download; change this URL to extract concepts from another page.
url = 'https://en.wikipedia.org/wiki/Facet'

# Fail fast instead of silently parsing an error page or hanging forever:
# the timeout bounds the wait and raise_for_status() raises on 4xx/5xx replies.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# Only paragraph tags are collected. This skips headings ('h2', 'h3') and
# lists that are made as links ('li').
text_set = soup.find_all(['p'])
text_list = [p1.get_text() for p1 in text_set]
tags_list = [p1.name for p1 in text_set]

# Concatenate the paragraphs into the single string used by the later cells.
rawtxt = ''.join(text_list)

print("length of material")
print(len(rawtxt))

print("Sample of text")
print(rawtxt[0:500])

#### Save rawtxt as is for later:

In [None]:
filename = 'facetwiki.txt'
# NOTE(review): machine-specific absolute path; point it at your own data folder.
path_name = "C:/Users/Arati/Documents/personal docs/python_introduction_course/textdata/"
# Write mode ("w") replaces any previous copy, so re-running this cell does not
# append a second copy of the article to the same file.
with open(path_name + filename, "w", encoding="utf-8") as myfile:
    myfile.write(rawtxt)
# The with-block has already closed the file; no explicit close() is needed.

### Option 2. Getting file from disk:

In [1]:
filename = 'Cognitive_Load_Theory.txt'
path_name = "C:/Users/Arati/Documents/personal docs/python_introduction_course/textdata/"
# Read the whole document into one string. The context manager closes the
# handle on exit, so no explicit close() call is required.
with open(path_name + filename, mode="r", encoding="utf-8") as myfile:
    rawtxt = myfile.read()
# Optional step, left disabled: drop every non-ASCII character.
#rawtxt = rawtxt.encode('ascii','ignore')

## Extracting list of concepts:

### Importing libraries and modules

In [2]:
# NLTK tokenizer, chunking helpers and tree type used by the cells below.
# NOTE(review): the three star imports hide where names come from; the visible
# code only reaches NLTK through `nltk.` and the explicit imports - confirm
# before narrowing them.
from nltk import word_tokenize
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
import re
from nltk.corpus import stopwords 
# English stop words; lemmatize_by_pos returns these tokens without lemmatizing.
stop_words = set(stopwords.words('english'))
import nltk
# WordNet lemmatizer used by lemmatize_by_pos.
wnl = nltk.WordNetLemmatizer()
# Stemmers are instantiated here but not referenced in the visible cells.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

### Sentence splitting

In [3]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# Train the Punkt sentence-boundary model on the document itself, then use the
# learned parameters to split that same document into sentences.
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(rawtxt)

punkt_params = trainer.get_params()
tokenizer = PunktSentenceTokenizer(punkt_params)
sents = tokenizer.tokenize(rawtxt)

print("Number of sentences in text {}".format(len(sents)))
print(len(sents))

print("Sample of sentences:")
print(sents[:5])

Number of sentences in text 4484
4484
Sample of sentences:
['Explorations in the Learning Sciences, Instructional Systems and Performance Technologies\n\nPreface\n\nWithout knowledge of human cognitive processes, instructional design is blind.', 'In\xa0the absence of an appropriate framework to suggest instructional techniques, we\nare likely to have difficulty explaining why instructional procedures do or do not\nwork.', 'Lacking knowledge of human cognition, we would be left with no overarching\nstructure linking disparate instructional processes and guiding procedures.', 'Unless\nwe can appeal to the manner in which human cognitive structures are organised,\nknown as human cognitive architecture, a rational justification for \xadrecommending\none instructional procedure over another is unlikely to be available.', 'At best, we\nwould be restricted to using narrow, empirical grounds indicating that particular\nprocedures seem to work.']


### Token handling functions: 
1. validchar(wrd): checks if token is a valid alphanumeric+hyphens word
2. lemmatize_by_pos(tag) lemmatizes token by part of speech
3. chunk_this(grammar_rule_key,sentence_tags) chunks a particular grammar rule key (see chunkrules)
4. eqn_label: extracts equation terms and replaces all occurrences in the text with a text key, which is then treated as a noun phrase. Also updates the equation dictionary
5. display_equation (displays equation term by key)
6. chunker: chunks each sentence by each chunking rule

In [4]:
def validchar(wrd):
    """Report whether ``wrd`` is built only from letters, digits, '_' and '-'.

    Returns 1 when no other character occurs in ``wrd`` (so the empty string
    also yields 1) and 0 as soon as one such character is present.
    """
    has_invalid_char = re.search(r'[^0-9a-zA-Z_-]', wrd) is not None
    return 0 if has_invalid_char else 1

def lemmatize_by_pos(tag):
    """Lower-case and lemmatize one ``(token, pos)`` pair.

    ``tag[0]`` is the token and ``tag[1]`` its part-of-speech tag. Stop words
    are returned lower-cased but otherwise untouched. For other tokens the
    first letter of the tag selects the part of speech handed to the WordNet
    lemmatizer; tokens whose tag starts with any other letter are only
    lower-cased.

    Returns a ``(lemma, pos)`` tuple carrying the original tag.
    """
    word = tag[0].lower()
    pos = tag[1]
    if word in stop_words:
        return (word, pos)

    # First letter of the tag -> part-of-speech code passed to wnl.lemmatize:
    # adjective, noun, adverb, verb.
    wordnet_code = {'J': 's', 'N': 'n', 'R': 'r', 'V': 'v'}.get(pos[:1])
    if wordnet_code is None:
        return (word, pos)
    return (wnl.lemmatize(word, wordnet_code), pos)

# Shared registry of the equation-like tokens seen so far.
#   eqn_dict     : equation text -> placeholder label ('equation1', 'equation2', ...)
#   inv_eqn_dict : placeholder label -> equation text (kept in sync by eqn_label)
#   eqn_count    : number that the next new placeholder label will receive
eqn_dict = {}
inv_eqn_dict = {}
eqn_count = 1


def eqn_label(tokens):
    """Replace every equation-like token in ``tokens`` with a placeholder label.

    A token counts as an equation when it is longer than one character, is not
    purely alphanumeric, and contains at least one of ``[ ] { } + * ^ = _ % $``.
    Each distinct equation gets one label ('equation<N>') that is recorded in
    ``eqn_dict`` / ``inv_eqn_dict`` and reused on later calls.

    ``tokens`` is modified in place and also returned.
    """
    global eqn_count
    for position, wrd in enumerate(tokens):
        looks_like_equation = (
            len(wrd) > 1
            and not wrd.isalnum()
            and re.search(r'[\[\]\{\}\+*^=_%$]', wrd) is not None
        )
        if not looks_like_equation:
            continue
        if wrd not in eqn_dict:
            # First sighting: register the equation under a fresh label.
            label = ''.join(['equation', str(eqn_count)])
            eqn_dict[wrd] = label
            inv_eqn_dict[label] = wrd
            eqn_count = eqn_count + 1
        # Replace this occurrence, including the very first one, by position
        # so that repeated equations in one sentence are all handled.
        tokens[position] = eqn_dict[wrd]
    return tokens


def display_equation(reptokens):
    """Turn placeholder labels in ``reptokens`` back into their equation text.

    Tokens that are not known labels are left untouched. ``reptokens`` is
    modified in place and also returned.
    """
    for position, wrd in enumerate(reptokens):
        if wrd in inv_eqn_dict:
            reptokens[position] = inv_eqn_dict[wrd]
    return reptokens

Setting up chunking rules:

Chunking is done in batches, one pass per rule, so that overlapping tokens can be extracted.

In [5]:
# Maps a chunk label to the grammar string handed to nltk.RegexpParser in chunk_this.
chunkrules = {}

# Define chunking rules here:
# JJNP = an optional adverb tag (RB*), an optional adjective tag (J*),
# followed by one or more noun tags (NN*).
chunkrules['JJNP'] = r"""    
    JJNP: {<RB.*>?<J.*>?<NN.*>{1,}}       
"""
## Examples: "reusable contactless stored value smart card"

def chunk_this(grammar_rule_key,sentence_tags):
    """Extract the phrases that one chunking rule finds in a tagged sentence.

    Parameters:
        grammar_rule_key: key into ``chunkrules``; also the chunk label to keep.
        sentence_tags: list of ``(token, pos)`` tuples for a single sentence.

    Returns:
        A list of distinct, lower-cased, space-joined phrases in order of first
        appearance. Tokens rejected by ``validchar`` are left out of a phrase,
        and phrases that end up empty are dropped.
    """
    cp = nltk.RegexpParser(chunkrules[grammar_rule_key])
    parsed = cp.parse(sentence_tags)
    phrases = []
    for node in parsed:
        # Tokens outside any chunk stay plain (token, pos) tuples; chunks are subtrees.
        if isinstance(node, tuple) or node.label() != grammar_rule_key:
            continue
        words = [leaf[0] for leaf in node if validchar(leaf[0]) == 1]
        # Lower-case before de-duplicating so "Memory" and "memory" collapse.
        phrase = ' '.join(words).lower()
        if phrase:
            phrases.append(phrase)
    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # makes the result reproducible between runs (set() ordering is not).
    return list(dict.fromkeys(phrases))

def chunker(sentence_tags):
    """Apply every rule in ``chunkrules`` to one tagged sentence.

    Returns a list holding one ``chunk_this`` result per rule key, in the
    iteration order of ``chunkrules``.
    """
    per_rule_chunks = []
    for rule_key in chunkrules:
        per_rule_chunks.append(chunk_this(rule_key, sentence_tags))
    return per_rule_chunks


Process each sentence:

In [6]:
%%time 
sent_to_np = {}
sent_to_ltags = {}
sent_to_tags = {}

for i in range(len(sents)):
    tokens = word_tokenize(sents[i])
    reptokens = eqn_label(tokens)
    tags = nltk.pos_tag(reptokens)
    lemmatags = [lemmatize_by_pos(t) for t in tags]
    sent_to_np[i] = chunker(lemmatags)
    sent_to_ltags[i] = lemmatags
    sent_to_tags[i] = tags

Wall time: 14.9 s


In [7]:
# Inspect the chunks extracted from the first sentence (one inner list per chunking rule).
sent_to_np[0]

[['learning science',
  'instructional design',
  'instructional system',
  'exploration',
  'cognitive process',
  'performance technology preface without knowledge']]

In [8]:
# Flatten the per-rule chunk lists of every sentence into one list of distinct
# phrases (sent_to_npflat), and build the inverse index from each phrase to the
# sentence numbers in which it occurs (np_to_sent).
import itertools
sent_to_npflat = {}
np_to_sent = {}
for key in sent_to_np:
    # dict.fromkeys removes duplicates but, unlike set(), keeps first-seen order,
    # so phrase order (and the row order of tables built from np_to_sent) is the
    # same on every run.
    sent_to_npflat[key] = list(dict.fromkeys(itertools.chain.from_iterable(sent_to_np[key])))
    # The loop variable is not called `np`, which would shadow the usual numpy alias.
    for phrase in sent_to_npflat[key]:
        np_to_sent.setdefault(phrase, []).append(key)

### Create dataframe with some metrics:
- Concept: concept phrase
- Occurence: list of sentences in which the phrase occurs
- Frequency: number of sentences in which the phrase occurs
- Mean: average of sentence numbers in the text in which the phrase occurs normalized to number of sentences
- Median: median of the sentence numbers in which the phrase occurs, normalized to the number of sentences. It shows whether the phrase occurs mostly at the beginning of the text or towards the end, and can indicate how central the phrase is to the text.
- Sdev: standard deviation of the sentences in which the phrase occurs (indicates the dispersion of the phrase in the text)

In [9]:
import numpy as num
import pandas as pd

# One row per concept phrase. The position statistics are divided by the total
# number of sentences, so they are fractions of the document length.
n_sents = len(sents)
Concept = pd.Series(list(np_to_sent.keys()))
Occurence = pd.Series([num.array(sentence_ids) for sentence_ids in np_to_sent.values()])
Frequency = pd.Series([len(o) for o in Occurence])
Mean = pd.Series([num.mean(o) for o in Occurence]) / n_sents
Median = pd.Series([num.median(o) for o in Occurence]) / n_sents
Sdev = pd.Series([num.std(o) for o in Occurence]) / n_sents
Conceptdata = pd.DataFrame({
    'Concept': Concept,
    'Occurence': Occurence,
    'Frequency': Frequency,
    'Mean': Mean,
    'Median': Median,
    'Sdev': Sdev,
})

In [10]:
# Show the 20 phrases that occur in the largest number of sentences.
Conceptdata.sort_values(by='Frequency',ascending=False).head(20)

Unnamed: 0,Concept,Occurence,Frequency,Mean,Median,Sdev
53,cognitive load,"[15, 46, 47, 48, 133, 975, 979, 1150, 1169, 11...",433,0.572308,0.537244,0.253204
143,learner,"[46, 174, 191, 201, 277, 278, 279, 282, 285, 2...",420,0.647316,0.695027,0.229364
65,information,"[17, 18, 19, 25, 27, 40, 128, 133, 147, 148, 1...",407,0.455098,0.485281,0.289363
106,example,"[29, 58, 91, 94, 97, 99, 124, 129, 145, 151, 1...",276,0.555967,0.549175,0.256705
132,memory,"[40, 41, 44, 46, 436, 507, 863, 864, 866, 880,...",226,0.469096,0.296945,0.28829
203,student,"[69, 189, 307, 621, 676, 1217, 1293, 1295, 129...",211,0.605917,0.579393,0.213465
30,problem,"[6, 33, 212, 219, 227, 228, 232, 239, 256, 278...",181,0.511788,0.460303,0.267847
308,sweller,"[107, 201, 207, 241, 277, 282, 293, 486, 527, ...",173,0.55164,0.51851,0.217999
64,chapter,"[17, 20, 35, 36, 47, 48, 49, 54, 55, 56, 89, 1...",166,0.542654,0.593555,0.284864
131,long-term memory,"[40, 41, 43, 149, 329, 384, 387, 389, 391, 393...",163,0.330377,0.23372,0.28735


### Save as csv:

In [11]:
# Name the CSV after the input file: its last four characters (".txt") are
# replaced by ".csv".
csv_name = filename[:-4] + '.csv'
Conceptdata.to_csv(csv_name, sep=',')

### Save dictionaries and dataframe to pickle file

In [12]:
import pickle

# Bundle the lookup dictionaries and the metrics table into one object.
concepts = {
    'sent_to_npflat': sent_to_npflat,
    'sent_to_tags': sent_to_tags,
    'sent_to_ltags': sent_to_ltags,
    'np_to_sent': np_to_sent,
    'Conceptdata': Conceptdata,
}
# The file is named after the input text file with its last four characters
# replaced. The with-block closes it, so no explicit close() is needed.
pickle_name = filename[0:-4] + 'concepts.pickle'
with open(pickle_name, 'wb') as f:
    pickle.dump(concepts, f)