# Concept extraction from text
  
    

## 1. Loading text file into string 

### Option 1. Downloading a Wikipedia article's text

In [25]:
from bs4 import BeautifulSoup
import requests
# Article whose paragraph text is used as input for concept extraction.
url = 'https://en.wikipedia.org/wiki/Star'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'lxml')

# Only <p> elements are collected, so headings ('h2','h3') and list items ('li') are skipped.
text_set = soup.find_all(['p'])
text_list = [p1.get_text() for p1 in text_set]
tags_list = [p1.name for p1 in text_set]

# Concatenate the paragraphs into one string for the rest of the pipeline.
rawtxt = ''.join(text_list)

print("length of material")
print(len(rawtxt))

print("Sample of text")
print(rawtxt[0:500])

length of material
59186
Sample of text


A star is an astronomical object consisting of a luminous spheroid of plasma held together by its own gravity. The nearest star to Earth is the Sun. Many other stars are visible to the naked eye from Earth during the night, appearing as a multitude of fixed luminous points in the sky due to their immense distance from Earth. Historically, the most prominent stars were grouped into constellations and asterisms, the brightest of which gained proper names. Astronomers have assembled star catalogu


#### Save rawtxt as is for later:

In [26]:
# Persist the downloaded text so it can be reloaded from disk later.
filename = 'starwiki.txt'
# NOTE(review): absolute, machine-specific directory - adjust to the local environment.
path_name = "C:/Users/Arati/Documents/personal docs/python_introduction_course/textdata/"
# Mode "w" rather than "a": re-running this cell must overwrite the file instead of
# appending another copy of the article to it. The with-block closes the file on
# exit, so no explicit close() call is needed.
with open(path_name + filename,"w",encoding="utf-8") as myfile:
    myfile.write(rawtxt)

### Option 2. Getting file from disk:

In [13]:
# Alternative input: read a text file from disk.
# Running this cell replaces both `filename` and `rawtxt` set by Option 1.
filename = 'Cognitive_Load_Theory.txt'
# NOTE(review): absolute, machine-specific directory - adjust to the local environment.
path_name = "C:/Users/Arati/Documents/personal docs/python_introduction_course/textdata/"
# The with-block closes the file on exit, so no explicit close() call is needed.
with open(path_name + filename, "r", encoding="utf-8") as myfile:
    rawtxt = myfile.read()
#rawtxt = rawtxt.encode('ascii','ignore')

## Extracting list of concepts:

### Importing libraries and modules

In [27]:
from nltk import word_tokenize
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
import re
from nltk.corpus import stopwords 
# English stop words as a set; lemmatize_by_pos() returns these tokens without lemmatizing them.
stop_words = set(stopwords.words('english'))
import nltk
# WordNet lemmatizer used by lemmatize_by_pos().
wnl = nltk.WordNetLemmatizer()
# The two stemmers are instantiated here but are not referenced by the cells shown below.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

### Sentence splitting

In [28]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
# Train a Punkt model on the text itself, then use the learned parameters
# to split that same text into sentences.
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(rawtxt)

punkt_params = trainer.get_params()
tokenizer = PunktSentenceTokenizer(punkt_params)
sents = tokenizer.tokenize(rawtxt)

sentence_count = len(sents)
print("Number of sentences in text " + str(sentence_count))
print(sentence_count)

print("Sample of sentences:")
print(sents[:5])

Number of sentences in text 423
423
Sample of sentences:
['\n\nA star is an astronomical object consisting of a luminous spheroid of plasma held together by its own gravity.', 'The nearest star to Earth is the Sun.', 'Many other stars are visible to the naked eye from Earth during the night, appearing as a multitude of fixed luminous points in the sky due to their immense distance from Earth.', 'Historically, the most prominent stars were grouped into constellations and asterisms, the brightest of which gained proper names.', 'Astronomers have assembled star catalogues that identify the known stars and provide standardized stellar designations.']


### Token handling functions: 
1. validchar(wrd): checks if token consists only of alphanumeric characters, underscores and hyphens
2. lemmatize_by_pos(tag) lemmatizes token by part of speech
3. chunk_this(grammar_rule_key,sentence_tags) chunks a particular grammar rule key (see chunkrules)
4. eqn_label: extracts equation terms and replaces all occurrences in text with a text key, which is then treated as a noun phrase. Also updates the equation dictionary
5. display_equation (displays equation term by key)
6. chunker: chunks each sentence by each chunking rule

In [29]:
def validchar(wrd):
    """Return 1 if ``wrd`` holds only ASCII letters, digits, underscores and hyphens, else 0.

    An empty string contains no disallowed character and therefore yields 1.
    """
    has_invalid_char = re.search(r'[^0-9a-zA-Z_-]', wrd) is not None
    return 0 if has_invalid_char else 1

def lemmatize_by_pos(tag):
    """Lemmatize one ``(token, pos)`` pair according to its part-of-speech tag.

    The token is lowercased first. Stop words, and tokens whose tag starts with
    none of J / N / R / V, are returned lowercased but otherwise unchanged.

    Returns:
        A ``(lemma, pos)`` tuple carrying the original tag.
    """
    token, pos = tag[0].lower(), tag[1]
    if token in stop_words:
        return (token, pos)

    # Tag prefix -> pos code handed to wnl.lemmatize():
    # J = adjective form, N = noun form, R = adverb, V = verb.
    prefix_to_wordnet = (('J', 's'), ('N', 'n'), ('R', 'r'), ('V', 'v'))
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos.startswith(prefix):
            return (wnl.lemmatize(token, wordnet_pos), pos)
    return (token, pos)

# Registry shared by every eqn_label() call: equation text -> placeholder label.
eqn_dict = {}
# Number that the next newly registered equation will receive.
eqn_count = 1

def eqn_label(tokens):
    """Replace equation-like tokens with placeholder labels, in place.

    A token counts as an equation when it is longer than one character, is not
    purely alphanumeric, and contains at least one of ``[ ] { } + * ^ = _ % $``.
    Each distinct equation is registered once in ``eqn_dict`` under the label
    ``'equation<N>'``; every occurrence, including the first, is replaced by
    that label.

    Args:
        tokens: list of token strings; modified in place.

    Returns:
        The same ``tokens`` list, with equations replaced by their labels.
    """
    global eqn_count
    for idx, wrd in enumerate(tokens):
        is_equation = (len(wrd) > 1
                       and not wrd.isalnum()
                       and re.search(r'[\[\]\{\}\+*^=_%$]', wrd))
        if not is_equation:
            continue
        if wrd not in eqn_dict:
            # First sighting: register the equation under a fresh label.
            eqn_dict[wrd] = ''.join(['equation', str(eqn_count)])
            eqn_count = eqn_count + 1
        # Replace by position so that repeated equations are all substituted.
        tokens[idx] = eqn_dict[wrd]
    return tokens

# Inverse of eqn_dict (placeholder label -> equation text). It starts empty and is
# refreshed on every display_equation() call, because eqn_dict is only filled
# later, while sentences are being processed.
inv_eqn_dict = {}

def display_equation(reptokens, label_to_eqn=None):
    """Replace placeholder labels in ``reptokens`` with the original equation text, in place.

    Args:
        reptokens: list of token strings; modified in place.
        label_to_eqn: optional mapping from label to equation text. When omitted,
            the mapping is rebuilt from the current contents of the module-level
            ``eqn_dict`` (and stored in ``inv_eqn_dict``).

    Returns:
        The same ``reptokens`` list, with known labels replaced.
    """
    if label_to_eqn is None:
        # Rebuild at call time: a snapshot taken at definition time would be empty.
        inv_eqn_dict.clear()
        inv_eqn_dict.update((label, eqn) for eqn, label in eqn_dict.items())
        label_to_eqn = inv_eqn_dict
    for idx, wrd in enumerate(reptokens):
        if wrd in label_to_eqn:
            reptokens[idx] = label_to_eqn[wrd]
    return reptokens

Setting up chunking rules:

Chunking done in batches to enable overlapping tokens to be extracted. 

In [30]:
# Chunk grammars keyed by chunk label; each value is a grammar string that
# chunk_this() hands to nltk.RegexpParser. Add further rules as extra keys.
# JJNP: optional RB* tag, optional J* tag, then one or more NN* tags.
chunkrules = {
    'JJNP': r"""    
    JJNP: {<RB.*>?<J.*>?<NN.*>{1,}}       
""",
}
## Examples: "reusable contactless stored value smart card"

def chunk_this(grammar_rule_key,sentence_tags):
    """Extract the phrases matched by one chunk rule from a tagged sentence.

    Args:
        grammar_rule_key: key into ``chunkrules``; also the chunk label to collect.
        sentence_tags: list of ``(token, pos)`` tuples for one sentence.

    Returns:
        List of distinct lowercased phrases in order of first appearance. Tokens
        rejected by ``validchar`` are left out of a phrase, and phrases that end
        up empty are dropped.
    """
    parser = nltk.RegexpParser(chunkrules[grammar_rule_key])
    parsed = parser.parse(sentence_tags)

    seen = set()
    phrases = []
    for node in parsed:
        # Unchunked leaves are (token, pos) tuples; anything else is a chunk subtree.
        if isinstance(node, tuple) or node.label() != grammar_rule_key:
            continue
        phrase = ' '.join(leaf[0] for leaf in node if validchar(leaf[0]) == 1)
        # Lowercase before de-duplicating so case variants collapse into one entry;
        # a list plus a seen-set keeps the order stable across kernel sessions.
        phrase = phrase.lower()
        if phrase and phrase not in seen:
            seen.add(phrase)
            phrases.append(phrase)
    return phrases

def chunker(sentence_tags):
    """Apply every rule in ``chunkrules`` to one tagged sentence.

    Returns:
        A list with one entry per rule key, in ``chunkrules`` iteration order;
        each entry is the phrase list produced by ``chunk_this`` for that rule.
    """
    per_rule_phrases = []
    for rule_key in chunkrules:
        per_rule_phrases.append(chunk_this(rule_key, sentence_tags))
    return per_rule_phrases


Process each sentence:

In [31]:
%%time 
# Per-sentence results, all keyed by the sentence's index in `sents`:
#   sent_to_tags  - POS-tagged tokens (equations already replaced by labels)
#   sent_to_ltags - the same tags after lemmatization
#   sent_to_np    - chunked phrases, one list per chunk rule
sent_to_np, sent_to_ltags, sent_to_tags = {}, {}, {}

for i, sentence in enumerate(sents):
    tokens = word_tokenize(sentence)
    reptokens = eqn_label(tokens)
    tags = nltk.pos_tag(reptokens)
    lemmatags = list(map(lemmatize_by_pos, tags))
    sent_to_tags[i] = tags
    sent_to_ltags[i] = lemmatags
    sent_to_np[i] = chunker(lemmatags)

Wall time: 1.22 s


In [32]:
# Chunks extracted from the first sentence: one inner list per rule in chunkrules.
sent_to_np[0]

[['luminous spheroid',
  'astronomical object consisting',
  'star',
  'own gravity']]

In [33]:
# Merge each sentence's per-rule chunk lists into one de-duplicated list
# (sent_to_npflat) and build the reverse index from phrase to the sentence
# numbers that contain it (np_to_sent).
import itertools
sent_to_npflat = {}
np_to_sent = {}
for key, rule_chunks in sent_to_np.items():
    sent_to_npflat[key] = list(set(itertools.chain.from_iterable(rule_chunks)))
    for np in sent_to_npflat[key]:
        np_to_sent.setdefault(np, []).append(key)

### Create dataframe with some metrics:
- Concept: concept phrase
- Occurence: list of sentences in which the phrase occurs
- Frequency: number of sentences in which the phrase occurs
- Mean: average of sentence numbers in the text in which the phrase occurs normalized to number of sentences
- Median: median of sentence numbers in the text in which the phrase occurs, normalized to number of sentences. Lets us know whether the phrase occurs mostly at the beginning of the text or towards the end, and can indicate how central the phrase is to the text.
- Sdev: standard deviation of the sentence numbers in which the phrase occurs, normalized to number of sentences (indicates the dispersion of the phrase in the text)

In [34]:
import numpy as num
import pandas as pd
# One row per concept phrase. Mean, Median and Sdev are computed on the sentence
# numbers in which the phrase occurs and divided by the total sentence count.
Concept = pd.Series(list(np_to_sent.keys()))
Occurence = pd.Series([num.array(sent_ids) for sent_ids in np_to_sent.values()])
Frequency = pd.Series([len(o) for o in Occurence])
Mean, Median, Sdev = (
    pd.Series([stat(o) for o in Occurence]) / len(sents)
    for stat in (num.mean, num.median, num.std)
)
Conceptdata = pd.DataFrame({
    'Concept': Concept,
    'Occurence': Occurence,
    'Frequency': Frequency,
    'Mean': Mean,
    'Median': Median,
    'Sdev': Sdev,
})

In [35]:
# Show the 20 concepts that occur in the largest number of sentences.
Conceptdata.sort_values(by='Frequency',ascending=False).head(20)

Unnamed: 0,Concept,Occurence,Frequency,Mean,Median,Sdev
2,star,"[0, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 19,...",173,0.470859,0.498818,0.282697
5,sun,"[1, 17, 27, 28, 29, 46, 47, 57, 91, 104, 107, ...",45,0.538167,0.56974,0.291298
32,core,"[6, 15, 17, 18, 20, 115, 122, 136, 138, 159, 1...",35,0.545424,0.51773,0.304799
47,mass,"[9, 17, 19, 65, 104, 106, 118, 149, 154, 162, ...",26,0.535279,0.570922,0.30578
82,time,"[17, 50, 145, 156, 158, 161, 165, 183, 240, 26...",25,0.613995,0.64539,0.247549
44,luminosity,"[9, 11, 49, 104, 106, 117, 128, 138, 139, 222,...",22,0.555233,0.741135,0.307297
61,temperature,"[10, 11, 69, 121, 138, 152, 162, 164, 171, 293...",20,0.55792,0.693853,0.291179
31,helium,"[6, 7, 13, 14, 75, 115, 136, 138, 151, 154, 15...",20,0.413712,0.365248,0.311947
346,year,"[83, 85, 86, 124, 139, 142, 143, 146, 150, 153...",18,0.417126,0.358156,0.167897
4,earth,"[1, 2, 5, 29, 79, 84, 107, 180, 214, 234, 235,...",17,0.441663,0.50591,0.309461


### Save as csv:

In [11]:
import os

# splitext() removes whatever extension `filename` has; a fixed filename[0:-4]
# slice mangles names whose extension is not exactly three characters long.
Conceptdata.to_csv(os.path.splitext(filename)[0]+'.csv',sep=',')

### Save dictionaries and dataframe to pickle file

In [24]:
import os
import pickle

# Bundle the intermediate results and the summary dataframe into one object.
concepts = {'sents':sents,'rawtxt':rawtxt,'sent_to_npflat':sent_to_npflat,'sent_to_tags':sent_to_tags,'sent_to_ltags':sent_to_ltags,'np_to_sent':np_to_sent,'Conceptdata':Conceptdata}
# splitext() removes whatever extension `filename` has (a fixed [0:-4] slice only
# works for three-character extensions). The with-block closes the file on exit,
# so no explicit close() call is needed.
with open(os.path.splitext(filename)[0]+'concepts.pickle', 'wb') as f:
    pickle.dump(concepts, f)