#TextRank

In [1]:
"""
Dependencuas: nltk, numpy, networkx
"""
#!python2
#coding: utf-8-sig

import io
import nltk
#nltk.download()<----Es necesario hacerlo la primera vez
import itertools
from operator import itemgetter
import networkx as nx
import os
import codecs


In [2]:

#apply syntactic filters based on POS tags
def filter_for_tags(tagged, tags=['NN', 'JJ', 'NNP']):
    return [item for item in tagged if item[1] in tags]

def normalize(tagged):
    return [(item[0].replace('.', ''), item[1]) for item in tagged]

def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in itertools.ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

def lDistance(firstString, secondString):
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
    if len(firstString) > len(secondString):
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
        distances = newDistances
    return distances[-1]

def buildGraph(nodes):
    "nodes - list of hashables that represents the nodes of the graph"
    gr = nx.Graph() #initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    #add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        levDistance = lDistance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=levDistance)

    return gr

def extractKeyphrases(text):
    #tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    #assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]
    
    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

   #this will be used to determine adjacent words in order to construct keyphrases with two words

    graph = buildGraph(word_set_list)

    #pageRank - initial value of 1.0, error tolerance of 0,0001, 
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important words in ascending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = len(word_set_list) / 3
    keyphrases = keyphrases[0:aThird+1]

    #take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
    #together
    modifiedKeyphrases = set([])
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith: 
                modifiedKeyphrases.add(firstWord)

            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point    
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)
        
        i = i + 1
        j = j + 1
        
    return modifiedKeyphrases

def extractSentences(text):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important sentences in ascending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)

    return summary

def writeFiles(keyphrases, fileName):
    "outputs the keyphrases and summaries to appropriate files"
    print "Generating output to " + 'keywords/' + fileName
    keyphraseFile = io.open('keywords/' + fileName, 'wb')
    for keyphrase in keyphrases:
        #keyphraseFile.write(codecs.BOM_UTF16_LE)
        keyphraseFile.write(keyphrase.encode("iso-8859-1") + '\n')
    keyphraseFile.close()

    #print "Generating output to " + 'summaries/' + fileName
    #summaryFile = io.open('summaries/' + fileName, 'w', encoding="iso-8859-1")
    #summaryFile.write(summary)
    #summaryFile.close()

    print "-"


In [5]:

#retrieve each of the articles
articles = os.listdir("articles")
alain = os.listdir("keywords")
for article in articles:
    if article not in alain:
        print 'Reading articles/' + article
        articleFile = io.open('articles/' + article, 'r', encoding="iso-8859-1")
        text = articleFile.read()
        keyphrases = extractKeyphrases(text)
        writeFiles(keyphrases, article)


Reading articles/1880603.txt


Exception KeyboardInterrupt in 'zmq.backend.cython.message.Frame.__dealloc__' ignored


KeyboardInterrupt: 

#RAKE

In [5]:
# Implementation of RAKE - Rapid Automtic Keyword Exraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). 
# Automatic keyword extraction from indi-vidual documents. 
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.

import re
import operator
import codecs
debug = False
test = True


def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that are have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = r'\b' + word + r'(?![\w-])'  # added look ahead for hyphen
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


def generate_candidate_keywords(sentence_list, stopword_pattern):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "":
                phrase_list.append(phrase)
    return phrase_list


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        #if word_list_degree > 3: word_list_degree = 3 #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  #orig.
            #word_degree[word] += 1/(word_list_length*1.0) #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate Word scores = deg(w)/frew(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  #orig.
    #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score):
    keyword_candidates = {}
    for phrase in phrase_list:
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path):
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        sentence_list = split_sentences(text)

        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)

        sorted_keywords = sorted(keyword_candidates.iteritems(), key=operator.itemgetter(1), reverse=True)
        return sorted_keywords


if test:
    text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

    # Split text into sentences
    sentenceList = split_sentences(text)
    #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
    stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug: print keywordcandidates

    sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
    if debug: print sortedKeywords

    totalKeywords = len(sortedKeywords)
    if debug: print totalKeywords
    #print sortedKeywords[0:(totalKeywords / 3)]

    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    #print keywords
    for i in keywords:
        print i[0]


minimal generating sets
linear diophantine equations
minimal supporting set
minimal set
linear constraints
upper bounds
natural numbers
nonstrict inequations
strict inequations
mixed types
considered types
set
types
considered
constructing
solutions
solving
system
compatibility
systems
criteria
construction
algorithms
components


In [9]:
articles = os.listdir("articles")
for article in articles:
    print 'Reading articles/' + article
    articleFile = io.open('articles/' + article, 'r', encoding="iso-8859-1")
    text = articleFile.read()
   # Split text into sentences
    sentenceList = split_sentences(text)
    #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
    stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug: print keywordcandidates

    sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
    if debug: print sortedKeywords

    totalKeywords = len(sortedKeywords)
    if debug: print totalKeywords
    #print sortedKeywords[0:(totalKeywords / 3)]

    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    #print keywords
    
    "outputs the keyphrases and summaries to appropriate files"
    print "Generating output to " + 'keywords/' + article
    keyphraseFile = io.open('keywordsRAKE/' + article, 'wb')
    for keywords in keywords:
        #keyphraseFile.write(codecs.BOM_UTF16_LE)
        keyphraseFile.write(keywords[0].encode("iso-8859-1") + '\n')
    keyphraseFile.close()



Reading articles/100166.txt
Generating output to keywords/100166.txt
Reading articles/101.txt
Generating output to keywords/101.txt
Reading articles/1016800.txt
Generating output to keywords/1016800.txt
Reading articles/101973.txt
Generating output to keywords/101973.txt
Reading articles/1036681.txt
Generating output to keywords/1036681.txt
Reading articles/106364.txt
Generating output to keywords/106364.txt
Reading articles/1067802.txt
Generating output to keywords/1067802.txt
Reading articles/1097231.txt
Generating output to keywords/1097231.txt
Reading articles/1116998.txt
Generating output to keywords/1116998.txt
Reading articles/112878.txt
Generating output to keywords/112878.txt
Reading articles/1133633.txt
Generating output to keywords/1133633.txt
Reading articles/1137519.txt
Generating output to keywords/1137519.txt
Reading articles/114199.txt
Generating output to keywords/114199.txt
Reading articles/1144477.txt
Generating output to keywords/1144477.txt
Reading articles/1159615

# newspaper

In [11]:
from newspaper import Article, fulltext

In [21]:
articles = os.listdir("articles")
for article in articles:
    print 'Reading articles/' + article
    print article
    
    a = Article("https://github.com/samorogu/mineriaTextos/blob/master/CorpusTaggersFInal/documents/"+article)
    try:
        a.download()
        a.parse()
        a.nlp()
        authors = a.authors
        keywords = a.keywords
        "outputs the keyphrases and summaries to appropriate files"
        print "Generating output to " + 'keywords/' + article
        keyphraseFile = io.open('keywords_newspaper/' + article, 'wb')
        for keywords in keywords:
            #keyphraseFile.write(codecs.BOM_UTF16_LE)
            keyphraseFile.write(keywords.encode("iso-8859-1") + '\n')
        keyphraseFile.close()

    except:
        print "chin"


Reading articles/100166.txt
100166.txt
Generating output to keywords/100166.txt
Reading articles/101.txt
101.txt
Generating output to keywords/101.txt
Reading articles/1016800.txt
1016800.txt
Generating output to keywords/1016800.txt
Reading articles/101973.txt
101973.txt
Generating output to keywords/101973.txt
Reading articles/1036681.txt
1036681.txt
Generating output to keywords/1036681.txt
Reading articles/106364.txt
106364.txt
Generating output to keywords/106364.txt
Reading articles/1067802.txt
1067802.txt
Generating output to keywords/1067802.txt
Reading articles/1097231.txt
1097231.txt
Generating output to keywords/1097231.txt
Reading articles/1116998.txt
1116998.txt
Generating output to keywords/1116998.txt
Reading articles/112878.txt
112878.txt
Generating output to keywords/112878.txt
Reading articles/1133633.txt
1133633.txt
Generating output to keywords/1133633.txt
Reading articles/1137519.txt
1137519.txt
Generating output to keywords/1137519.txt
Reading articles/114199.txt


# Comparativa

In [93]:
keywordsRAKE = os.listdir("keywordsRAKE")
keywordsTR = os.listdir("keywords")
keywords_newspaper = os.listdir("keywords_newspaper")
tags=os.listdir("tag")
rake=0
textrank=0
newspap=0
for keyword in keywordsTR:
    tags = io.open('tag/' + keyword[:-3]+'tags', 'r', encoding="iso-8859-1")
    TR = io.open('keywords/' + keyword, 'r', encoding="iso-8859-1")
    news = io.open('keywords_newspaper/' + keyword, 'r', encoding="iso-8859-1")
    RAKE = io.open('keywordsRAKE/' + keyword, 'r', encoding="iso-8859-1")

    TAGS = tags.read()
    for i in xrange(0,5):
        #text_newspaper=news.readline()
        text_rank=TR.readline()
        text_rake=RAKE.readline()
        text_newspaper=news.readline()

        #print text_newspaper
        #print "------------------------------------------"
        #print text
      
        if TAGS.find(text_rank): 
            #print "---->newspaper"+"---->"+linea
            textrank=textrank+1
        if TAGS.find(text_rake): 
            #print "---->newspaper"+"---->"+linea
            rake=rake+1  
        if TAGS.find(text_newspaper): 
            newspap=newspap+1       
print newspap/(5.0*179) 
print textrank/(5.0*179) 
print rake/(5.0*179) 

0.972067039106
1.0
0.997765363128
