# TextRank Model

In [32]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import os
nltk.download("stopwords")
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
# TextRank

def read_article_textrank(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        #print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences


def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

def generate_summary(file_name, top_n=5):
    
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article_textrank(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    #print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize text
    print("Summarized Text: \n", ". ".join(summarize_text))
    
# let's begin
# Change the integer as per required length of summary
for i in range(10):
    print( "link"+ str(i+1))
    generate_summary("../input/link"+ str(i+1) +".txt", 3)
    print("\n")
#print(os.listdir('../input/link1.txt'))

link1
Summarized Text: 
 In a move to accelerate their perfume development, 124-year-old Swiss flavor and fragrance giant Givaudan launched an AI program, Carto, which optimizes their production by making perfume recommendations based on a chart that details the individual properties of different fragrances. AI PERFECTING THE ART OF PERFUME MAKING. In a fascinating development in the field, Artificial Intelligence has entered the business of perfume making and soon robots may redefine the perfume industry to emerge as the noses of the future


link2
Summarized Text: 
 Instead of scrambling from one cheap labor nation to another for cutting down the manufacturing cost, apparel companies can now invest in sewing robots for cost-efficient manufacturing. This could encourage a trend of onshoring in apparel production instead of companies resorting to cheap labor nations for cutting down the manufacturing cost. has come up with yet another innovative solution for the complete automation of 

# TF-IDF Model

In [34]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [41]:
def read_article_tfidf(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    #for sentence in article:
        #print(sentence)  
    return article
    #sentences = []

    #for sentence in article:
        #print(sentence)
     #   sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    #sentences.pop()
    #print(sentences)
    
    #return sentences

def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.
    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix


def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue


def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary


def run_summarization(file_name):
    text = read_article_tfidf(file_name)
    text = '. '.join(text)
    """
    :param text: Plain summary_text of long article
    :return: summarized summary_text
    """

    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''
    # 1 Sentence Tokenize
    sentences = sent_tokenize(text)
    total_documents = len(sentences)
    #print(sentences)

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)
    #print(freq_matrix)

    '''
    Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
    '''
    # 3 Calculate TermFrequency and generate a matrix
    tf_matrix = _create_tf_matrix(freq_matrix)
    #print(tf_matrix)

    # 4 creating table for documents per words
    count_doc_per_words = _create_documents_per_words(freq_matrix)
    #print(count_doc_per_words)

    '''
    Inverse document frequency (IDF) is how unique or rare a word is.
    '''
    # 5 Calculate IDF and generate a matrix
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
    #print(idf_matrix)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    #print(tf_idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)
    #print(sentence_scores)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)
    #print(threshold)

    # 9 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.25 * threshold)
    return summary


#if __name__ == '__main__':
for i in range(10):
    text_str = ("../input/link"+ str(i+1) +".txt")
    print("link"+ str(i+1))
    result = run_summarization(text_str)
    print("Summarized Text: \n",''.join(result))
    print("\n")

link1
Summarized Text: 
  AI PERFECTING THE ART OF PERFUME MAKING. The French have dominated the realm of fragrance since the 16th century. Perfume making is indeed a craft that requires heightened sensory perception. E-nose recognizes a range of smells and collects information on them to run algorithms that track the source. How is an e-nose particularly helpful? Firstly, it can be integrated into many other devices, which can enhance its overall function to include many more features. A robot or software can handle the ingredients more deftly and precisely. Therefore, they do not, in any way, invalidate the skills of a perfumer.


link2
Summarized Text: 
  This could prove to be highly advantageous for an industry for which consistency in size and fitting is vital. An increase in the cost of manufacturing is duly reflected in the retail cost of the apparel. The water-soluble substance is then rinsed off in hot water once the pieces of clothing have been completely assembled. Proponen

# GenSim Model

In [36]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint
from gensim.summarization import summarize

In [38]:
def read_article_gensim(file_name):
    with open(file_name) as f:
        file_content = f.read()
    return file_content

for i in range(10):
    text_str = ("../input/link"+ str(i+1) +".txt")
    text = (read_article_gensim(text_str))
    print("link"+ str(i+1))
    print("Summarized Text:")
    # As per word count
    article = summarize(text, word_count=60)
    # As per ratio
    #article = summarize(text, ratio=0.5))
    #Normal
    ## As per word countsummarize(text))
    
    #pprint(article)
    sentences = []
    for sentence in article:
        sentences.append(sentence.replace("\n", " "))
    #sentences.pop() 
    print(''.join(sentences))
    print(" ")



link1
Summarized Text:
In a fascinating development in the field, Artificial Intelligence has entered the business of perfume making and soon robots may redefine the perfume industry to emerge as the noses of the future. The app learns of each user’s likes and lifestyle and based on this data, it recommends perfumes from a vast database of over 30,000 fragrances.
 
link2
Summarized Text:
While the earliest attempts of automation during the industrial revolution were directed at minimizing human efforts in labor-intensive processes, the tremendous advancements achieved in the field of technology in the past few decades have made it possible to completely eliminate human intervention from many quarters of the manufacturing industry. By drastically cutting down the labor requirement and increasing the rate of production by more than double, the Sewbot can potentially spark off a drastic transformation in the functioning of the apparel industry.
 
link3
Summarized Text:
3FG15 Features The 

In [44]:

for i in range(10):
    print( "link"+ str(i+1))
    text_str = ("../input/link"+ str(i+1) +".txt")
    
    print("TextRank_Model")
    generate_summary(text_str, 3)
    print("\n")
    
    print("TF-IDF_Model")
    result = run_summarization(text_str)
    print("Summarized Text: \n",''.join(result))
    print("\n")
    
    print("GenSim_Model")
    text = (read_article_gensim(text_str))
    print("Summarized Text:")
    # As per word count
    article = summarize(text, word_count=60)
    sentences = []
    for sentence in article:
        sentences.append(sentence.replace("\n", " "))
    #sentences.pop() 
    print(''.join(sentences))
    print("\n")
    
    

link1
TextRank_Model
Summarized Text: 
 In a move to accelerate their perfume development, 124-year-old Swiss flavor and fragrance giant Givaudan launched an AI program, Carto, which optimizes their production by making perfume recommendations based on a chart that details the individual properties of different fragrances. AI PERFECTING THE ART OF PERFUME MAKING. In a fascinating development in the field, Artificial Intelligence has entered the business of perfume making and soon robots may redefine the perfume industry to emerge as the noses of the future


TF-IDF_Model
Summarized Text: 
  AI PERFECTING THE ART OF PERFUME MAKING. The French have dominated the realm of fragrance since the 16th century. Perfume making is indeed a craft that requires heightened sensory perception. E-nose recognizes a range of smells and collects information on them to run algorithms that track the source. How is an e-nose particularly helpful? Firstly, it can be integrated into many other devices, which 

Summarized Text: 
 Benefits of Using a VFD in Place of an RVSS For many, perhaps most users, the primary benefit of VFD is 100% starting torque at 100% current (no inrush) versus 32% starting torque and 200% inrush typical of an RVSS at 40% starting voltage. (Image courtesy of Parker Hannifin.) Solid state reduced voltage starters convert fixed voltage/frequency into variable voltage at fixed frequency to start 3-phase induction motors, after which the bypass contactor shunts the RVSS, directly connecting the motor to the main AC input supply. The VFD applies both a variable voltage and a variable frequency to the motor


TF-IDF_Model
Summarized Text: 
  Yes! Listen to this story Play Mute Parker Hannifin has sponsored this post. (Image courtesy of Parker Hannifin.) This causes mechanical stress and significant heating to the motor and motor windings. The same ramping can be used during stopping. (Image courtesy of Parker Hannifin.) (Image courtesy of Parker Hannifin.) Limiting starts 