In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import English
import numpy as np

In [None]:
# Blank English pipeline. Only sentence boundaries are needed, so the single component
# added is the sentencizer (it is what makes `doc.sents` usable further down).
nlp = English()
try:
    # spaCy >= 3: components are added by their registered name.
    nlp.add_pipe('sentencizer')
except ValueError:
    # spaCy 2.x: `add_pipe` only accepts a callable component, so build it first.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [None]:
import requests

# Download the sample document that the rest of the notebook summarizes.
sample_text_url = 'https://raw.githubusercontent.com/pradipto111/TextSummarizer/main/sample_text.txt'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
req = requests.get(sample_text_url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of summarizing the body of an error page.
req.raise_for_status()
text_corpus = req.text
print(text_corpus)

Junk foods taste good that’s why it is mostly liked by everyone of any age group especially kids and school going children. They generally ask for the junk food daily because they have been trend so by their parents from the childhood. They never have been discussed by their parents about the harmful effects of junk foods over health. According to the research by scientists, it has been found that junk foods have negative effects on the health in many ways. They are generally fried food found in the market in the packets. They become high in calories, high in cholesterol, low in healthy nutrients, high in sodium mineral, high in sugar, starch, unhealthy fat, lack of protein and lack of dietary fibers. Processed and junk foods are the means of rapid and unhealthy weight gain and negatively impact the whole body throughout the life. It makes able a person to gain excessive weight which is called as obesity. Junk foods tastes good and looks good however do not fulfil the healthy calorie r

In [None]:
# Split the corpus into sentences. Line breaks are turned into spaces (not removed) so
# that the words on either side of a break are not glued together into one token.
doc = nlp(text_corpus.replace("\n", " "))
# `Span.text` exists in spaCy 2.x and 3.x alike; `Span.string` is not available in 3.x.
sentences = [sent.text.strip() for sent in doc.sents]

In [None]:
# Show the sentence list produced by the split (label first, then the list itself)
print("Senetence are: \n", end=" ")
print(sentences)

Senetence are: 
 ['Junk foods taste good that’s why it is mostly liked by everyone of any age group especially kids and school going children.', 'They generally ask for the junk food daily because they have been trend so by their parents from the childhood.', 'They never have been discussed by their parents about the harmful effects of junk foods over health.', 'According to the research by scientists, it has been found that junk foods have negative effects on the health in many ways.', 'They are generally fried food found in the market in the packets.', 'They become high in calories, high in cholesterol, low in healthy nutrients, high in sodium mineral, high in sugar, starch, unhealthy fat, lack of protein and lack of dietary fibers.', 'Processed and junk foods are the means of rapid and unhealthy weight gain and negatively impact the whole body throughout the life.', 'It makes able a person to gain excessive weight which is called as obesity.', 'Junk foods tastes good and looks good 

In [None]:
# Remember the position of every sentence (sentence text -> index in `sentences`) so the
# sentences picked by score can later be put back into their original reading order
sentence_organizer = dict(zip(sentences, range(len(sentences))))

In [None]:
# Show the sentence -> position mapping (label first, then the dict itself)
print("Our sentence organizer: \n", end=" ")
print(sentence_organizer)

Our sentence organizer: 
 {'Junk foods taste good that’s why it is mostly liked by everyone of any age group especially kids and school going children.': 0, 'They generally ask for the junk food daily because they have been trend so by their parents from the childhood.': 1, 'They never have been discussed by their parents about the harmful effects of junk foods over health.': 2, 'According to the research by scientists, it has been found that junk foods have negative effects on the health in many ways.': 3, 'They are generally fried food found in the market in the packets.': 4, 'They become high in calories, high in cholesterol, low in healthy nutrients, high in sodium mineral, high in sugar, starch, unhealthy fat, lack of protein and lack of dietary fibers.': 5, 'Processed and junk foods are the means of rapid and unhealthy weight gain and negatively impact the whole body throughout the life.': 6, 'It makes able a person to gain excessive weight which is called as obesity.': 7, 'Junk 

In [None]:
# Let's now create a tf-idf (Term Frequency Inverse Document Frequency) model.
# The on/off options are given as real booleans: scikit-learn documents them as bool,
# and releases that validate parameter types reject the integer 1 in their place.
tf_idf_vectorizer = TfidfVectorizer(min_df=2,  max_features=None,
                                    strip_accents='unicode',
                                    analyzer='word',
                                    token_pattern=r'\w{1,}',
                                    ngram_range=(1, 3),
                                    use_idf=True, smooth_idf=True,
                                    sublinear_tf=True,
                                    stop_words='english')

In [None]:
# Fit the TF-IDF vectorizer, with every sentence acting as one document
tf_idf_vectorizer.fit(raw_documents=sentences)

TfidfVectorizer(min_df=2, ngram_range=(1, 3), smooth_idf=1,
                stop_words='english', strip_accents='unicode', sublinear_tf=1,
                token_pattern='\\w{1,}', use_idf=1)

In [None]:
# Turn each sentence into its TF-IDF vector (one row per sentence)
sentence_vectors = tf_idf_vectorizer.transform(raw_documents=sentences)

In [None]:
# A sentence's score is the sum of the TF-IDF weights in its row
row_totals = sentence_vectors.sum(axis=1)
sentence_scores = np.array(row_totals).flatten()

# Sanity checkup: there must be exactly one score per sentence
print(len(sentence_scores) == len(sentences))

True


In [None]:
# Keep the N best-scoring sentences, best first
N = 3
ranked_indices = np.argsort(sentence_scores, axis=0)[::-1]
top_n_sentences = [sentences[position] for position in ranked_indices[:N]]


In [None]:
# Pair every selected sentence with its position in the text, looked up in the
# sentence_organizer built earlier
mapped_top_n_sentences = [
    (picked, sentence_organizer[picked])
    for picked in top_n_sentences
]
print("Our top_n_sentence with their index: \n")
for element in mapped_top_n_sentences:
    print(element)

# Put the selected sentences back into the order in which they occur in the text
mapped_top_n_sentences.sort(key=lambda pair: pair[1])
ordered_scored_sentences = [picked for picked, position in mapped_top_n_sentences]

# The summary is the selected sentences joined by single spaces
summary = " ".join(ordered_scored_sentences)

Our top_n_sentence with their index: 

('Eating junk food daily lead us to the nutritional deficiencies in the body because it is lack of essential nutrients, vitamins, iron, minerals and dietary fibers.', 14)
('Some of the foods like french fries, fried foods, pizza, burgers, candy, soft drinks, baked goods, ice cream, cookies, etc are the example of high-sugar and high-fat containing foods.', 9)
('For instance, foods like French fries, burgers, candy, and cookies, all have high amounts of sugar and fats.', 28)


In [None]:
# Show the summary built above (label first, then the text)
print("Summary: \n", end=" ")
print(summary)

Summary: 
 Some of the foods like french fries, fried foods, pizza, burgers, candy, soft drinks, baked goods, ice cream, cookies, etc are the example of high-sugar and high-fat containing foods. Eating junk food daily lead us to the nutritional deficiencies in the body because it is lack of essential nutrients, vitamins, iron, minerals and dietary fibers. For instance, foods like French fries, burgers, candy, and cookies, all have high amounts of sugar and fats.


In [None]:
def summarizer(text, tokenizer, max_sent_in_summary=3):
    """Return an extractive summary made of the highest-scoring sentences of `text`.

    Every sentence is treated as one document of a TF-IDF model and scored by the sum
    of its TF-IDF weights. The `max_sent_in_summary` best sentences are kept and joined
    back together in the order in which they occur in `text`.

    Parameters
    ----------
    text : str
        The text to summarize.
    tokenizer : callable
        Called with the text; must return an object whose `sents` iterates over
        sentence spans exposing `.text` (e.g. a spaCy pipeline with a sentencizer).
    max_sent_in_summary : int, default 3
        Upper bound on the number of sentences in the summary.

    Returns
    -------
    str
        The selected sentences joined by single spaces. An empty string when the text
        has no sentences or `max_sent_in_summary` is not positive. The whole text
        (sentence by sentence) when it has no more sentences than requested.

    Raises
    ------
    Whatever `TfidfVectorizer` raises while fitting is propagated, e.g. a `ValueError`
    when no term is left after the `min_df=2` filter.
    """
    # Work on the arguments that were passed in, not on the notebook-level
    # `text_corpus` / `nlp` globals. Line breaks become spaces so the words on either
    # side of a break are not glued together.
    doc = tokenizer(text.replace("\n", " "))
    # `.text` is available on spaCy 2.x and 3.x spans alike.
    sentences = [sent.text.strip() for sent in doc.sents]

    if max_sent_in_summary <= 0 or not sentences:
        return ""
    # Nothing to rank when every sentence fits; this also keeps very short texts away
    # from the vectorizer, which cannot honour min_df=2 with a single document.
    if len(sentences) <= max_sent_in_summary:
        return " ".join(sentences)

    # TF-IDF model over the sentences; the on/off options are passed as real booleans.
    tf_idf_vectorizer = TfidfVectorizer(min_df=2, max_features=None,
                                        strip_accents='unicode',
                                        analyzer='word',
                                        token_pattern=r'\w{1,}',
                                        ngram_range=(1, 3),
                                        use_idf=True, smooth_idf=True,
                                        sublinear_tf=True,
                                        stop_words='english')
    sentence_vectors = tf_idf_vectorizer.fit_transform(sentences)
    # One score per sentence: the sum of the TF-IDF weights in its row.
    sentence_scores = np.asarray(sentence_vectors.sum(axis=1)).ravel()

    # Select by index rather than by sentence text, so that repeated sentences keep
    # their own positions; sorting the indices restores the original reading order.
    top_indices = np.argsort(sentence_scores)[::-1][:max_sent_in_summary]
    return " ".join(sentences[index] for index in sorted(top_indices))

In [None]:
# Run the packaged summarizer on the downloaded corpus and show what it returns
summarizer_result = summarizer(text=text_corpus, tokenizer=nlp, max_sent_in_summary=3)
print("Summarizer Result: \n", summarizer_result)

Summarizer Result: 
 Some of the foods like french fries, fried foods, pizza, burgers, candy, soft drinks, baked goods, ice cream, cookies, etc are the example of high-sugar and high-fat containing foods. Eating junk food daily lead us to the nutritional deficiencies in the body because it is lack of essential nutrients, vitamins, iron, minerals and dietary fibers. For instance, foods like French fries, burgers, candy, and cookies, all have high amounts of sugar and fats.
