### TF-IDF Summarizer with Proper-Noun Bias and Stop-Word Removal

In [None]:
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Pre-trained spaCy pipeline; the summarizer uses it for sentence splitting,
# part-of-speech tags and named entities.
nlp = spacy.load("en_core_web_sm")

# Module-level tf-idf vectorizer configured to leave English stop words out
# of the vocabulary it builds.
vectorizer = TfidfVectorizer(stop_words='english')

# http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings3/NTCIR3-TSC-SekiY.pdf

In [None]:
def _count_salient_items(sentence_doc):
  """Return (proper_nouns, numbers, places, dates) counts for one parsed sentence."""
  proper_nouns = sum(1 for token in sentence_doc if token.pos_ == 'PROPN')
  numbers = sum(1 for token in sentence_doc if token.like_num)
  places = sum(1 for ent in sentence_doc.ents if ent.label_ == 'GPE')
  dates = sum(1 for ent in sentence_doc.ents if ent.label_ == 'DATE')
  return proper_nouns, numbers, places, dates


def summarizer(text: str, num_of_sentences: int) -> str:
  """Build an extractive summary from the highest-scoring sentences of *text*.

  Each sentence is scored by the mean tf-idf weight of its (non stop-word)
  terms, so long sentences are not favoured merely for being long. The score
  is then boosted according to the sentence's share of the document's proper
  nouns, number-like tokens, places (GPE entities) and dates.

  Args:
    text: Document to summarize.
    num_of_sentences: Maximum number of sentences to return.

  Returns:
    The selected sentences joined by single spaces, best-scoring first.
    An empty string when `text` is empty/blank or `num_of_sentences` is
    not positive.

  Note:
    Uses the module-level `nlp` pipeline and re-fits the module-level
    `vectorizer` on every call. Errors raised by the vectorizer (presumably
    a ValueError when no sentence contains any non stop-word term -- confirm
    against the scikit-learn docs) are propagated unchanged.
  """
  # Guard clauses: `some_list[-0:]` is the *whole* list, so a request for zero
  # sentences must be answered explicitly; blank input has nothing to rank.
  if num_of_sentences <= 0 or not text or not text.strip():
    return ''

  doc = nlp(text)

  # Strip surrounding whitespace/newlines so they do not leak into the summary,
  # and drop sentences that are whitespace only.
  sentences = [s for s in (sent.text.strip() for sent in doc.sents) if s]
  if not sentences:
    return ''

  # Rows are sentences, columns are vocabulary terms.
  X = vectorizer.fit_transform(sentences).toarray()

  # Mean tf-idf weight per sentence. A sentence consisting solely of stop
  # words has no terms; give it a score of 0 instead of dividing by zero
  # (the resulting NaN would otherwise be ranked as the best sentence).
  weight_sums = X.sum(axis=1)
  term_counts = (X != 0).sum(axis=1)
  scores = np.divide(
    weight_sums,
    term_counts,
    out=np.zeros_like(weight_sums),
    where=term_counts > 0,
  )

  # Parse every sentence once and reuse the counts for both the document
  # totals and the per-sentence shares. Columns: proper nouns, numbers,
  # places, dates.
  counts = np.array(
    [_count_salient_items(sentence_doc) for sentence_doc in nlp.pipe(sentences)],
    dtype=float,
  )

  # +1 keeps every denominator non-zero when a category never occurs.
  totals = counts.sum(axis=0) + 1

  # Relative importance of each category, in the same column order as `counts`.
  category_weights = np.array([1.2, 1.0, 1.1, 0.8])

  # Apply the boost exactly once per sentence.
  scores = scores * (1 + (counts / totals) @ category_weights)

  # argsort is ascending, so the best sentences are at the end; take the last
  # `num_of_sentences` and flip them to get best-first order.
  ranking = np.argsort(scores, axis=0)
  best_first = ranking[-num_of_sentences:][::-1]

  return ' '.join(sentences[index] for index in best_first)

In [None]:
text = """All is not well in the ruling alliance in Maharashtra. The inclusion of eight Nationalist Congress Party (NCP) MLAs led by Ajit Pawar has upset several Shiv Sena leaders loyal to Chief Minister Eknath Shinde.  The Sena leaders are upset because some of them will not get their desired position in the Shinde-led ministry after Ajit Pawar's stunning switch that split the party founded by his uncle Sharad Pawar. All is not well in the ruling alliance in Maharashtra. The inclusion of eight Nationalist Congress Party (NCP) MLAs led by Ajit Pawar has upset several Shiv Sena leaders loyal to Chief Minister Eknath Shinde.
The Sena leaders are upset because some of them will not get their desired position in the Shinde-led ministry after Ajit Pawar's stunning switch that split the party founded by his uncle Sharad Pawar. "In politics, when our rival gang wants to join us, we have to take them in and that is what the BJP did. After the NCP joined us, people in our group were upset because some of our leaders would not get their desired position," Shirsat said. The Shinde Sena MLAs have urged Eknath Shinde and deputy chief minister Devendra Fadnavis to resolve this issue.
"Udhhav Thackeray was used by Sharad Pawar as the Chief Minister. The NCP used to run the government when Uddhav was the Chief Minister. Eknath Shinde will decide the course of action now," Shirsat added.
Notably, Sena MLAs had cited the inadequate allocation of funds by Ajit Pawar, who was finance minister in the Maha Vikas Aghadi (MVA) government, to their constituencies as one of the reasons for rebelling against Uddhav Thackeray. """

In [None]:
# Summarize the sample article, keeping the requested number of sentences.
output_sentences = 4
text_summary = summarizer(text, num_of_sentences=output_sentences)

In [None]:
text_summary

'The inclusion of eight Nationalist Congress Party (NCP) MLAs led by Ajit Pawar has upset several Shiv Sena leaders loyal to Chief Minister Eknath Shinde.   The inclusion of eight Nationalist Congress Party (NCP) MLAs led by Ajit Pawar has upset several Shiv Sena leaders loyal to Chief Minister Eknath Shinde. \n All is not well in the ruling alliance in Maharashtra. All is not well in the ruling alliance in Maharashtra.'