In [None]:
from sklearn.model_selection import train_test_split
from sklearn import neural_network
from sklearn import  metrics
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential
import pandas as pd
import numpy as np
import keras
import re
import nltk

In [None]:
# Fetch the NLTK resources used further down: the stop-word corpus, the
# Punkt tokenizer models and the averaged-perceptron part-of-speech tagger.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
from nltk.corpus import stopwords
# English stop words plus a few punctuation / quote tokens that should be
# ignored when counting word frequencies.
english_stopwords = stopwords.words('english') + [",",".","$","%","'s","``","''"]

In [None]:
# Sentence-boundary regex for re.split(): it matches immediately after a
# '.', '?' or '!' (together with any whitespace that follows) when the next
# character is neither a digit nor a period, or when a "digit.digit" sequence
# follows. A period inside a number such as "70.66" is therefore not treated
# as a sentence end.
pattern = r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'

In [None]:
# Location of the CSV feature table read below.
# NOTE(review): absolute path, presumably a Colab Google Drive mount — adjust
# when running elsewhere.
data_path = "/content/drive/MyDrive/ML Folder/textsum_dataset/featuresDataset3.csv"

In [None]:
# Location of the persisted classifier (joblib file) loaded below.
# NOTE(review): absolute path, presumably a Colab Google Drive mount — adjust
# when running elsewhere.
model_path = "/content/drive/MyDrive/ML Folder/textsum_dataset/summarizer_model.joblib"

In [None]:
# Load the feature table; the last column ("Included") is the target label.
df = pd.read_csv(data_path)

In [None]:
# Remove the "Unnamed: 0" column (presumably the row index saved with the
# CSV). errors="ignore" keeps this cell idempotent: re-running it after the
# column is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
df.head()

Unnamed: 0,FisrtPara,FirstSente,ParaLoc,SenteLoc,SenteLen,ThematicWords,ProperNouns,StatRatio,Included
0,1,1,0.628319,3.141593,21,0.746269,2.962963,2.222222,1
1,0,1,0.339482,1.047198,23,2.158273,0.0,2.112676,0
2,0,0,0.339482,0.565803,11,6.25,2.941176,0.0,1
3,0,0,0.339482,0.435788,20,1.941748,3.809524,0.0,1
4,0,1,0.261473,0.523599,13,1.449275,11.428571,0.0,0


In [None]:
# Positional split: every column except the last is a feature, the last
# column is the label.
X, Y = df.iloc[:, :-1], df.iloc[:, -1]

# Preview the label and feature parts, then show that both have the same
# number of rows.
print(Y.head(), X.head(), len(Y), len(X), sep="\n")

0    1
1    0
2    1
3    1
4    0
Name: Included, dtype: int64
   FisrtPara  FirstSente   ParaLoc  SenteLoc  SenteLen  ThematicWords  \
0          1           1  0.628319  3.141593        21       0.746269   
1          0           1  0.339482  1.047198        23       2.158273   
2          0           0  0.339482  0.565803        11       6.250000   
3          0           0  0.339482  0.435788        20       1.941748   
4          0           1  0.261473  0.523599        13       1.449275   

   ProperNouns  StatRatio  
0     2.962963   2.222222  
1     0.000000   2.112676  
2     2.941176   0.000000  
3     3.809524   0.000000  
4    11.428571   0.000000  
43217
43217


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# model = neural_network.MLPClassifier(
#     alpha= 0.001,
#     hidden_layer_sizes= (100),
#     solver= 'adam',
#     random_state= 50)
# model.fit(x_train, y_train)

In [None]:
from joblib import dump, load
# The classifier is restored from disk rather than retrained in this run; the
# commented-out dump() call is how it would be written out.
# dump(model, 'summarizer_model.joblib')
# SECURITY: joblib.load unpickles the file and can execute arbitrary code,
# so only load model files from a trusted source.
model = load(model_path)

In [None]:
# Score the loaded classifier on the held-out validation rows.
predicted = model.predict(x_val)
# The report is a pre-aligned text table: print it right after the heading
# with no extra leading space (which would shift the header row) and no
# trailing ':' (which would show up as a stray line after the table).
print("Classification Report:\n%s" % metrics.classification_report(y_val, predicted))

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.86      0.79      5370
           1       0.69      0.49      0.57      3274

    accuracy                           0.72      8644
   macro avg       0.71      0.68      0.68      8644
weighted avg       0.72      0.72      0.71      8644
:


In [None]:
def extract_features(article):
  """Compute one feature vector per sentence of ``article``.

  The article is split into paragraphs on blank lines ("\\n\\n") and each
  paragraph into sentences with the sentence-boundary regex below; trailing
  '.', '?' and '!' are stripped and sentences that end up empty are skipped.

  Each returned row holds, in this order:
    0. 1 if the sentence is in the first paragraph, else 0
    1. 1 if the sentence is the first of its paragraph, else 0
    2. |pi * cos(paragraph index) / number of paragraphs|
    3. |pi * cos(sentence index) / number of ". "-separated parts of the paragraph|
    4. number of space-separated words in the sentence
    5. thematic-word score (words among the article's 10 most frequent
       non-stop-word alphanumeric tokens)
    6. proper-noun score (words tagged 'NNP')
    7. statistics score (words tagged 'CD')

  Args:
    article: the full article text as a single string.

  Returns:
    A list of 8-element lists, one per non-empty sentence, in document order.
  """
  # Same sentence-boundary regex as the module-level ``pattern``; kept local
  # so the function does not depend on that global.
  pattern = r'(?<=[.?!])(?:\s*(?=[^0-9.]|[0-9]\.[0-9]))'

  allWords = nltk.tokenize.word_tokenize(article)
  allWordExceptStopDist = nltk.FreqDist(
      w.lower() for w in allWords
      if w.lower() not in english_stopwords and w.isalnum())

  # Sets give O(1) membership tests in the per-word loop below; the
  # membership results are the same as with the equivalent lists.
  mostCommon = {k for k, c in allWordExceptStopDist.most_common(10)}
  pos_tags = nltk.pos_tag(allWords)
  proper_nouns = {word for word, pos_tag in pos_tags if pos_tag == 'NNP'}
  stats = {item for item, pos_tag in pos_tags if pos_tag == 'CD'}

  # Loop-invariant: split the article into paragraphs only once.
  paragraphs = article.split("\n\n")
  paragraphCount = len(paragraphs)

  articleFeatureVects = []
  for j, para in enumerate(paragraphs):
    # Invariant for every sentence of this paragraph.
    paraPartCount = len(para.split(". "))
    for k, rawSentence in enumerate(re.split(pattern, para)):
      sente = rawSentence.rstrip('.?!')
      if len(sente) == 0:
        continue
      words = sente.split(" ")

      # NOTE(review): words are compared exactly as split on spaces (case
      # and attached punctuation preserved) although ``mostCommon`` holds
      # lower-cased tokens, and the ratios below divide by the sentence's
      # character length. Presumably the persisted model was trained on
      # features computed this way — confirm before changing any of it.
      thematicWords = sum(1 for word in words if word in mostCommon)
      propnounWords = sum(1 for word in words if word in proper_nouns)
      statsWords = sum(1 for word in words if word in stats)

      # Guard against a zero denominator for the thematic score.
      thematicDenominator = len(sente) - thematicWords
      if thematicDenominator == 0:
        thematicDenominator = 1

      articleFeatureVects.append([
          1 if j == 0 else 0,   # original author's note: eliminate this feature
          1 if k == 0 else 0,
          np.absolute(np.pi*np.cos(j)/paragraphCount),
          np.absolute(np.pi*np.cos(k)/paraPartCount),
          len(words),
          100*thematicWords/thematicDenominator,
          propnounWords/len(sente)*200,
          statsWords/len(sente)*300,
      ])

  return articleFeatureVects

In [None]:
article = """
Modi said the people of Rajasthan had taught Congress a lesson in the first phase. “In the first phase, half of Rajasthan has punished the Congress and taught it a lesson. Full of patriotism, Rajasthan knows that Congress cannot ever make a strong Bharat. The country doesn’t want a Congress government. The country doesn’t want the pre-2014 situation to return. Everyone used to threaten the weak Congress government and everyone was busy looting the country. No one even used to ask for the Prime Minister, and the government used to be run by remote control,” he said.

Highlighting his government’s achievements, particularly for Rajasthan, Modi said, around 19 lakh poor, Dalit, backward, and tribal families in the state have been given pucca houses, of which 1 lakh homes have been constructed in Jalore and Sirohi.

The Congress has fielded Vaibhav Gehlot, son of former chief minister Ashok Gehlot, against BJP’s Lumbaram Choudhary from Jalore. The BJP’s Devji Patel is the sitting MP from the seat.
"""

In [None]:
article2 = """
In the early hours of Friday, Mehrdad, an engineer in Isfahan, Iran, woke to the sound of explosions rattling the windows and shaking the ground. In Tehran, passengers about to board flights were abruptly told the airspace was closed.

Israel, they soon learned, had attacked Iran.

As booms and gunfire went off in the distance, Mehrdad, 43, came to realize that the Israelis’ target was a military base on the outskirts of the city. He and his pregnant wife remained fearful that war would break out, he said in an interview by phone.

“I think Israel wanted to test the water and evaluate with last night’s strikes,” said Mehrdad, who, like others interviewed for this article, asked that his last name be withheld for fear of retribution. “I fear the worst is coming, but I also hope that things end here.”

So, apparently, does the Iranian government, which after a week of promising a forceful response to any Israeli attack on Iranian territory, appeared to be standing down from nearly going to the brink of war with Israel. Facing deep economic troubles and a restive population, the government seems to have adopted a two-track policy, analysts say, declaring victory over Israel and cracking down at home.

“The external and internal challenges are two sides of the same coin for the establishment,” Abbas Abdi, a prominent analyst and writer in Tehran, said in a telephone interview. “With both Israel and internal dissent, they are taking an aggressive approach because they think both issues have reached a boiling point where if they do nothing it will only get worse.”

The tit-for-tat attacks between Iran and Israel over the past three weeks were a startling and worrisome departure from the shadow warfare they have waged for decades, raising fears of a regional war. Iran responded to a deadly Israeli attack on its embassy compound in Damascus, Syria, by launching a barrage of more than 300 drones and missiles directly at Israel for the first time. A majority of them were intercepted.

World leaders implored Israel to respond with restraint, which it did on Friday, attacking an Iranian air force base with drones. The strike damaged the radar of an S-300 system responsible for the air defense of Natanz nuclear facility in central Iran. Israel also fired air-to-ground missiles toward Iran but deliberately inflicted little damage. Afterward, Iranian state news media and officials downplayed the attack.

Editors’ Picks

Atlas, a Humanoid Robot From Boston Dynamics, Is Leaping Into Retirement

Watch the Lyrid Meteor Shower Reach Its Peak

It’s 4/20. These Restaurants Know You Have the Munchies.
Image
People on a street walk past a large billboard depicting missiles.
Iran and Israel have waged shadow warfare for decades, but in recent weeks the conflict has escalated, raising fears of a regional war.Credit...Arash Khamooshi for The New York Times
Nasser Imani, an analyst in Tehran with close ties to the government, said Iran had dealt effectively with Israel and could now afford to de-escalate.

“Iranian officials do not want war with Israel,” he said in a telephone interview. “Iran will end it here and not directly engage any more because they feel they have established enough deterrence for now.”
"""

In [None]:
article3 = """
India's power consumption grew nearly 10% year-on-year to 70.66 billion units (BU) in the first half of April this year, showing improvement in economic activities and consumption patterns, according to the power ministry data.

According to the data, power consumption in the country rose to 70.66 BU during April 1-15 this year from 64.24 MU in the year-ago period.


The peak power demand met or the highest supply in a day rose to about 218 GW in the first half of April compared to 206 GW in the same period a year ago.


The highest supply in a day during the entire month of April last year was about 216 GW.

The ministry has projected a peak power demand of 260 GW during the summer season (April to June) in view of longer heat wave duration. The peak power demand had touched an all-time high of 243 GW in September 2023.

The India Meteorological Department (IMD) has predicted above-normal maximum temperatures in most parts of the country during summer this year.


The experts said that the power demand will increase in the coming days as rains in different parts of the country have reduced the need for cooling appliances like air conditioners, desert coolers etc used during this time of the year.

However, they said that power consumption growth in double digits shows improvement in economic activities and change in consumption patterns.

They are of the view that the consumers in India are also increasing their energy consumption on various appliances and gadgets as is being done in the developed world.

Besides, they pointed out that the increase in electricity in the transport sector like electric buses, cars, rickshaws and railways has also changed the consumption pattern and increased per capita use.
"""

In [None]:
def count_total_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

In [None]:
def summarize(article, max_sents=1000):
  """Build an extractive summary of ``article`` with the loaded classifier.

  Every sentence gets a feature vector from ``extract_features``; the
  sentences that ``model`` predicts as included are joined with newlines.

  Args:
    article: the full article text as a single string.
    max_sents: maximum number of selected sentences kept in the summary.

  Returns:
    A tuple ``(summary, total_words, total_summary_words,
    percent_compression)`` where the word counts come from
    ``count_total_words`` and ``percent_compression`` is the percentage of
    the article's words that were dropped (0.0 for an article with no words).
  """
  import warnings

  features = extract_features(article)

  # Re-split the article exactly as extract_features does (same paragraph
  # split, same regex, same empty-sentence filter) so that the i-th sentence
  # here lines up with the i-th feature row.
  article_sentences = []
  for para in article.split("\n\n"):
    for sentence in re.split(pattern, para):
      sente = sentence.rstrip('.?!')
      if len(sente)==0:
        continue
      article_sentences.append(sente)

  total_words = count_total_words(article)

  if features:
    # Silence warnings only for the duration of the prediction call;
    # filterwarnings('ignore') at this point would hide every warning for the
    # rest of the session.
    with warnings.catch_warnings():
      warnings.simplefilter('ignore')
      summarized_sentences = model.predict(features)
  else:
    # Nothing to classify (e.g. empty article): skip predict(), which cannot
    # be called on an empty feature list.
    summarized_sentences = []

  summary_array = []
  for i, selected in enumerate(summarized_sentences):
    if selected:
      # Sentences can carry leading newlines from the paragraph split; strip
      # them so the summary has no stray blank lines.
      cleaned = article_sentences[i].strip()
      if cleaned:
        summary_array.append(cleaned)

  # Slicing already clamps to the list length.
  summary = "\n".join(summary_array[:max_sents])
  total_summary_words = count_total_words(summary)

  # Avoid dividing by zero for an article without any words.
  if total_words:
    percent_compression = (total_words-total_summary_words)/total_words*100
  else:
    percent_compression = 0.0
  return (summary, total_words, total_summary_words, percent_compression)

In [None]:
def print_summary(summary_content):
  """Print the 4-tuple returned by ``summarize`` as labelled lines.

  Args:
    summary_content: ``(summary, total_words, total_summary_words,
      percent_compression)``.
  """
  summary, total_words, total_summary_words, percent_compression = summary_content
  labelled_values = (
      ("Summarized text: ", summary),
      ("Total words in the article: ", total_words),
      ("Total words in the summary: ", total_summary_words),
      ("Percentage compressed: ", percent_compression),
  )
  for label, value in labelled_values:
    print(label, value)

In [None]:
print_summary(summarize(article2))

Summarized text:  He and his pregnant wife remained fearful that war would break out, he said in an interview by phone
So, apparently, does the Iranian government, which after a week of promising a forceful response to any Israeli attack on Iranian territory, appeared to be standing down from nearly going to the brink of war with Israel
“The external and internal challenges are two sides of the same coin for the establishment,” Abbas Abdi, a prominent analyst and writer in Tehran, said in a telephone interview
“Iranian officials do not want war with Israel,” he said in a telephone interview
Total words in the article:  533
Total words in the summary:  99
Percentage compressed:  81.42589118198875


In [None]:
print_summary(summarize(article3))

Summarized text:  
India's power consumption grew nearly 10% year-on-year to 70.66 billion units (BU) in the first half of April this year, showing improvement in economic activities and consumption patterns, according to the power ministry data
According to the data, power consumption in the country rose to 70.66 BU during April 1-15 this year from 64.24 MU in the year-ago period

The peak power demand met or the highest supply in a day rose to about 218 GW in the first half of April compared to 206 GW in the same period a year ago
The ministry has projected a peak power demand of 260 GW during the summer season (April to June) in view of longer heat wave duration
The peak power demand had touched an all-time high of 243 GW in September 2023
The India Meteorological Department (IMD) has predicted above-normal maximum temperatures in most parts of the country during summer this year

The experts said that the power demand will increase in the coming days as rains in different parts of 