In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from gensim import parsing
import glob
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import NMF
from nltk.corpus import stopwords 
from sklearn.decomposition import LatentDirichletAllocation 
from bs4 import BeautifulSoup

In [2]:
from rouge import Rouge

In [3]:
df = pd.read_csv("Wikipedia-Level-4-Articles-Cleaned-2.csv")

In [4]:
df = df.drop(['Unnamed: 0'], axis = 1)
df.head()

Unnamed: 0,Title,Article,Category,Summary,Article_clean,Summary_clean
0,Masaccio,"Masaccio (UK: , US: , Italian: [maˈzattʃo]; De...",People,"Masaccio (UK: , US: , Italian: [maˈzattʃo]; De...",masaccio italian decemb 1401 summer 1428 born ...,masaccio italian december 1401 summer 1428 bor...
1,Hokusai,"Katsushika Hokusai (葛飾 北斎, c. 31 October 1760 ...",People,"Katsushika Hokusai (葛飾 北斎, c. 31 October 1760 ...",katsushika hokusai octob 1760 may 1849 known s...,katsushika hokusai october 1760 may 1849 known...
2,Steven Spielberg,"Steven Allan Spielberg (; born December 18, 1...",People,"Steven Allan Spielberg (; born December 18, 1...",steven allan spielberg born decemb 1946 americ...,steven allan spielberg born december 1946 amer...
3,Che Guevara,"Ernesto ""Che"" Guevara (Spanish: [ˈtʃe ɣeˈβaɾa]...",People,"Ernesto ""Che"" Guevara (Spanish: [ˈtʃe ɣeˈβaɾa]...",ernesto che guevara spanish june 1928 octob 19...,ernesto che guevara spanish june 1928 october ...
4,Jöns Jacob Berzelius,Baron Jöns Jacob Berzelius (Swedish: [jœns ˈjɑ...,People,Baron Jöns Jacob Berzelius (Swedish: [jœns ˈjɑ...,baron jacob berzeliu swedish contemporari name...,baron jacob berzelius swedish contemporary nam...


In [5]:
def LDA_Summarizer(documents, num_sentences, num_of_topics=1000, num_of_top_words=20):
    summaries = []
    for document in documents:
        # Vectorizing the document and finind the vectorized representation
        vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
        document_model = vectorizer.fit_transform([document])
        
        #Fitting an LDA model to find the best topics
        lda = LatentDirichletAllocation(n_components=num_of_topics, max_iter=20, random_state=42)
        lda.fit(document_model)

        # Get the most probable words for each topic
        feature_names = vectorizer.get_feature_names_out()
        topic_words = []
        
        for topic in lda.components_:
            word_idx = np.argsort(topic)[::-1][:num_of_top_words]
            topic_words.append([feature_names[i] for i in word_idx])

        #Finding out the most probable topics in each of the sentences
        sentence_topics = []
        for sentence in nltk.sent_tokenize(document):
            sentence_model = vectorizer.transform([sentence])
            sentence_topics.append(lda.transform(sentence_model)[0])

        summary = []
        selected_sentences = set()
        while len(summary) < num_sentences:
            
            best_sentence = None
            best_score = 0
            
            for i, sentence in enumerate(sentence_topics):
                
                if i in selected_sentences: continue
                sentence_score = sum([sentence[j]*sentence_topics[j][k] for j in range(len(sentence_topics)) for k in range(num_of_topics)])
                
                #Selecting the best sentences based on the PD sum
                
                if sentence_score > best_score:
                    best_sentence = i
                    best_score = sentence_score
            if best_sentence is None: break
            
            summary.append((nltk.sent_tokenize(document)[best_sentence], best_sentence))
            print(nltk.sent_tokenize(document)[best_sentence])
            selected_sentences.add(best_sentence)

        summary = sorted(summary, key=lambda x: x[1])
        summaries.append(' '.join(list(map(lambda x: x[0], summary))))
    return summaries

In [6]:
articles = list(df["Article"])
summary_list = list(df["Summary"])

In [7]:
articles = articles[101:102]
summary_list = summary_list[101:102]

In [8]:
summaries = LDA_Summarizer(articles,10)
for summary in summaries:
    print("-----")
    print(summary)
    print("-----")

  return np.exp(-1.0 * perword_bound)


Philip Cortelyou Johnson (July 8, 1906 – January 25, 2005) was an American architect best known for his works of modern and postmodern architecture.
Among his best-known designs are his modernist Glass House in New Canaan, Connecticut; the postmodern 550 Madison Avenue in New York, designed for AT&T; 190 South La Salle Street in Chicago; the Sculpture Garden of the Museum of Modern Art; and the Pre-Columbian Pavilion at Dumbarton Oaks.
In his obituary in 2005, The New York Times wrote that his works "were widely considered among the architectural masterpieces of the 20th century.
"In 1930, Johnson became the first director of the architecture department of the Museum of Modern Art in New York.
There he arranged for visits by Walter Gropius and Le Corbusier and negotiated the first American commission for Mies van der Rohe, when he fled Nazi Germany.
In 1932, he organized the first exhibition on modern architecture at the Museum of Modern Art.
In 1934, Johnson resigned his position at t

In [9]:
rouge = Rouge()

In [10]:
scores = rouge.get_scores(summaries, summary_list, avg = True)
print(scores)

{'rouge-1': {'f': 0.888489203695267, 'p': 1.0, 'r': 0.7993527508090615}, 'rouge-2': {'f': 0.888086637661901, 'p': 1.0, 'r': 0.7987012987012987}, 'rouge-l': {'f': 0.8888888839506174, 'p': 1.0, 'r': 0.8}}


In [11]:
summary_list

['Philip Cortelyou Johnson (July 8, 1906 – January 25, 2005) was an American architect best known for his works of modern and postmodern architecture. Among his best-known designs are his modernist Glass House in New Canaan, Connecticut; the postmodern 550 Madison Avenue in New York, designed for AT&T; 190 South La Salle Street in Chicago; the Sculpture Garden of the Museum of Modern Art; and the Pre-Columbian Pavilion at Dumbarton Oaks. In his obituary in 2005, The New York Times wrote that his works "were widely considered among the architectural masterpieces of the 20th century."In 1930, Johnson became the first director of the architecture department of the Museum of Modern Art in New York. There he arranged for visits by Walter Gropius and Le Corbusier and negotiated the first American commission for Mies van der Rohe, when he fled Nazi Germany. In 1932, he organized the first exhibition on modern architecture at the Museum of Modern Art.\nIn 1934, Johnson resigned his position at

In [12]:
def NMF_Summarizer(documents, num_sentences, num_of_topics=1000, num_of_top_words=20):
    summaries = []
    for document in documents:
        # Vectorizing the document and finind the vectorized representation
        vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
        document_model = vectorizer.fit_transform([document])
        
        #Fitting an NMF model to find the best topics
        nmf = NMF(n_components=num_of_topics, random_state=42)
        nmf.fit(document_model)

        # Get the most probable words for each topic
        feature_names = vectorizer.get_feature_names_out()
        topic_words = []
        
        for topic in nmf.components_:
            word_idx = np.argsort(topic)[::-1][:num_of_top_words]
            topic_words.append([feature_names[i] for i in word_idx])

        #Finding out the most probable topics in each of the sentences
        sentence_topics = []
        for sentence in nltk.sent_tokenize(document):
            sentence_model = vectorizer.transform([sentence])
            sentence_topics.append(nmf.transform(sentence_model)[0])

        summary = []
        selected_sentences = set()
        while len(summary) < num_sentences:
            
            best_sentence = None
            best_score = 0
            
            for i, sentence in enumerate(sentence_topics):
                
                if i in selected_sentences: continue
                sentence_score = sum([sentence[j]*sentence_topics[j][k] for j in range(len(sentence_topics)) for k in range(num_of_topics)])
                
                #Selecting the best sentences based on the PD sum
                
                if sentence_score > best_score:
                    best_sentence = i
                    best_score = sentence_score
            if best_sentence is None: break
            
            summary.append((nltk.sent_tokenize(document)[best_sentence], best_sentence))
            print(nltk.sent_tokenize(document)[best_sentence])
            selected_sentences.add(best_sentence)

        summary = sorted(summary, key=lambda x: x[1])
        summaries.append(' '.join(list(map(lambda x: x[0], summary))))
    return summaries

In [13]:
summaries = NMF_Summarizer(articles,5)
for summary in summaries:
    print("-----")
    print(summary)
    print("-----")

  return np.sqrt(res * 2)


In 1947, he curated the first exhibition of modern architecture of the Museum of Modern Art including a model of the glass Farnsworth House of Mies.In 1949 he began building a new residence, the Glass House in New Canaan, Connecticut, that was completed in 1949.
"In 1930, Johnson became the first director of the architecture department of the Museum of Modern Art in New York.
There he arranged for visits by Walter Gropius and Le Corbusier and negotiated the first American commission for Mies van der Rohe, when he fled Nazi Germany.
The Man in the Glass House: Philip Johnson, Architect of the Modern Century.
Philip Cortelyou Johnson (July 8, 1906 – January 25, 2005) was an American architect best known for his works of modern and postmodern architecture.
-----
Philip Cortelyou Johnson (July 8, 1906 – January 25, 2005) was an American architect best known for his works of modern and postmodern architecture. "In 1930, Johnson became the first director of the architecture department of the

In [14]:
scores = rouge.get_scores(summaries, summary_list, avg = True)
print(scores)

{'rouge-1': {'f': 0.4829157133711428, 'p': 0.8153846153846154, 'r': 0.343042071197411}, 'rouge-2': {'f': 0.3844393551066404, 'p': 0.6511627906976745, 'r': 0.2727272727272727}, 'rouge-l': {'f': 0.4539007050953172, 'p': 0.7804878048780488, 'r': 0.32}}


In [15]:
summaries

['Philip Cortelyou Johnson (July 8, 1906 – January 25, 2005) was an American architect best known for his works of modern and postmodern architecture. "In 1930, Johnson became the first director of the architecture department of the Museum of Modern Art in New York. There he arranged for visits by Walter Gropius and Le Corbusier and negotiated the first American commission for Mies van der Rohe, when he fled Nazi Germany. In 1947, he curated the first exhibition of modern architecture of the Museum of Modern Art including a model of the glass Farnsworth House of Mies.In 1949 he began building a new residence, the Glass House in New Canaan, Connecticut, that was completed in 1949. The Man in the Glass House: Philip Johnson, Architect of the Modern Century.']

In [16]:
summary_list

['Philip Cortelyou Johnson (July 8, 1906 – January 25, 2005) was an American architect best known for his works of modern and postmodern architecture. Among his best-known designs are his modernist Glass House in New Canaan, Connecticut; the postmodern 550 Madison Avenue in New York, designed for AT&T; 190 South La Salle Street in Chicago; the Sculpture Garden of the Museum of Modern Art; and the Pre-Columbian Pavilion at Dumbarton Oaks. In his obituary in 2005, The New York Times wrote that his works "were widely considered among the architectural masterpieces of the 20th century."In 1930, Johnson became the first director of the architecture department of the Museum of Modern Art in New York. There he arranged for visits by Walter Gropius and Le Corbusier and negotiated the first American commission for Mies van der Rohe, when he fled Nazi Germany. In 1932, he organized the first exhibition on modern architecture at the Museum of Modern Art.\nIn 1934, Johnson resigned his position at