# Text Summarization

## Libraries

In [1]:
## Pre-loading the model before data source to optimize memory 

from sentence_transformers import SentenceTransformer
#model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')

In [2]:
import pandas as pd
import PyPDF2
import re
import nltk
import faiss
import time

# define a search 
def search(query, model, text_list):
    
    t=time.time()
    query_vector = model.encode([query])
    k = 5
    top_k = index.search(query_vector, k)
    print('totaltime: {}'.format(time.time()-t))
    return [text_list[_id] for _id in top_k[1].tolist()[0]]
    
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models #don't skip this 
import matplotlib.pyplot as plt

def preprocess(textstring):
   stops =  set(stopwords.words('english'))
   tokens = word_tokenize(textstring)
   return [token.lower() for token in tokens if token.isalpha() 
          and token not in stops]

from rouge_score import rouge_scorer
def print_rouge_score(rouge_score):
    for k,v in rouge_score.items():
        print (k, 'Precision:', "{:.2f}".format(v.precision), 'Recall:', "{:.2f}".format(v.recall), 'fmeasure:', "{:.2f}".format(v.fmeasure))

## Text Soruces

* This text source is a Wireland ranch episode 1 podcast.

In [3]:
# creating a pdf file object
pdfFileObj = open('wireland_ranch.pdf', 'rb')
  
# creating a pdf reader object
pdfReader = PyPDF2.PdfReader(pdfFileObj)
  
# printing number of pages in pdf file
len(pdfReader.pages)

# creating a page object
pageObj = pdfReader.pages
  
# extracting text from page
# loop here to get it all 
text = []
for page in pageObj:
  page = re.sub("\\n", " ", page.extract_text())
  text.append(page)
  
print(text[0])

Episode 1: The Return of the Overseer FEMALE VOICE (hissing whispers underlay the voice): Welcome Overseer . It has been an unusually long time. It’s… [she hesitates as though *nice* is not the proper descriptor for the complex palette of emotions this moment has conjured] …*interesting* to see you again. The overseer opens his eyes, the lids heavy , weighed down by a dogged stubborn sleep still trying to drag him back into the beckoning arms of a slumber from which he’d just awoken. He finds he feels more *revived* than he does *awake*, as though the act of opening his eyes had done more than process light into images in his visual cortex but also maybe… and this thought arrives with a shudder , maybe… saved his life? FEMALE VOICE: Do you know where you are? He did not. And as his eyes adjusted to his surroundings he began to question if he even *wanted* to know . Some things are, after all, unknowable and this room he finds himself in seems to fit snugly into that category . But you 

In [4]:
text = ' '.join(text)
sentences = nltk.sent_tokenize(text)

len(sentences)

DF = pd.DataFrame(sentences, columns = ['sentence'])
DF.head()

Unnamed: 0,sentence
0,Episode 1: The Return of the Overseer FEMALE V...
1,It has been an unusually long time.
2,It’s… [she hesitates as though *nice* is not t...
3,"The overseer opens his eyes, the lids heavy , ..."
4,He finds he feels more *revived* than he does ...


## Create A Search Engine

* Using each sentence as my “documents”, I created a search engine to find specific pieces of text.
* Search for several items.
* Examine the results and comment on how well I think the search engine worked.

In [5]:
# only need to run this thing once and once it is 
# saved, you can "turn off" the chunk using eval = F in 
# Rstudio, or change the code type to markdown to save 
# the code for yourself in datalore but not run it
# Load a pre-trained model
#model = SentenceTransformer('msmarco-MiniLM-L-12-v3')
#wireland_embed = model.encode(DF['sentence'].to_list()) #same as sentences, but helps to have a DF in case you needed to do other cleaning 

In [6]:
sentences = DF['sentence'].to_list()
len(sentences)

236

In [7]:
## breaking the data into 3 smaller batches and processing each batch separately to manage memory usage better.
sentences_1 = sentences[0:78]
sentences_2 = sentences[78:156]
sentences_3 = sentences[156:236]

In [8]:
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')
embeddings_1 = model.encode(sentences_1)
embeddings_2 = model.encode(sentences_2)
embeddings_3 = model.encode(sentences_3)

In [9]:
import numpy as np

# Concatenate along the first axis (axis=0)
wireland_embed = np.concatenate((embeddings_1, embeddings_2, embeddings_3), axis=0)

In [10]:
len(wireland_embed)

236

In [11]:
# Create an index using FAISS
index = faiss.IndexFlatL2(wireland_embed.shape[1])
index.add(wireland_embed)
faiss.write_index(index, 'index_wireland_reviews')

In [12]:
## Search for several items. 

# read in the index later when you need to use this again 
index = faiss.read_index('index_wireland_reviews')
# you do have to have the model open too 
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')

search("overseer", model, DF['sentence'].to_list())

totaltime: 0.1043698787689209


['The overseer opens his eyes, the lids heavy , weighed down by a dogged stubborn sleep still trying to drag him back into the beckoning arms of a slumber from which he’d just awoken.',
 'Episode 1: The Return of the Overseer FEMALE VOICE (hissing whispers underlay the voice): Welcome Overseer .',
 'Ignore her overseer , she’s just a dirty girl, is all.',
 '--- Watch out [he can hear the eyeroll in the everywhere voice] for the majestic Sphinx, Overseer , she has been known to seek attention when she is in heat, which again, can be a doozy if you aren’ t into that sort of thing.',
 'And oh dear overseer , you will soon be at the intersection of all that is, and all that is not.']

In [13]:
search("crime", model, DF['sentence'].to_list())

totaltime: 0.09686398506164551


['The man in the station wagon stared at him in the rearview mirror , violence flashed in the mans eyes for a moment, the kind of violence only an insurrection or civil war can tame and just as quickly as it appeared it was gone and the wagon lurched forward, none worse for the wear .',
 'But money got the better of him, as money does to us all, so he pulled a quick and very illegal U-turn and headed toward the merchant in the square downtown.',
 'He slammed on his brakes and the car skidded toward the curb, his fender a quarter inch from grazing the bumper of the car ahead and for a second, he wondered about how a man with confederate flag and second amendment stickers on a wood paneled station wagon might react in a wreck type situation and well, he probably dodged a bullet there.',
 'Full of bar patrons pretending not to be drunk while cops watch with their beady cop eyes concealed in wraparound sunglasses, leaning on bicycles in spandex shorts, looking for a stumble or hint of hors

In [14]:
search("delivery", model, DF['sentence'].to_list())

totaltime: 0.019928932189941406


['--- Your delivery is on the desk, run along now, wouldn’ t want you to be too late.',
 'An average all day delivery shift in a perpetually collapsing economy veering dangerously toward what some might consider the end, while others, like our driver here, would consider an *improvement*.',
 'So, time to get the delivery and peace out, he decided.',
 'He drove, as most delivery drivers do, in a manner that was both antagonistic to public safety and necessary to make enough money to live another day and do the same thing all over again.',
 'The chime seemed different on the last delivery he made.']

### Summary 

**&nbsp;1. Examine the results**<br>
From the results, the search engine appears to function effectively. It successfully retrieved sentences that contained specific keywords such as `overseer`, `crime`, and `delivery`. The search engine produced relevant results for each query, including the term and its contextual usage within the text.

1. **"Overseer" Query**: The search returned sentences mentioning "overseer" and related contexts, such as the character's interaction and description. This indicates good precision in finding text related to the keyword.
2. **"Crime" Query**: The search engine retrieved sentences discussing scenarios involving potential criminal activity, capturing the broader context of "crime," which suggests good comprehension of query relevance.
3. **"Delivery" Query**: It returned sentences that accurately describe aspects of delivery, from the experience of a delivery driver to specific instances of deliveries, showing the system's ability to associate the query with various uses of the term in the text.

Overall, the search engine works well, with fast response times and accurate text retrieval based on the input queries, demonstrating relevance and efficiency.

## Create Text Summaries

* Create a human summary of the text.
* Create text summaries using LSA, TextRank, and Topic Modeling.
* Assess those summaries using the Rouge-N analyzer.
* Which summary was the best when compared to the human summary?

In [15]:
human_summary = "A crumbling shack in the Mojave desert houses the heartbeat at the center of the universe. \
Long ago, the heart was diminished to a parasite when new spoiled gods built of the more distasteful human energies \
usurped the throne and began their own type of reign. They battled and bickered and those arguments translated to our world \
in the form of tragedies, mass rituals, and monied black magic. Now, it seems that history is coming to a head and our spoiled gods \
are fighting harder than they have ever fought before. Dead in the center of all of this are two humans. One an unwitting delivery \
driver turned host for the parasitic heart. The other a disgraced drug addicted cop who went searching for the driver on behalf of \
his family. Everything else, we will discover together."

chatgpt_summary = "Episode 1: The Return of the Overseer follows the story of a delivery driver who receives an unusual order \
that takes him to Reynold’s Limited Curiosities, a mysterious shop. The driver encounters a strange, sentient desk in the shop, \
which is inhabited by a sphinx-like creature. Despite warnings from an enigmatic voice, the driver approaches the desk and is attacked \
by the creature, only to be saved by multicolored worms that emerge from the light fixtures. The creature is destroyed, and \
a glowing figure appears before the driver, guiding him to a room filled with jars containing strange contents. \
The figure disappears, leaving the driver bewildered. He quickly grabs his delivery and flees the shop, \
experiencing strange phenomena and feeling disconnected from reality."

### Text Rank

In [16]:
num_summary_sentences = 5
# be sure to put in one big long string 
# this will parse things into sentences for summarization
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
# builds a summarizer with a stemmer (which grabs english from above)
summarizer = TextRankSummarizer(stemmer)
# add the stops for the language we set (english)
summarizer.stop_words = get_stop_words(LANGUAGE)

tr_sum = []

for sentence in summarizer(parser.document, num_summary_sentences):
    tr_sum.append(str(sentence))
    
tr_sum = " ".join(tr_sum)

tr_sum

'The car stopped, rocking on its axles and for a second, Time evened back out and the world settled back where it belonged, as in *anywhere but on him * and he pressed the flashing red ACCEPT at the bottom of the screen. He did this for a few moments as the sound slowly faded and finally he could hear the buzzing fluorescent lights, blinking on the ceiling like morse code, both inside and outside the door and yeah they were louder than they should have been but that was barely noticeable compared to the alarm bell nightmare the preceding minutes wrought and this helped restore enough normalcy to the situation that our Driver felt as comfortable as he was going to get with walking inside. He chose to ignore the voice because it did not seem to be interested in, or courteous enough, to try making any sort of sense and he didn’ t know where this buzz he felt was coming from, but he could fucking dig it and this lady and her nonsense was coming dangerously close to ruining it for him and t

### LSA

In [17]:
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

lsa_sum = []

for sentence in summarizer(parser.document, num_summary_sentences):
    lsa_sum.append(str(sentence))
    
lsa_sum = " ".join(lsa_sum)

lsa_sum

'Normally this meant sitting on a zoom call with other corporate lawyers discussing the cost benefit of either ignoring that pesky business of sometimes the gas tank just fucking explodes on the new Kia Soul {redacted} model or doing a recall, [not in terms of human lives mind you, but *settlements and tax write offs*,] and *not* driving 65 through a school zone to get to the curiosities shoppe across town. ‘RLC,’ sat atop what appeared to be a slogan: ‘Where lost objects go to be found.’ The words cast in lowercase blackletter that gleamed luminescent gold shot through with clean lines of green giving off a real Saint Patrick’ s Day at the goth club vibe. The chest curved upward but still dipped down in a submissive pose, human breasts hanging down, nipples wrapped by the lips of suckling pig heads attached to the bodies of gluttonous babies, front arms bent outward behind the creatures awkwardly cradling them, palms outstretched before her supporting the weight of it all. The growl r

### Topic Modeling

In [18]:
# remember all the stuff from earlier that was loaded
# Create a dictionary representation of the documents.
# use our list of sentences from earlier
processed_sentences = [preprocess(sent) for sent in sentences]
# create the vocabulary list 
dictionary = Dictionary(processed_sentences)
# convert to a term by document matrix 
corpus = [dictionary.doc2bow(sent) for sent in processed_sentences]

# Train the topic model
LDAmodel = LdaModel(corpus = corpus, 
                id2word = dictionary,
                iterations = 400, 
                num_topics = 10,
                random_state = 100,
                update_every = 1,
                chunksize = 100,
                passes = 10,
                alpha = 'auto',
                per_word_topics = True)
                
probs = [LDAmodel.get_document_topics(sentence) for sentence in corpus]

save_probs = []
i = 0 # looping variable
for document in probs:
  for (topic, prob) in document:
    if topic == 0: # this is the topic zero but you can pick another one
      save_probs.append((sentences[i], prob))
  i = i + 1
      
DF = pd.DataFrame(save_probs, columns = ["sentence", "prob"])

topic_sum = " ".join(DF.sort_values(by = ["prob"], ascending = False)[0:num_summary_sentences].sentence)

topic_sum

'And through that film, the fluorescent lights strobed and stalled, going bright to dim to dark every couple of seconds and had he been inside, he would have noticed the buzzing bug murder zap noise that accompanied each phase of fluorescence creating a smothering atmosphere that would’ve made his jaw clench tight, and his hair stand on end. As the sun set over his anywhere town USA in a wash of pastels, night began seeping into the sky like ink blots on a Rorschach test, appearing to our driver as butterflies or genocides depending on his mood and the traffic and - both changed minute to minute. He crossed the threshold slowly and as he did a bell rang, presumably a way of letting whoever worked here know they had a visitor , but it must have been broken or… altered, because it rang so loudly , he swore he felt his eardrums vibrate and pulse with each pump of his quickening heart. If you are the impatient sort, come to the back room, this is inadvisable and may lead to stress and nigh

### Rouge-N analyzer
Assess those summaries using the Rouge-N analyzer.

In [19]:
# build a blank model
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# add the gold standard and summary you want to compare
# scores = scorer.score(gold_standard, summary)
# print the scores
# print_rouge_score(scores)

# compare to overall podcast paragraph
print("\n############################## Compared to overall podcast paragraph ##############################")
print("#### TextRank ####")
print_rouge_score(scorer.score(human_summary, tr_sum))

print("\n#### LSA ####")
print_rouge_score(scorer.score(human_summary, lsa_sum))

print("\n#### Topic Modeling ####")
print_rouge_score(scorer.score(human_summary, topic_sum))

# compare to chat gpt
print("\n############################## Compared to ChatGpt ##############################")
print("#### TextRank ####")
print_rouge_score(scorer.score(chatgpt_summary, tr_sum))

print("\n#### LSA ####")
print_rouge_score(scorer.score(chatgpt_summary, lsa_sum))

print("\n#### Topic Modeling ####")
print_rouge_score(scorer.score(chatgpt_summary, topic_sum))


############################## Compared to overall podcast paragraph ##############################
#### TextRank ####
rouge1 Precision: 0.20 Recall: 0.42 fmeasure: 0.27

#### LSA ####
rouge1 Precision: 0.19 Recall: 0.39 fmeasure: 0.26

#### Topic Modeling ####
rouge1 Precision: 0.23 Recall: 0.36 fmeasure: 0.28

############################## Compared to ChatGpt ##############################
#### TextRank ####
rouge1 Precision: 0.18 Recall: 0.40 fmeasure: 0.25

#### LSA ####
rouge1 Precision: 0.18 Recall: 0.39 fmeasure: 0.24

#### Topic Modeling ####
rouge1 Precision: 0.19 Recall: 0.34 fmeasure: 0.24


### Summary 

**&nbsp;1. Which summary was the best when compared to the human summary?**<br>
Based on the `Rouge-N scores` provided, the best summary compared to the human-generated summary was the one generated by __Topic Modeling__. It had the highest **fmeasure** of **0.28** compared to the _TextRank(0.27)_ and _LSA(0.26)_ summaries. This suggests that the Topic Modeling summarizer was more effective in capturing the essential points and reflecting the content and style of the human summary.
However, TextRank(0.25) performed better compared to the Topic Modeling(0.24) and LSA(0.24) summaries against ChatGpt.


### Visualization of topic models

In [20]:
vis = pyLDAvis.gensim_models.prepare(LDAmodel, corpus, dictionary, n_jobs = 1)
pyLDAvis.save_html(vis, 'LDA_Visualization.html') ##saves the file