In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import glob
import os
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
import re
import nltk
import math
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
! pip install rouge-score
from rouge_score import rouge_scorer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
def kl_divergence(p, q):
    kl = 0 
    lambda_param = 0.01
    for i in range(p.shape[0]): 
        p_i = p[i]
        q_i = q[i]
        kl += p_i * math.log((p_i + lambda_param) / (q_i + lambda_param ))

    return kl


def kl_summarize (cleaned_docs, num_of_sentences):

    summaries = []

    for cleaned_doc in cleaned_docs:

      vectorizer = TfidfVectorizer(stop_words='english')
      doc_term_matrix = vectorizer.fit_transform([cleaned_doc])

      summary = []
      picked_sentences = set()

      for _ in range(num_of_sentences):

          curr_sentence = ''
          curr_sentence_score = float('-inf')
          curr_sentence_index = 0

          for sentence in nltk.sent_tokenize(cleaned_doc):
            
              if curr_sentence_index not in picked_sentences: 
                  new_sentences = list(map(lambda x:x[0], summary))
                  new_sentences.append(sentence)
                  kl_score = kl_divergence(doc_term_matrix.T.toarray(), vectorizer.transform([' '.join(new_sentences)]).T.toarray())
                  if kl_score > curr_sentence_score:
                    curr_sentence_score = kl_score
                    curr_sentence = (sentence, curr_sentence_index)
                  curr_sentence_index += 1

          if curr_sentence != '':
              summary.append(curr_sentence)
              picked_sentences.add(curr_sentence[1])

      summary = sorted(summary, key = lambda x: x[1]) 
      summaries.append(' '.join(list(map(lambda x: x[0], summary))))

    return summaries

In [20]:
def lda_summarize(cleaned_docs, num_of_sentences, num_of_topics=1000, n_top_words=20):

    summaries = []

    for cleaned_doc in cleaned_docs:

      vectorizer = TfidfVectorizer(stop_words='english')
      doc_term_matrix = vectorizer.fit_transform([cleaned_doc])
      
      lda = LatentDirichletAllocation(n_components=num_of_topics, max_iter=20, random_state=42)
      lda.fit(doc_term_matrix)

      # Get the most probable words for each topic
      feature_names = vectorizer.get_feature_names_out()
      topic_words = []
      for topic_idx, topic in enumerate(lda.components_):
          top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
          top_features = [feature_names[i] for i in top_features_ind]

      # Get the topic distribution for each sentence
      sentence_topics = []
      for sentence in nltk.sent_tokenize(cleaned_doc):
          sentence_model = vectorizer.transform([sentence])
          sentence_topics.append(lda.transform(sentence_model)[0])

      # Pick the top sentences based on the topic diversity
      summary = []
      picked_sentences = set()

      for _ in range(num_of_sentences):

          best_sentence = ''
          best_score = 0

          for i, sentence in enumerate(sentence_topics):
              if i not in picked_sentences: 
                  sentence_score = sum([sentence[j]*sentence_topics[j][k] for j in range(len(sentence_topics)) for k in range(num_of_topics)])
                  if sentence_score > best_score:
                      best_sentence = i
                      best_score = sentence_score

          if best_sentence != '':
              summary.append((nltk.sent_tokenize(cleaned_doc)[best_sentence], best_sentence))
              picked_sentences.add(best_sentence)

      summary = sorted(summary, key=lambda x: x[1])
      summaries.append(' '.join(list(map(lambda x: x[0], summary))))

    return summaries

### DUC

In [5]:
path = '/content/drive/MyDrive/UML/HW5A/DUC2001'

contents = []
summaries = []

data = { 'Article' : [] , 'Content' : [] , 'Summary' : [] }

for name in glob.glob(path + '/*'):
    
    filename  = os.path.basename(name)
    contents = ''
    summaries = ''

    try:
        if filename == 'annotations.txt' or filename in 'notes.txt':
            continue
            
        with open(path + '/Summaries/{}.txt'.format(filename.lower())) as file:
            f = file.read()
            abs = f.find('Abstract:')
            len_abs = len('Abstract:')
            intr = f.find('Introduction:')
            len_intr = len('Introduction:')
            
            summaries = f[(abs+len_abs):intr] 
            contents = f[(intr+len_intr):]
            
    except:
        continue
        
    data['Article'].append(filename)
    data['Summary'].append(summaries.strip().replace('\n', ' '))
    data['Content'].append(contents.strip().replace('\n', ' ').replace('    ', ' ').replace(' \x1a', ''))

In [6]:
df = pd.DataFrame(data)

In [7]:
df.head()

Unnamed: 0,Article,Content,Summary
0,FT911-5176,"Sir, Mr James Skinner (Letters, April 11) char...","Lord Bauer, House of Lords, makes the followin..."
1,FT943-4951,"The Jwaneng mine, on the fringes of the Kalaha...",De Beers' 100-year old Kimberley diamond mines...
2,FT923-5835,"GENERAL ACCIDENT, the leading British insurer,...","General Accident (GA), the leading British ins..."
3,FT934-9116,THE FIGHT over the North American Free Trade A...,After Vice President Gore's debate victory ove...
4,FT933-5709,THE MAN President Bill Clinton has chosen to h...,"President Clinton chose William Daley, son of ..."


In [60]:
transform_to_lower = lambda s: re.sub(r'(?<=\w)(\w+)', lambda x: x.group().lower(), s)

CLEAN_FILTERS = [strip_multiple_whitespaces, transform_to_lower]

def preprocessingContent(document):
    processed_words = preprocess_string(document, CLEAN_FILTERS)
    return processed_words

def joinList(processed_words):
    return ' '.join(processed_words)

In [35]:
df['cleanedContent'] = df['Content'].apply(preprocessingContent).apply(joinList)

In [36]:
df.head()

Unnamed: 0,Article,Content,Summary,cleanedContent
0,FT911-5176,"Sir, Mr James Skinner (Letters, April 11) char...","Lord Bauer, House of Lords, makes the followin...","Sir, Mr James Skinner (Letters, April 11) char..."
1,FT943-4951,"The Jwaneng mine, on the fringes of the Kalaha...",De Beers' 100-year old Kimberley diamond mines...,"The Jwaneng mine, on the fringes of the Kalaha..."
2,FT923-5835,"GENERAL ACCIDENT, the leading British insurer,...","General Accident (GA), the leading British ins...","General Accident, the leading British insurer,..."
3,FT934-9116,THE FIGHT over the North American Free Trade A...,After Vice President Gore's debate victory ove...,The Fight over the North American Free Trade A...
4,FT933-5709,THE MAN President Bill Clinton has chosen to h...,"President Clinton chose William Daley, son of ...",The Man President Bill Clinton has chosen to h...


#### KL Divergence

In [44]:
cleaned_docs = []
documents = []
for i in range(len(df)):
    content = df.iloc[i,1]
    cleanedContent = df.iloc[i,3]
    if cleanedContent != '\x1a':
        documents.append(content)
        cleaned_docs.append(cleanedContent)

summaries = kl_summarize(cleaned_docs[:10], 15)

for summary in summaries:
    print(summary)
    print('-'*100)

Sir, Mr James Skinner (Letters, April 11) charges me with misuse of statistics and understanding the debt burden of poor countries. He cites Africa to support his contention. Such considerations are reinforced by ubiquitous civil conflict. Bauer House of Lords, Westminster Sw1
----------------------------------------------------------------------------------------------------
The Jwaneng mine, on the fringes of the Kalahari desert in Botswana, is known as 'a gem in the world of gems' because it almost certainly is the richest diamond mine in the world - at least in terms of the value of the stones it yields. Jwaneng's position is being reinforced by a Usdollars 160m expansion programme at present being completed three months ahead of schedule and under budget. The value of Jwaneng to Botswana cannot be overstated. Now a so-called fourth stream is being completed which is adding one-third to processing capacity. Already it is 2kms long and 1km wide.
-------------------------------------

In [50]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for i in range(10):
    print('Document', i+1)
    scores = scorer.score(df['Summary'][i], summaries[i])
    for k,v in scores.items():
        print(k,v)
    print('-'*110)

Document 1
rouge1 Score(precision=0.5, recall=0.1926605504587156, fmeasure=0.27814569536423844)
rouge2 Score(precision=0.1951219512195122, recall=0.07407407407407407, fmeasure=0.10738255033557047)
rougeL Score(precision=0.3333333333333333, recall=0.12844036697247707, fmeasure=0.18543046357615894)
--------------------------------------------------------------------------------------------------------------
Document 2
rouge1 Score(precision=0.3269230769230769, recall=0.3063063063063063, fmeasure=0.3162790697674419)
rouge2 Score(precision=0.11650485436893204, recall=0.10909090909090909, fmeasure=0.11267605633802817)
rougeL Score(precision=0.18269230769230768, recall=0.17117117117117117, fmeasure=0.17674418604651163)
--------------------------------------------------------------------------------------------------------------
Document 3
rouge1 Score(precision=0.48484848484848486, recall=0.3018867924528302, fmeasure=0.37209302325581395)
rouge2 Score(precision=0.23076923076923078, recall=0.1

#### LDA

In [52]:
cleaned_docs = []
documents = []
for i in range(len(df)):
    content = df.iloc[i,1]
    cleanedContent = df.iloc[i,3]
    if cleanedContent != '\x1a':
        documents.append(content)
        cleaned_docs.append(cleanedContent)

summaries = lda_summarize(cleaned_docs[:10], 15)

for summary in summaries:
    print(summary)
    print('-'*50)

  return np.exp(-1.0 * perword_bound)


Sir, Mr James Skinner (Letters, April 11) charges me with misuse of statistics and understanding the debt burden of poor countries. He cites Africa to support his contention. The statistics I used refer to Latin America, the principal thrust of the argument of the Bishop of Oxford, which focused largely on Brazil. The bulk of African debt is owed to official lenders under various aid agreements. The debts represent loans with a substantial grant element. The limiting case is the international development association loans, 50-year loans, unindexed for inflation, at zero interest. The debts of African countries have often been cancelled or rescheduled, frequently several times for the same country. To treat debt as necessarily burdensome also ignores the initial transfer of resources. This is like saying that banks, building societies, and governments issuing saving certificates are burdened when they pay interest. If the funds are used productively, debt service is not a burden in the 

In [53]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for i in range(10):
    print('Document', i+1)
    scores = scorer.score(df['Summary'][i], summaries[i])
    for k,v in scores.items():
        print(k,v)
    print('-'*110)

Document 1
rouge1 Score(precision=0.2465277777777778, recall=0.6513761467889908, fmeasure=0.35768261964735515)
rouge2 Score(precision=0.06968641114982578, recall=0.18518518518518517, fmeasure=0.10126582278481014)
rougeL Score(precision=0.1284722222222222, recall=0.3394495412844037, fmeasure=0.18639798488664985)
--------------------------------------------------------------------------------------------------------------
Document 2
rouge1 Score(precision=0.20603015075376885, recall=0.7387387387387387, fmeasure=0.32220039292730845)
rouge2 Score(precision=0.0654911838790932, recall=0.23636363636363636, fmeasure=0.10256410256410255)
rougeL Score(precision=0.0829145728643216, recall=0.2972972972972973, fmeasure=0.12966601178781925)
--------------------------------------------------------------------------------------------------------------
Document 3
rouge1 Score(precision=0.37037037037037035, recall=0.8490566037735849, fmeasure=0.5157593123209169)
rouge2 Score(precision=0.2520661157024793

### 20NG

In [23]:
data, _ = fetch_20newsgroups(shuffle=True, random_state=11, remove=("headers", "footers", "quotes"), return_X_y=True)

In [54]:
remove_emails = lambda s: re.sub(r'^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+$', '', s)

CLEAN_FILTERS = [remove_emails, strip_tags, strip_multiple_whitespaces]

def preprocessing(document):
    processed_words = preprocess_string(document, CLEAN_FILTERS)
    
    return processed_words

def joinList(processed_words):
    return ' '.join(processed_words)

In [58]:
cleaned_data = []

for d in data[:100]:
    cleaned_d = joinList(preprocessing(d.replace('\n', ' ')))
    cleaned_data.append(cleaned_d)

#### KL Divergence

In [61]:
summaries = kl_summarize(cleaned_data[:10], 15)

for summary in summaries:
    print(summary)
    print('-'*100)

--- Received from EEI.EEIIHY 353-1-2800455 93-04-26 12.28 -> VAX.XPERT..EXPO.LCS.MIT.EDU..INET -: - - - - - - - - - - > MAIL USER IN VAX AND INTERNET help
----------------------------------------------------------------------------------------------------
Now we have strong evidence of where the CPR really stands. Unbelievable and disgusting. !
----------------------------------------------------------------------------------------------------
But they can make you piss in a jar, and possibly provide DNA, semen, and hair samples or to undergo tests for gunpowder residues on your hand. I found it interesting the news reported his acts, but not his reasons).
----------------------------------------------------------------------------------------------------
Yes, that's true, but you have to be clear exactly what is an uninterpreted observation. We can agree that 'I perceive brightness' perhaps. Huh? That's up to you, I guess.
--------------------------------------------------------------

#### LDA

In [63]:
summaries = lda_summarize(cleaned_data[:10], 15)

for summary in summaries:
    print(summary)
    print('-'*100)

--- Received from EEI.EEIIHY 353-1-2800455 93-04-26 12.28 -> VAX.XPERT..EXPO.LCS.MIT.EDU..INET -: - - - - - - - - - - > MAIL USER IN VAX AND INTERNET help
----------------------------------------------------------------------------------------------------
Now we have strong evidence of where the CPR really stands. Unbelievable and disgusting. It only proves that we must never forget... ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Not so unconventional. Eugenic solutions to the Jewish Problem have been suggested by Northern Europeans in the past. Eugenics: a science that deals with the improvement (as by control of human mating) of hereditory qualities of race or breed. -- Webster's Ninth Collegiate Dictionary. This is nothing more than Feisal Husseini's statement that the Zionist entity must be disolved by forcing it to "engage" the surrounding "normal" Arab society. "a strong mixed stock", "integration of Israeli society into the Middle East in a graceful manner,"