<a href="https://colab.research.google.com/github/splAcharya/Extractive_Text_Summarization/blob/main/ExTs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extractive Text Summarization Of Wikipedia Articles

## Install Packages (if needed)

In [None]:
# gensim: supplies the Word2Vec and Phrases classes imported in the next cell
!pip install --upgrade gensim

# rouge-score: supplies the rouge_scorer module imported in the next cell
!pip install rouge-score

## Import Required Libraries

In [None]:
import time
import re
import urllib.request
from bs4 import BeautifulSoup as bs

import string
pm_set = set(list(string.punctuation))

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')
sw_set = set(stopwords.words("english"))

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
import gensim.downloader
from gensim.models import Phrases

import multiprocessing
num_cores = multiprocessing.cpu_count()
print(f"Number of Cores: {num_cores}")

from wordcloud import WordCloud
from PIL import Image
from matplotlib import pyplot as plt
import numpy as np

from sklearn.utils.extmath import randomized_svd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as LSA, LatentDirichletAllocation as LDA
from sklearn.cluster import KMeans

from copy import deepcopy
from rouge_score import rouge_scorer
import pandas as pd
from collections import Counter, defaultdict

## Function to Webscrape Wikipedia Article

In [None]:
def fetch_wikipedia_article(url="https://en.wikipedia.org/wiki/Hurricane_Irene_(2005)"):
    """Download a web page and return the concatenated text of its <p> elements.

    Args:
        url (str): Address of the page to fetch. Defaults to the Wikipedia
            article on Hurricane Irene (2005).

    Returns:
        str: The text of every ``<p>`` tag in document order, joined without
        any separator.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request fails.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        raw_html = response.read()
    parsed_article = bs(raw_html, "lxml")
    # str.join avoids rebuilding the string once per paragraph.
    return "".join(p.text for p in parsed_article.find_all("p"))

## Function to Pre-process Web Scraped Wikipedia Article

In [None]:
def preprocess_article(article, min_length = 5):
    """Split an article into sentences and build a cleaned copy of each one.

    The article is lower-cased before sentence splitting, so the "original"
    sentences returned here are lower-case as well.

    Args:
        article (str): Raw article text.
        min_length (int): Minimum number of tokens a cleaned sentence must
            still contain (after stop-word removal) to be kept.

    Returns:
        tuple[list[str], list[str]]: ``(final_article_st, pp_article_st)``.
        ``final_article_st`` holds the retained lower-cased sentences and
        ``pp_article_st`` their cleaned, lemmatized, space-joined versions;
        both lists are index-aligned.
    """
    lemmatizer = WordNetLemmatizer()

    # Applied in order. Every step substitutes a space so neighbouring words
    # are never glued together; the final step collapses the extra spaces.
    cleanup_steps = (
        (r"\([^()]*\)", " "),    # text inside round brackets, brackets included
        (r"\[[^\[\]]*\]", " "),  # bracketed text such as [12] or [citation needed]
        (r"[^a-zA-Z0-9]", " "),  # anything that is not an ASCII letter or digit (covers "\n")
        (r"\s+", " "),           # runs of whitespace
    )

    final_article_st = []
    pp_article_st = []
    for original_sentence in sent_tokenize(article.lower()):
        cleaned = original_sentence
        for pattern, replacement in cleanup_steps:
            cleaned = re.sub(pattern, replacement, cleaned)

        # drop stop words, lemmatize what is left
        tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(cleaned) if word not in sw_set]

        # very short sentences carry too little content to be useful
        if len(tokens) >= min_length:
            pp_article_st.append(" ".join(tokens))
            final_article_st.append(original_sentence)
    return final_article_st, pp_article_st

## Using the two functions above to extract Wikipedia Article

In [None]:
# Download the article, then split it into sentences and clean each one.
article_text = fetch_wikipedia_article("https://en.wikipedia.org/wiki/Hurricane_Irene_(2005)")
# article_st: retained sentences; pp_article_st: their cleaned counterparts (index-aligned).
article_st, pp_article_st = preprocess_article(article_text)
# Word-level tokens of every cleaned sentence.
pp_article_st_wt = [word_tokenize(sent) for sent in pp_article_st]

In [None]:
# All cleaned sentences as one space-separated string (input for the word cloud below).
pp_string = (" ").join(pp_article_st)

In [None]:
# Show the first retained sentence.
article_st[0]

In [None]:
# Show the first five cleaned sentences.
pp_article_st[0:5]

In [None]:
# Show the complete scraped text (long output).
article_text

## Generate Word Clouds For Raw and Preprocessed Wikipedia Article

In [None]:
# Download the mask image and decode it while the HTTP response is still open;
# the context manager then closes the connection.
with urllib.request.urlopen("http://pm1.narvii.com/5750/abb7fdf1f24f9515a1f1d16e6f05d3f03110c51f_00.jpg") as mask_response:
    wc_mask = np.array(Image.open(mask_response))
wc = WordCloud(background_color="white",mask=wc_mask)
# Left panel: raw scraped text. Right panel: cleaned text.
fig, ax = plt.subplots(1,2,figsize=(25,25))
ax[0].imshow(wc.generate(article_text))
ax[0].title.set_text("Raw")
ax[1].imshow(wc.generate(pp_string))
ax[1].title.set_text("Preprocessed")
plt.show()

## LSA Summary Generators

### Function to Generate BOW + LSA Summary

In [None]:
def generate_bow_lsa_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Build an extractive summary from bag-of-words counts ranked with LSA.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of sentences in the summary; also the number of SVD components.
    ngram_tuple(tuple[int,int]): ngram_range handed to CountVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences joined by single spaces with the highest-scoring one first.
  """
  started = time.time()

  # sentence-by-term count matrix fitted on the preprocessed sentences
  term_counts = CountVectorizer(ngram_range=ngram_tuple).fit_transform(pp_sent_tokens)

  # truncated SVD; the right singular vectors are not needed afterwards
  left_vectors, singular_values, _ = randomized_svd(term_counts, n_components = summary_size, n_iter = 100, random_state =100)

  # per-sentence score: sqrt(sum_k U[i,k]^2 * sigma[k]^2)
  scores = np.sqrt(np.dot(left_vectors**2, singular_values**2))

  # sentence indices ordered best first, cut to the requested length
  ranked = np.argsort(scores)[::-1]
  chosen = ranked[0:summary_size]

  # look the winners up in the raw sentences and glue them with spaces
  picked_sentences = list(np.array(raw_sent_tokens)[chosen])
  summary_text = " ".join(picked_sentences)

  # elapsed wall-clock time in milliseconds
  elapsed_ms = (time.time() - started) * 1000

  return elapsed_ms, summary_text

### Function to Generate TF-IDF + LSA Summary

In [None]:
def generate_tfidf_lsa_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Build an extractive summary from TF-IDF features ranked with LSA.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of sentences in the summary; also the number of SVD components.
    ngram_tuple(tuple[int,int]): ngram_range handed to TfidfVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences joined by single spaces with the highest-scoring one first.
  """
  t_start = time.time()

  # TF-IDF sentence-by-term matrix fitted on the preprocessed sentences
  vectorizer = TfidfVectorizer(ngram_range=ngram_tuple)
  tfidf_matrix = vectorizer.fit_transform(pp_sent_tokens)

  # truncated SVD with summary_size components; only U and the singular values are used
  u_mat, sigma, _vt = randomized_svd(tfidf_matrix, n_components = summary_size, n_iter = 100, random_state =100)

  # per-sentence score: sqrt(sum_k U[i,k]^2 * sigma[k]^2)
  scores = np.sqrt(np.dot(u_mat**2, sigma**2))

  # best-first ordering, truncated to the requested number of sentences
  best_first = np.argsort(scores)[::-1][0:summary_size]

  # pull the matching raw sentences and join them with spaces
  summary_text = " ".join(list(np.array(raw_sent_tokens)[best_first]))

  # elapsed wall-clock time in milliseconds
  t_end = time.time()
  return (t_end - t_start) * 1000, summary_text

### Function to Generate Word Embedding + LSA Summary

In [None]:
def create_word_to_vec_model(pp_sent_tokens, ngram=1):
  """Train a small Word2Vec model on the preprocessed sentences.

  Args:
    pp_sent_tokens(List[str]): Preprocessed sentences (one string per sentence).
    ngram(int): Longest phrase to model. Values <= 1 train on plain word
      tokens. For larger values, gensim Phrases is applied ngram - 1 times,
      each pass working on the output of the previous one so that phrases
      can grow beyond two words.

  Returns:
    Word2Vec: The trained model.
  """
  #word tokens for every sentence
  corpus = [word_tokenize(sent) for sent in pp_sent_tokens]

  #each Phrases pass can merge neighbouring tokens once, so pass k must be
  #trained on (and applied to) the output of pass k-1; no pass runs for ngram <= 1
  for _ in range(2, ngram + 1):
    phrase_model = Phrases(corpus)
    corpus = [phrase_model[sentence] for sentence in corpus]

  #num_cores - 2 is zero or negative on machines with two or fewer cores;
  #always give Word2Vec at least one worker thread
  worker_count = max(1, num_cores - 2)

  w2v_model = Word2Vec(sentences=corpus,
                       min_count = 5,
                       window = 5,
                       vector_size=10,
                       sample=6e-5,
                       alpha=0.001,
                       min_alpha = 0.0007,
                       negative=20,
                       workers=worker_count)
  return w2v_model

In [None]:
def generate_wemb_lsa_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram = 1):
  """Generates an extractive summary using Word2Vec-based features and LSA.

  Every vocabulary word is reduced to one scalar (the median of its
  embedding vector). A sentence-by-vocabulary matrix is filled with that
  scalar wherever the sentence contains the word, and sentences are ranked
  with the same SVD-based score as the BOW and TF-IDF LSA summarizers.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of sentences in the summary; also the number of SVD components.
    ngram(int): Forwarded to create_word_to_vec_model.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences joined by single spaces with the highest-scoring one first.
  """
  #start timing
  tic = time.time()

  #train the word2vec model on the sentences that were passed in
  w2v_model = create_word_to_vec_model(pp_sent_tokens, ngram)

  #vocabulary in a fixed (sorted) column order
  word_emb_list = sorted(w2v_model.wv.index_to_key)
  column_of = {word: col for col, word in enumerate(word_emb_list)}

  #one scalar per vocabulary word: the median of its embedding vector
  word_value = [np.median(w2v_model.wv[word]) for word in word_emb_list]

  #sentence-by-vocabulary matrix built from word tokens of the function's own
  #input (not from notebook globals)
  #NOTE(review): for ngram > 1 the vocabulary may contain merged phrase tokens
  #that never match the plain word tokens used here - confirm whether phrase
  #features are wanted
  feature_vector = np.zeros(shape=(len(pp_sent_tokens), len(word_emb_list)))
  for row, sentence in enumerate(pp_sent_tokens):
    for word in set(word_tokenize(sentence)):
      col = column_of.get(word)
      if col is not None:
        feature_vector[row, col] = word_value[col]

  #decompose and score sentences: sqrt(sum_k U[i,k]^2 * sigma[k]^2)
  U, SIGMA, VT = randomized_svd(feature_vector, n_components = summary_size, n_iter = 100, random_state =100)
  sentence_scores = np.sqrt(np.dot(U**2, SIGMA**2))

  #indices of the highest scoring sentences, best first
  top_scores_index = np.argsort(sentence_scores)[::-1][0:summary_size]
  wemb_sum = list(np.array(raw_sent_tokens)[top_scores_index])
  wemb_sum_str = " ".join(wemb_sum)

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  return timeTaken, wemb_sum_str

### Function to Generate BOW + LSA + K-Means Summary

In [None]:
def generate_bow_lsa_kmeans_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Generates an extractive summary using BOW features, LSA and K-Means.

  Sentences are projected with truncated SVD, clustered into summary_size
  groups, and the sentence closest to each cluster center is selected.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LSA components and number of clusters.
    ngram_tuple(tuple[int,int]): ngram_range handed to CountVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences (one per cluster, in cluster order) joined by single spaces.
  """
  #start timing
  tic = time.time()

  count_vectorizer = CountVectorizer(ngram_range=ngram_tuple) #initialize BOW model
  feature_vector = count_vectorizer.fit_transform(pp_sent_tokens) #fit BOW model and vectorize
  dimention_reduction_model = LSA(n_components = summary_size, n_iter = 100, random_state =100) #initialize LSA model
  reduced_feature_vector = dimention_reduction_model.fit_transform(feature_vector) #fit LSA model and project
  clustering_model = KMeans(n_clusters=summary_size, random_state=0, max_iter=500) #initialize K-Means model
  clustering_model.fit(reduced_feature_vector) #fit clustering model

  #for every cluster pick the sentence nearest to its center; sentences come
  #from the raw_sent_tokens argument rather than a notebook global
  summary_sentences = []
  for center in clustering_model.cluster_centers_:
    distances = np.linalg.norm(center - reduced_feature_vector, axis=1)
    summary_sentences.append(raw_sent_tokens[int(np.argmin(distances))])

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary_sentences)


### Function to Generate TF-IDF + LSA + K-Means Summary

In [None]:
def generate_tfidf_lsa_kmeans_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Generates an extractive summary using TF-IDF features, LSA and K-Means.

  Sentences are projected with truncated SVD, clustered into summary_size
  groups, and the sentence closest to each cluster center is selected.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LSA components and number of clusters.
    ngram_tuple(tuple[int,int]): ngram_range handed to TfidfVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences (one per cluster, in cluster order) joined by single spaces.
  """
  #start timing
  tic = time.time()

  tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_tuple) #initialize TF-IDF model
  feature_vector = tfidf_vectorizer.fit_transform(pp_sent_tokens) #fit TF-IDF model and vectorize
  dimention_reduction_model = LSA(n_components = summary_size, n_iter = 100, random_state =100) #initialize LSA model
  reduced_feature_vector = dimention_reduction_model.fit_transform(feature_vector) #fit LSA model and project
  clustering_model = KMeans(n_clusters=summary_size, random_state=0, max_iter=500) #initialize K-Means model
  clustering_model.fit(reduced_feature_vector) #fit clustering model

  #for every cluster pick the sentence nearest to its center; sentences come
  #from the raw_sent_tokens argument rather than a notebook global
  summary_sentences = []
  for center in clustering_model.cluster_centers_:
    distances = np.linalg.norm(center - reduced_feature_vector, axis=1)
    summary_sentences.append(raw_sent_tokens[int(np.argmin(distances))])

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary_sentences)

### Function to Generate Word-Embeddings + LSA + K-Means Summary

In [None]:
def generate_wemb_lsa_kmeans_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram = 1):
  """Generates an extractive summary using Word2Vec-based features, LSA and K-Means.

  Every vocabulary word is reduced to one scalar (the median of its
  embedding vector). The resulting sentence-by-vocabulary matrix is projected
  with truncated SVD, clustered, and the sentence closest to each cluster
  center is selected.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LSA components and number of clusters.
    ngram(int): Forwarded to create_word_to_vec_model.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences (one per cluster, in cluster order) joined by single spaces.
  """
  #start timing
  tic = time.time()

  #train the word2vec model on the sentences that were passed in
  w2v_model = create_word_to_vec_model(pp_sent_tokens, ngram)

  #vocabulary in a fixed (sorted) column order
  word_emb_list = sorted(w2v_model.wv.index_to_key)
  column_of = {word: col for col, word in enumerate(word_emb_list)}

  #one scalar per vocabulary word: the median of its embedding vector
  word_value = [np.median(w2v_model.wv[word]) for word in word_emb_list]

  #sentence-by-vocabulary matrix built from word tokens of the function's own
  #input (not from notebook globals)
  #NOTE(review): for ngram > 1 the vocabulary may contain merged phrase tokens
  #that never match the plain word tokens used here - confirm whether phrase
  #features are wanted
  feature_vector = np.zeros(shape=(len(pp_sent_tokens), len(word_emb_list)))
  for row, sentence in enumerate(pp_sent_tokens):
    for word in set(word_tokenize(sentence)):
      col = column_of.get(word)
      if col is not None:
        feature_vector[row, col] = word_value[col]

  dimention_reduction_model = LSA(n_components = summary_size, n_iter = 100, random_state =100) #initialize LSA model
  reduced_feature_vector = dimention_reduction_model.fit_transform(feature_vector) #fit LSA model and project
  clustering_model = KMeans(n_clusters=summary_size, random_state=0, max_iter=500) #initialize K-Means model
  clustering_model.fit(reduced_feature_vector) #fit clustering model

  #for every cluster pick the sentence nearest to its center; sentences come
  #from the raw_sent_tokens argument rather than a notebook global
  summary_sentences = []
  for center in clustering_model.cluster_centers_:
    distances = np.linalg.norm(center - reduced_feature_vector, axis=1)
    summary_sentences.append(raw_sent_tokens[int(np.argmin(distances))])

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary_sentences)

## LDA Summary Generators

### Function to Generate BOW + LDA Summary

In [None]:
def generate_bow_lda_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Generates an extractive summary using BOW features and LDA topic assignments.

  Each sentence is filed under its most probable LDA topic. Sentences are
  then taken round-robin from the topics (the most probable remaining
  sentence of each topic first) until summary_size sentences are collected
  or none remain, and are emitted in their original document order.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LDA topics and maximum number of summary sentences.
    ngram_tuple(tuple[int,int]): ngram_range handed to CountVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences joined by single spaces.
  """
  #start timing
  tic = time.time()

  #get BOW features
  count_vectorizer = CountVectorizer(ngram_range=ngram_tuple)
  feature_vector = count_vectorizer.fit_transform(pp_sent_tokens)
  print("BOW Features Generated!!!")

  #get LDA features: one topic distribution per sentence
  lda_model = LDA(n_components=summary_size, random_state=0, max_iter = 100)
  document_topic_vector = lda_model.fit_transform(feature_vector)
  print("LDA Features Generated!!!")

  #file every sentence under its most probable topic
  dominant_topic = np.argsort(document_topic_vector, axis = 1)[:, -1]
  topic_bins = defaultdict(list)
  for sent_index, topic_num in enumerate(dominant_topic):
    topic_bins[int(topic_num)].append((sent_index, document_topic_vector[sent_index, topic_num]))
  print("Topic Bins Generated!!!")

  #one bin per populated topic, topics in ascending order; each bin is sorted
  #by ascending probability so pop() returns its best remaining sentence
  ordered_bins = [sorted(topic_bins[topic_num], key=lambda x:x[1]) for topic_num in sorted(topic_bins)]
  print("Topic Bins List Generated!!!\n")

  #round-robin over the bins; stops when enough sentences were taken or when
  #every bin is empty (so a short article cannot cause an endless loop)
  sentences = []
  while len(sentences) < summary_size and any(ordered_bins):
    for topic_bin in ordered_bins:
      if topic_bin and len(sentences) < summary_size:
        sentences.append(topic_bin.pop())

  #restore document order and look the sentences up in the function's own input
  sentences = sorted(sentences, key=lambda x:x[0])
  summary = [raw_sent_tokens[idx] for idx, prob in sentences]

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary)

### Function to Generate TF-IDF + LDA Summary

In [None]:
def generate_tfidf_lda_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Generates an extractive summary using TF-IDF features and LDA topic assignments.

  Each sentence is filed under its most probable LDA topic. Sentences are
  then taken round-robin from the topics (the most probable remaining
  sentence of each topic first) until summary_size sentences are collected
  or none remain, and are emitted in their original document order.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LDA topics and maximum number of summary sentences.
    ngram_tuple(tuple[int,int]): ngram_range handed to TfidfVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences joined by single spaces.
  """
  #start timing
  tic = time.time()

  #get TF-IDF features
  tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_tuple)
  feature_vector = tfidf_vectorizer.fit_transform(pp_sent_tokens)
  print("TF-IDF Features Generated!!!")

  #get LDA features: one topic distribution per sentence
  lda_model = LDA(n_components=summary_size, random_state=0, max_iter = 100)
  document_topic_vector = lda_model.fit_transform(feature_vector)
  print("LDA Features Generated!!!")

  #file every sentence under its most probable topic
  dominant_topic = np.argsort(document_topic_vector, axis = 1)[:, -1]
  topic_bins = defaultdict(list)
  for sent_index, topic_num in enumerate(dominant_topic):
    topic_bins[int(topic_num)].append((sent_index, document_topic_vector[sent_index, topic_num]))
  print("Topic Bins Generated!!!")

  #one bin per populated topic, topics in ascending order; each bin is sorted
  #by ascending probability so pop() returns its best remaining sentence
  ordered_bins = [sorted(topic_bins[topic_num], key=lambda x:x[1]) for topic_num in sorted(topic_bins)]
  print("Topic Bins List Generated!!!\n")

  #round-robin over the bins; stops when enough sentences were taken or when
  #every bin is empty (so a short article cannot cause an endless loop)
  sentences = []
  while len(sentences) < summary_size and any(ordered_bins):
    for topic_bin in ordered_bins:
      if topic_bin and len(sentences) < summary_size:
        sentences.append(topic_bin.pop())

  #restore document order and look the sentences up in the function's own input
  sentences = sorted(sentences, key=lambda x:x[0])
  summary = [raw_sent_tokens[idx] for idx, prob in sentences]

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary)

### Function to Generate Word Embedding + LDA Summary

In [None]:
def generate_wemb_lda_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram = 1):
  """Generates an extractive summary using Word2Vec-based features and LDA topic assignments.

  Every vocabulary word is reduced to one non-negative scalar (the absolute
  median of its embedding vector) to form a sentence-by-vocabulary matrix
  for LDA. Each sentence is filed under its most probable topic; sentences
  are then taken round-robin from the topics until summary_size sentences
  are collected or none remain, and are emitted in document order.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LDA topics and maximum number of summary sentences.
    ngram(int): Forwarded to create_word_to_vec_model.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences joined by single spaces.
  """
  #start timing
  tic = time.time()

  #train the word2vec model on the sentences that were passed in
  w2v_model = create_word_to_vec_model(pp_sent_tokens, ngram)

  #vocabulary in a fixed (sorted) column order
  word_emb_list = sorted(w2v_model.wv.index_to_key)
  column_of = {word: col for col, word in enumerate(word_emb_list)}

  #one non-negative scalar per vocabulary word (abs keeps the LDA input >= 0)
  word_value = [abs(np.median(w2v_model.wv[word])) for word in word_emb_list]

  #sentence-by-vocabulary matrix built from word tokens of the function's own
  #input (not from notebook globals)
  #NOTE(review): for ngram > 1 the vocabulary may contain merged phrase tokens
  #that never match the plain word tokens used here - confirm whether phrase
  #features are wanted
  feature_vector = np.zeros(shape=(len(pp_sent_tokens), len(word_emb_list)))
  for row, sentence in enumerate(pp_sent_tokens):
    for word in set(word_tokenize(sentence)):
      col = column_of.get(word)
      if col is not None:
        feature_vector[row, col] = word_value[col]
  print("Wemb Features Generated!!!")

  #get LDA features: one topic distribution per sentence
  lda_model = LDA(n_components=summary_size, random_state=0, max_iter = 100)
  document_topic_vector = lda_model.fit_transform(feature_vector)
  print("LDA Features Generated!!!")

  #file every sentence under its most probable topic
  dominant_topic = np.argsort(document_topic_vector, axis = 1)[:, -1]
  topic_bins = defaultdict(list)
  for sent_index, topic_num in enumerate(dominant_topic):
    topic_bins[int(topic_num)].append((sent_index, document_topic_vector[sent_index, topic_num]))
  print("Topic Bins Generated!!!")

  #one bin per populated topic, topics in ascending order; each bin is sorted
  #by ascending probability so pop() returns its best remaining sentence
  ordered_bins = [sorted(topic_bins[topic_num], key=lambda x:x[1]) for topic_num in sorted(topic_bins)]
  print("Topic Bins List Generated!!!\n")

  #round-robin over the bins; stops when enough sentences were taken or when
  #every bin is empty (so a short article cannot cause an endless loop)
  sentences = []
  while len(sentences) < summary_size and any(ordered_bins):
    for topic_bin in ordered_bins:
      if topic_bin and len(sentences) < summary_size:
        sentences.append(topic_bin.pop())

  #restore document order and look the sentences up in the function's own input
  sentences = sorted(sentences, key=lambda x:x[0])
  summary = [raw_sent_tokens[idx] for idx, prob in sentences]

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary)

### Function to Generate BOW + LDA + K-Means Summary

In [None]:
def generate_bow_lda_kmeans_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Generates an extractive summary using BOW features, LDA and K-Means.

  The per-sentence LDA topic distributions are clustered into summary_size
  groups, and the sentence closest to each cluster center is selected.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LDA topics and number of clusters.
    ngram_tuple(tuple[int,int]): ngram_range handed to CountVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences (one per cluster, in cluster order) joined by single spaces.
  """
  #start timing
  tic = time.time()

  count_vectorizer = CountVectorizer(ngram_range=ngram_tuple) #initialize BOW model
  feature_vector = count_vectorizer.fit_transform(pp_sent_tokens) #fit BOW model and vectorize
  dimention_reduction_model = LDA(n_components = summary_size, max_iter = 100, random_state =100) #initialize LDA model
  reduced_feature_vector = dimention_reduction_model.fit_transform(feature_vector) #fit LDA model, get topic distributions
  clustering_model = KMeans(n_clusters=summary_size, random_state=0, max_iter=500) #initialize K-Means model
  clustering_model.fit(reduced_feature_vector) #fit clustering model

  #for every cluster pick the sentence nearest to its center; sentences come
  #from the raw_sent_tokens argument rather than a notebook global
  summary_sentences = []
  for center in clustering_model.cluster_centers_:
    distances = np.linalg.norm(center - reduced_feature_vector, axis=1)
    summary_sentences.append(raw_sent_tokens[int(np.argmin(distances))])

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary_sentences)


### Function to Generate TF-IDF + LDA + K-Means Summary

In [None]:
def generate_tfidf_lda_kmeans_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram_tuple=(1,1)):
  """Generates an extractive summary using TF-IDF features, LDA and K-Means.

  The per-sentence LDA topic distributions are clustered into summary_size
  groups, and the sentence closest to each cluster center is selected.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LDA topics and number of clusters.
    ngram_tuple(tuple[int,int]): ngram_range handed to TfidfVectorizer.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences (one per cluster, in cluster order) joined by single spaces.
  """
  #start timing
  tic = time.time()

  tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_tuple) #initialize TF-IDF model
  feature_vector = tfidf_vectorizer.fit_transform(pp_sent_tokens) #fit TF-IDF model and vectorize
  dimention_reduction_model = LDA(n_components = summary_size, max_iter = 100, random_state =100) #initialize LDA model
  reduced_feature_vector = dimention_reduction_model.fit_transform(feature_vector) #fit LDA model, get topic distributions
  clustering_model = KMeans(n_clusters=summary_size, random_state=0, max_iter=500) #initialize K-Means model
  clustering_model.fit(reduced_feature_vector) #fit clustering model

  #for every cluster pick the sentence nearest to its center; sentences come
  #from the raw_sent_tokens argument rather than a notebook global
  summary_sentences = []
  for center in clustering_model.cluster_centers_:
    distances = np.linalg.norm(center - reduced_feature_vector, axis=1)
    summary_sentences.append(raw_sent_tokens[int(np.argmin(distances))])

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary_sentences)

### Function to Generate Word-Embeddings + LDA + K-Means Summary

In [None]:
def generate_wemb_lda_kmeans_summary(raw_sent_tokens, pp_sent_tokens, summary_size = 10, ngram = 1):
  """Generates an extractive summary using Word2Vec-based features, LDA and K-Means.

  Every vocabulary word is reduced to one non-negative scalar (the absolute
  median of its embedding vector) to form a sentence-by-vocabulary matrix.
  The LDA topic distributions of the sentences are clustered, and the
  sentence closest to each cluster center is selected.

  Args:
    raw_sent_tokens(List[str]): Unprocessed sentences; the summary is drawn from these.
    pp_sent_tokens(List[str]): Preprocessed sentences, index-aligned with raw_sent_tokens.
    summary_size(int): Number of LDA topics and number of clusters.
    ngram(int): Forwarded to create_word_to_vec_model.

  Returns:
    tuple[float, str]: Elapsed time in milliseconds, and the selected
    sentences (one per cluster, in cluster order) joined by single spaces.
  """
  #start timing
  tic = time.time()

  #train the word2vec model on the sentences that were passed in
  w2v_model = create_word_to_vec_model(pp_sent_tokens, ngram)

  #vocabulary in a fixed (sorted) column order
  word_emb_list = sorted(w2v_model.wv.index_to_key)
  column_of = {word: col for col, word in enumerate(word_emb_list)}

  #one non-negative scalar per vocabulary word (abs keeps the LDA input >= 0)
  word_value = [np.abs(np.median(w2v_model.wv[word])) for word in word_emb_list]

  #sentence-by-vocabulary matrix built from word tokens of the function's own
  #input (not from notebook globals)
  #NOTE(review): for ngram > 1 the vocabulary may contain merged phrase tokens
  #that never match the plain word tokens used here - confirm whether phrase
  #features are wanted
  feature_vector = np.zeros(shape=(len(pp_sent_tokens), len(word_emb_list)))
  for row, sentence in enumerate(pp_sent_tokens):
    for word in set(word_tokenize(sentence)):
      col = column_of.get(word)
      if col is not None:
        feature_vector[row, col] = word_value[col]

  dimention_reduction_model = LDA(n_components = summary_size, max_iter = 100, random_state =100) #initialize LDA model
  reduced_feature_vector = dimention_reduction_model.fit_transform(feature_vector) #fit LDA model, get topic distributions
  clustering_model = KMeans(n_clusters=summary_size, random_state=0, max_iter=500) #initialize K-Means model
  clustering_model.fit(reduced_feature_vector) #fit clustering model

  #for every cluster pick the sentence nearest to its center; sentences come
  #from the raw_sent_tokens argument rather than a notebook global
  summary_sentences = []
  for center in clustering_model.cluster_centers_:
    distances = np.linalg.norm(center - reduced_feature_vector, axis=1)
    summary_sentences.append(raw_sent_tokens[int(np.argmin(distances))])

  #end time
  toc = time.time()

  #total time in milliseconds
  timeTaken = (toc - tic) * 1000

  #join with a space so sentences do not run into each other
  return timeTaken, " ".join(summary_sentences)

## Number of Lines in Summary

In [None]:
# Target number of sentences per summary; most cells below pass it as summary_size.
N_LINE = 10

## Generate Summaries for Uni-GRAM (1-GRAM) Occurrence

### LSA

#### BOW

In [None]:
# BOW + LSA, 1-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lsa_1gram_tt, bow_lsa_1gram_summary = generate_bow_lsa_summary(article_st, pp_article_st, N_LINE, (1,1))
bow_lsa_1gram_summary

#### TF-IDF

In [None]:
# TF-IDF + LSA, 1-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lsa_1gram_tt, tfidf_lsa_1gram_summary = generate_tfidf_lsa_summary(article_st, pp_article_st, N_LINE, (1,1))
tfidf_lsa_1gram_summary

#### Word-Embeddings

In [None]:
# Word-embedding + LSA, ngram = 1; summary length follows N_LINE like the other cells.
wemb_lsa_1gram_tt, wemb_lsa_1gram_summary = generate_wemb_lsa_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 1)
wemb_lsa_1gram_summary

#### K-Means

##### BOW

In [None]:
# BOW + LSA + K-Means, 1-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lsa_kmeans_1gram_tt, bow_lsa_kmeans_1gram_summary = generate_bow_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (1,1))
bow_lsa_kmeans_1gram_summary

##### TF-IDF

In [None]:
# TF-IDF + LSA + K-Means, 1-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lsa_kmeans_1gram_tt, tfidf_lsa_kmeans_1gram_summary = generate_tfidf_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (1,1))
tfidf_lsa_kmeans_1gram_summary

##### Word-Embeddings

In [None]:
# Word-embedding + LSA + K-Means, ngram = 1: keep (elapsed ms, summary text) and display the summary.
wemb_lsa_kmeans_1gram_tt, wemb_lsa_kmeans_1gram_summary = generate_wemb_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 1)
wemb_lsa_kmeans_1gram_summary

### LDA

#### BOW

In [None]:
# BOW + LDA, 1-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lda_1gram_tt, bow_lda_1gram_summary = generate_bow_lda_summary(article_st, pp_article_st, N_LINE, (1,1))
bow_lda_1gram_summary

#### TF-IDF

In [None]:
# TF-IDF + LDA, 1-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lda_1gram_tt, tfidf_lda_1gram_summary = generate_tfidf_lda_summary(article_st, pp_article_st, N_LINE, (1,1))
tfidf_lda_1gram_summary

#### Word-Embeddings

In [None]:
# Word-embedding + LDA, ngram = 1; summary length follows N_LINE like the other cells.
wemb_lda_1gram_tt, wemb_lda_1gram_summary = generate_wemb_lda_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 1)
wemb_lda_1gram_summary

#### K-Means

##### BOW

In [None]:
# BOW + LDA + K-Means, 1-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lda_kmeans_1gram_tt, bow_lda_kmeans_1gram_summary = generate_bow_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (1,1))
bow_lda_kmeans_1gram_summary

##### TF-IDF

In [None]:
# TF-IDF + LDA + K-Means, 1-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lda_kmeans_1gram_tt, tfidf_lda_kmeans_1gram_summary = generate_tfidf_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (1,1))
tfidf_lda_kmeans_1gram_summary

##### Word-Embeddings

In [None]:
# Word-embedding + LDA + K-Means, ngram = 1: keep (elapsed ms, summary text) and display the summary.
wemb_lda_kmeans_1gram_tt, wemb_lda_kmeans_1gram_summary = generate_wemb_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 1)
wemb_lda_kmeans_1gram_summary

## Generate Summaries For BI-GRAM (2-GRAM) Occurrence

### LSA

#### BOW

In [None]:
# BOW + LSA, 2-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lsa_2gram_tt, bow_lsa_2gram_summary = generate_bow_lsa_summary(article_st, pp_article_st, N_LINE, (2,2))
bow_lsa_2gram_summary

#### TF-IDF

In [None]:
# TF-IDF + LSA, 2-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lsa_2gram_tt, tfidf_lsa_2gram_summary = generate_tfidf_lsa_summary(article_st, pp_article_st, N_LINE, (2,2))
tfidf_lsa_2gram_summary

#### Word-Embeddings

In [None]:
# Word-embedding + LSA, ngram = 2; summary length follows N_LINE like the other cells.
wemb_lsa_2gram_tt, wemb_lsa_2gram_summary = generate_wemb_lsa_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 2)
wemb_lsa_2gram_summary

#### K-Means

##### BOW

In [None]:
# BOW + LSA + K-Means, 2-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lsa_kmeans_2gram_tt, bow_lsa_kmeans_2gram_summary = generate_bow_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (2,2))
bow_lsa_kmeans_2gram_summary

##### TF-IDF

In [None]:
# TF-IDF + LSA + K-Means, 2-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lsa_kmeans_2gram_tt, tfidf_lsa_kmeans_2gram_summary = generate_tfidf_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (2,2))
tfidf_lsa_kmeans_2gram_summary

##### Word-Embeddings

In [None]:
# Word-embedding + LSA + K-Means, ngram = 2: keep (elapsed ms, summary text) and display the summary.
wemb_lsa_kmeans_2gram_tt, wemb_lsa_kmeans_2gram_summary = generate_wemb_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 2)
wemb_lsa_kmeans_2gram_summary

### LDA

#### BOW

In [None]:
# BOW + LDA, 2-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lda_2gram_tt, bow_lda_2gram_summary = generate_bow_lda_summary(article_st, pp_article_st, N_LINE, (2,2))
bow_lda_2gram_summary

#### TF-IDF

In [None]:
# TF-IDF + LDA, 2-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lda_2gram_tt, tfidf_lda_2gram_summary = generate_tfidf_lda_summary(article_st, pp_article_st, N_LINE, (2,2))
tfidf_lda_2gram_summary

#### Word-Embeddings

In [None]:
# Word-embedding + LDA, ngram = 2; summary length follows N_LINE like the other cells.
wemb_lda_2gram_tt, wemb_lda_2gram_summary = generate_wemb_lda_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 2)
wemb_lda_2gram_summary

#### K-Means

##### BOW

In [None]:
# BOW + LDA + K-Means, 2-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lda_kmeans_2gram_tt, bow_lda_kmeans_2gram_summary = generate_bow_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (2,2))
bow_lda_kmeans_2gram_summary

##### TF-IDF

In [None]:
# TF-IDF + LDA + K-Means, 2-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lda_kmeans_2gram_tt, tfidf_lda_kmeans_2gram_summary = generate_tfidf_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (2,2))
tfidf_lda_kmeans_2gram_summary

##### Word-Embeddings

In [None]:
# Word-embedding + LDA + K-Means, ngram = 2: keep (elapsed ms, summary text) and display the summary.
wemb_lda_kmeans_2gram_tt, wemb_lda_kmeans_2gram_summary = generate_wemb_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 2)
wemb_lda_kmeans_2gram_summary

## Generate Summaries For Tri-GRAM (3-GRAM) Occurrence

### LSA

#### BOW

In [None]:
# BOW + LSA, 3-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lsa_3gram_tt, bow_lsa_3gram_summary = generate_bow_lsa_summary(article_st, pp_article_st, N_LINE, (3,3))
bow_lsa_3gram_summary

#### TF-IDF

In [None]:
# TF-IDF + LSA, 3-gram features: keep (elapsed ms, summary text) and display the summary.
tfidf_lsa_3gram_tt, tfidf_lsa_3gram_summary = generate_tfidf_lsa_summary(article_st, pp_article_st, N_LINE, (3,3))
tfidf_lsa_3gram_summary

#### Word-Embeddings

In [None]:
# Word-embedding + LSA, ngram = 3; summary length follows N_LINE like the other cells.
wemb_lsa_3gram_tt, wemb_lsa_3gram_summary = generate_wemb_lsa_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 3)
wemb_lsa_3gram_summary

#### K-Means

##### BOW

In [None]:
# BOW + LSA + K-Means, 3-gram features: keep (elapsed ms, summary text) and display the summary.
bow_lsa_kmeans_3gram_tt, bow_lsa_kmeans_3gram_summary = generate_bow_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (3, 3))
bow_lsa_kmeans_3gram_summary

##### TF-IDF

In [None]:
tfidf_lsa_kmeans_3gram_tt, tfidf_lsa_kmeans_3gram_summary = generate_tfidf_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (3, 3))
tfidf_lsa_kmeans_3gram_summary

##### Word-Embeddings

In [None]:
wemb_lsa_kmeans_3gram_tt, wemb_lsa_kmeans_3gram_summary = generate_wemb_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 3)
wemb_lsa_kmeans_3gram_summary

### LDA

#### BOW

In [None]:
bow_lda_3gram_tt, bow_lda_3gram_summary = generate_bow_lda_summary(article_st, pp_article_st, N_LINE, (3,3))
bow_lda_3gram_summary

#### TF-IDF

In [None]:
tfidf_lda_3gram_tt, tfidf_lda_3gram_summary = generate_tfidf_lda_summary(article_st, pp_article_st, N_LINE, (3,3))
tfidf_lda_3gram_summary

#### Word-Embeddings

In [None]:
# Use the shared N_LINE summary length instead of a hard-coded 10 so this
# summary has the same size as the BOW / TF-IDF 3-gram summaries it is compared with.
wemb_lda_3gram_tt, wemb_lda_3gram_summary = generate_wemb_lda_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 3)
wemb_lda_3gram_summary

#### K-Means

##### BOW

In [None]:
bow_lda_kmeans_3gram_tt, bow_lda_kmeans_3gram_summary = generate_bow_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (3, 3))
bow_lda_kmeans_3gram_summary

##### TF-IDF

In [None]:
tfidf_lda_kmeans_3gram_tt, tfidf_lda_kmeans_3gram_summary = generate_tfidf_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (3, 3))
tfidf_lda_kmeans_3gram_summary

##### Word-Embeddings

In [None]:
wemb_lda_kmeans_3gram_tt, wemb_lda_kmeans_3gram_summary = generate_wemb_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 3)
wemb_lda_kmeans_3gram_summary

## Generate Feature Matrices and Summaries For (4-GRAM) Occurrence

### LSA

#### BOW

In [None]:
bow_lsa_4gram_tt, bow_lsa_4gram_summary = generate_bow_lsa_summary(article_st, pp_article_st, N_LINE, (4,4))
bow_lsa_4gram_summary

#### TF-IDF

In [None]:
tfidf_lsa_4gram_tt, tfidf_lsa_4gram_summary = generate_tfidf_lsa_summary(article_st, pp_article_st, N_LINE, (4,4))
tfidf_lsa_4gram_summary

#### Word-Embeddings

In [None]:
# Use the shared N_LINE summary length instead of a hard-coded 10 so this
# summary has the same size as the BOW / TF-IDF 4-gram summaries it is compared with.
wemb_lsa_4gram_tt, wemb_lsa_4gram_summary = generate_wemb_lsa_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 4)
wemb_lsa_4gram_summary

#### K-Means

##### BOW

In [None]:
bow_lsa_kmeans_4gram_tt, bow_lsa_kmeans_4gram_summary = generate_bow_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (4, 4))
bow_lsa_kmeans_4gram_summary

##### TF-IDF

In [None]:
tfidf_lsa_kmeans_4gram_tt, tfidf_lsa_kmeans_4gram_summary = generate_tfidf_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (4, 4))
tfidf_lsa_kmeans_4gram_summary

##### Word-Embeddings

In [None]:
wemb_lsa_kmeans_4gram_tt, wemb_lsa_kmeans_4gram_summary = generate_wemb_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 4)
wemb_lsa_kmeans_4gram_summary

### LDA

#### BOW

In [None]:
bow_lda_4gram_tt, bow_lda_4gram_summary = generate_bow_lda_summary(article_st, pp_article_st, N_LINE, (4,4))
bow_lda_4gram_summary

#### TF-IDF

In [None]:
tfidf_lda_4gram_tt, tfidf_lda_4gram_summary = generate_tfidf_lda_summary(article_st, pp_article_st, N_LINE, (4,4))
tfidf_lda_4gram_summary

#### Word-Embeddings

In [None]:
wemb_lda_4gram_tt, wemb_lda_4gram_summary = generate_wemb_lda_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 4)
wemb_lda_4gram_summary

#### K-Means

##### BOW

In [None]:
bow_lda_kmeans_4gram_tt, bow_lda_kmeans_4gram_summary = generate_bow_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (4, 4))
bow_lda_kmeans_4gram_summary

##### TF-IDF

In [None]:
tfidf_lda_kmeans_4gram_tt, tfidf_lda_kmeans_4gram_summary = generate_tfidf_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (4, 4))
tfidf_lda_kmeans_4gram_summary

##### Word-Embeddings

In [None]:
wemb_lda_kmeans_4gram_tt, wemb_lda_kmeans_4gram_summary = generate_wemb_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 4)
wemb_lda_kmeans_4gram_summary

## Generate Feature Matrices and Summaries For (5-GRAM) Occurrence

### LSA

#### BOW

In [None]:
bow_lsa_5gram_tt, bow_lsa_5gram_summary = generate_bow_lsa_summary(article_st, pp_article_st, N_LINE, (5,5))
bow_lsa_5gram_summary

#### TF-IDF

In [None]:
tfidf_lsa_5gram_tt, tfidf_lsa_5gram_summary = generate_tfidf_lsa_summary(article_st, pp_article_st, N_LINE, (5,5))
tfidf_lsa_5gram_summary

#### Word-Embeddings

In [None]:
wemb_lsa_5gram_tt, wemb_lsa_5gram_summary = generate_wemb_lsa_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 5)
wemb_lsa_5gram_summary

#### K-Means


##### BOW

In [None]:
bow_lsa_kmeans_5gram_tt, bow_lsa_kmeans_5gram_summary = generate_bow_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (5, 5))
bow_lsa_kmeans_5gram_summary

##### TF-IDF

In [None]:
tfidf_lsa_kmeans_5gram_tt, tfidf_lsa_kmeans_5gram_summary = generate_tfidf_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (5, 5))
tfidf_lsa_kmeans_5gram_summary

##### Word-Embeddings

In [None]:
wemb_lsa_kmeans_5gram_tt, wemb_lsa_kmeans_5gram_summary = generate_wemb_lsa_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 5)
wemb_lsa_kmeans_5gram_summary

### LDA

#### BOW

In [None]:
bow_lda_5gram_tt, bow_lda_5gram_summary = generate_bow_lda_summary(article_st, pp_article_st, N_LINE, (5,5))
bow_lda_5gram_summary

#### TF-IDF

In [None]:
tfidf_lda_5gram_tt, tfidf_lda_5gram_summary = generate_tfidf_lda_summary(article_st, pp_article_st, N_LINE, (5,5))
tfidf_lda_5gram_summary

#### Word-Embeddings

In [None]:
# Use the shared N_LINE summary length instead of a hard-coded 10 so this
# summary has the same size as the BOW / TF-IDF 5-gram summaries it is compared with.
wemb_lda_5gram_tt, wemb_lda_5gram_summary = generate_wemb_lda_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 5)
wemb_lda_5gram_summary

#### K-Means

##### BOW

In [None]:
bow_lda_kmeans_5gram_tt, bow_lda_kmeans_5gram_summary = generate_bow_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (5, 5))
bow_lda_kmeans_5gram_summary

##### TF-IDF

In [None]:
tfidf_lda_kmeans_5gram_tt, tfidf_lda_kmeans_5gram_summary = generate_tfidf_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram_tuple= (5, 5))
tfidf_lda_kmeans_5gram_summary

##### Word-Embeddings

In [None]:
wemb_lda_kmeans_5gram_tt, wemb_lda_kmeans_5gram_summary = generate_wemb_lda_kmeans_summary(article_st, pp_article_st, summary_size = N_LINE, ngram = 5)
wemb_lda_kmeans_5gram_summary

## Summaries Generated Via Survey

In [None]:
survey_summary2 = "Hurricane Irene was a long-lived Cape Verde hurricane during the 2005 Atlantic hurricane season. It was the ninth named storm and fourth hurricane of the record-breaking season. Although there were initial fears of a landfall in the United States due to uncertainty in predicting the storm's track, Hurricane Irene never approached land and caused no recorded damage; however, swells up to 8 ft (2.4 m) and strong rip currents resulted in one fatality in Long Beach, New York. Despite the unfavorable conditions in its vicinity and its poor organization, Tropical Depression Nine continued to strengthen, becoming Tropical Storm Irene on August 7. Contrary to these expectations, warmer waters and less wind shear allowed Irene to become gradually more organized while south of Bermuda, and it became a tropical storm once again early on August 11. Though NHC meteorologists thought it was likely that Irene would become a hurricane, they were not expecting an intensification of such a magnitude. Irene entered a region of increased wind shear and began to weaken, and as a result it was downgraded to a tropical storm early on August 18, when it was 520 miles (830 km) south of Cape Race, Newfoundland. Irene lasted for 14 days as a tropical system, the longest duration of any storm of the 2005 season. However, the hurricane generated strong waves and increased the risk of rip currents along the East Coast of the United States. This storm also marked the fifth occasion the name Irene had been used to name a tropical cyclone in the Atlantic."
survey_summary2

## Evaluation of Generated Summaries using ROUGE-N Metric and Survey Summary 2

## Function to evaluate N-gram summary

In [None]:
def get_ngram_rouge(ngram, generated_summary, survey_summary):
  """Score a generated summary against a reference summary with ROUGE-N.

  Args:
    ngram: N-gram order; selects the `rouge{ngram}` metric (e.g. 1 -> "rouge1").
    generated_summary: Candidate summary text produced by a summarizer.
    survey_summary: Reference summary text the candidate is judged against.

  Returns:
    The score object stored under `rouge{ngram}`; callers read its
    `precision`, `recall` and `fmeasure` attributes.
  """
  metric = f"rouge{ngram}"
  scorer = rouge_scorer.RougeScorer([metric], use_stemmer=False)
  # RougeScorer.score takes (target, prediction). The reference must be passed
  # as the target; with the arguments the other way round, precision and
  # recall are reported swapped.
  return scorer.score(survey_summary, generated_summary)[metric]

### 1-GRAM

#### LSA

In [None]:
bow_lsa_1gram_rouge = get_ngram_rouge(1, bow_lsa_1gram_summary, survey_summary2)
bow_lsa_1gram_rouge

In [None]:
tfidf_lsa_1gram_rouge = get_ngram_rouge(1, tfidf_lsa_1gram_summary, survey_summary2)
tfidf_lsa_1gram_rouge

In [None]:
wemb_lsa_1gram_rouge = get_ngram_rouge(1, wemb_lsa_1gram_summary, survey_summary2)
wemb_lsa_1gram_rouge

In [None]:
bow_lsa_kmeans_1gram_rouge = get_ngram_rouge(1, bow_lsa_kmeans_1gram_summary, survey_summary2)
bow_lsa_kmeans_1gram_rouge

In [None]:
tfidf_lsa_kmeans_1gram_rouge = get_ngram_rouge(1, tfidf_lsa_kmeans_1gram_summary, survey_summary2)
tfidf_lsa_kmeans_1gram_rouge

In [None]:
wemb_lsa_kmeans_1gram_rouge = get_ngram_rouge(1, wemb_lsa_kmeans_1gram_summary, survey_summary2)
wemb_lsa_kmeans_1gram_rouge

#### LDA

In [None]:
bow_lda_1gram_rouge = get_ngram_rouge(1, bow_lda_1gram_summary, survey_summary2)
bow_lda_1gram_rouge

In [None]:
tfidf_lda_1gram_rouge = get_ngram_rouge(1, tfidf_lda_1gram_summary, survey_summary2)
tfidf_lda_1gram_rouge

In [None]:
wemb_lda_1gram_rouge = get_ngram_rouge(1, wemb_lda_1gram_summary, survey_summary2)
wemb_lda_1gram_rouge

In [None]:
bow_lda_kmeans_1gram_rouge = get_ngram_rouge(1, bow_lda_kmeans_1gram_summary, survey_summary2)
bow_lda_kmeans_1gram_rouge

In [None]:
tfidf_lda_kmeans_1gram_rouge = get_ngram_rouge(1, tfidf_lda_kmeans_1gram_summary, survey_summary2)
tfidf_lda_kmeans_1gram_rouge

In [None]:
wemb_lda_kmeans_1gram_rouge = get_ngram_rouge(1, wemb_lda_kmeans_1gram_summary, survey_summary2)
wemb_lda_kmeans_1gram_rouge

### 2-GRAM

#### LSA

In [None]:
bow_lsa_2gram_rouge = get_ngram_rouge(2, bow_lsa_2gram_summary, survey_summary2)
bow_lsa_2gram_rouge

In [None]:
tfidf_lsa_2gram_rouge = get_ngram_rouge(2, tfidf_lsa_2gram_summary, survey_summary2)
tfidf_lsa_2gram_rouge

In [None]:
wemb_lsa_2gram_rouge = get_ngram_rouge(2, wemb_lsa_2gram_summary, survey_summary2)
wemb_lsa_2gram_rouge

In [None]:
bow_lsa_kmeans_2gram_rouge = get_ngram_rouge(2, bow_lsa_kmeans_2gram_summary, survey_summary2)
bow_lsa_kmeans_2gram_rouge

In [None]:
tfidf_lsa_kmeans_2gram_rouge = get_ngram_rouge(2, tfidf_lsa_kmeans_2gram_summary, survey_summary2)
tfidf_lsa_kmeans_2gram_rouge

In [None]:
wemb_lsa_kmeans_2gram_rouge = get_ngram_rouge(2, wemb_lsa_kmeans_2gram_summary, survey_summary2)
wemb_lsa_kmeans_2gram_rouge

#### LDA

In [None]:
bow_lda_2gram_rouge = get_ngram_rouge(2, bow_lda_2gram_summary, survey_summary2)
bow_lda_2gram_rouge

In [None]:
tfidf_lda_2gram_rouge = get_ngram_rouge(2, tfidf_lda_2gram_summary, survey_summary2)
tfidf_lda_2gram_rouge

In [None]:
wemb_lda_2gram_rouge = get_ngram_rouge(2, wemb_lda_2gram_summary, survey_summary2)
wemb_lda_2gram_rouge

In [None]:
bow_lda_kmeans_2gram_rouge = get_ngram_rouge(2, bow_lda_kmeans_2gram_summary, survey_summary2)
bow_lda_kmeans_2gram_rouge

In [None]:
tfidf_lda_kmeans_2gram_rouge = get_ngram_rouge(2, tfidf_lda_kmeans_2gram_summary, survey_summary2)
tfidf_lda_kmeans_2gram_rouge

In [None]:
wemb_lda_kmeans_2gram_rouge = get_ngram_rouge(2, wemb_lda_kmeans_2gram_summary, survey_summary2)
wemb_lda_kmeans_2gram_rouge

### 3-GRAM

#### LSA

In [None]:
bow_lsa_3gram_rouge = get_ngram_rouge(3, bow_lsa_3gram_summary, survey_summary2)
bow_lsa_3gram_rouge

In [None]:
tfidf_lsa_3gram_rouge = get_ngram_rouge(3, tfidf_lsa_3gram_summary, survey_summary2)
tfidf_lsa_3gram_rouge

In [None]:
wemb_lsa_3gram_rouge = get_ngram_rouge(3, wemb_lsa_3gram_summary, survey_summary2)
wemb_lsa_3gram_rouge

In [None]:
bow_lsa_kmeans_3gram_rouge = get_ngram_rouge(3, bow_lsa_kmeans_3gram_summary, survey_summary2)
bow_lsa_kmeans_3gram_rouge

In [None]:
tfidf_lsa_kmeans_3gram_rouge = get_ngram_rouge(3, tfidf_lsa_kmeans_3gram_summary, survey_summary2)
tfidf_lsa_kmeans_3gram_rouge

In [None]:
wemb_lsa_kmeans_3gram_rouge = get_ngram_rouge(3, wemb_lsa_kmeans_3gram_summary, survey_summary2)
wemb_lsa_kmeans_3gram_rouge

#### LDA

In [None]:
bow_lda_3gram_rouge = get_ngram_rouge(3, bow_lda_3gram_summary, survey_summary2)
bow_lda_3gram_rouge

In [None]:
tfidf_lda_3gram_rouge = get_ngram_rouge(3, tfidf_lda_3gram_summary, survey_summary2)
tfidf_lda_3gram_rouge

In [None]:
wemb_lda_3gram_rouge = get_ngram_rouge(3, wemb_lda_3gram_summary, survey_summary2)
wemb_lda_3gram_rouge

In [None]:
bow_lda_kmeans_3gram_rouge = get_ngram_rouge(3, bow_lda_kmeans_3gram_summary, survey_summary2)
bow_lda_kmeans_3gram_rouge

In [None]:
tfidf_lda_kmeans_3gram_rouge = get_ngram_rouge(3, tfidf_lda_kmeans_3gram_summary, survey_summary2)
tfidf_lda_kmeans_3gram_rouge

In [None]:
wemb_lda_kmeans_3gram_rouge = get_ngram_rouge(3, wemb_lda_kmeans_3gram_summary, survey_summary2)
wemb_lda_kmeans_3gram_rouge

### 4-GRAM

#### LSA

In [None]:
bow_lsa_4gram_rouge = get_ngram_rouge(4, bow_lsa_4gram_summary, survey_summary2)
bow_lsa_4gram_rouge

In [None]:
tfidf_lsa_4gram_rouge = get_ngram_rouge(4, tfidf_lsa_4gram_summary, survey_summary2)
tfidf_lsa_4gram_rouge

In [None]:
wemb_lsa_4gram_rouge = get_ngram_rouge(4, wemb_lsa_4gram_summary, survey_summary2)
wemb_lsa_4gram_rouge

In [None]:
bow_lsa_kmeans_4gram_rouge = get_ngram_rouge(4, bow_lsa_kmeans_4gram_summary, survey_summary2)
bow_lsa_kmeans_4gram_rouge

In [None]:
tfidf_lsa_kmeans_4gram_rouge = get_ngram_rouge(4, tfidf_lsa_kmeans_4gram_summary, survey_summary2)
tfidf_lsa_kmeans_4gram_rouge

In [None]:
wemb_lsa_kmeans_4gram_rouge = get_ngram_rouge(4, wemb_lsa_kmeans_4gram_summary, survey_summary2)
wemb_lsa_kmeans_4gram_rouge

#### LDA

In [None]:
bow_lda_4gram_rouge = get_ngram_rouge(4, bow_lda_4gram_summary, survey_summary2)
bow_lda_4gram_rouge

In [None]:
tfidf_lda_4gram_rouge = get_ngram_rouge(4, tfidf_lda_4gram_summary, survey_summary2)
tfidf_lda_4gram_rouge

In [None]:
wemb_lda_4gram_rouge = get_ngram_rouge(4, wemb_lda_4gram_summary, survey_summary2)
wemb_lda_4gram_rouge

In [None]:
bow_lda_kmeans_4gram_rouge = get_ngram_rouge(4, bow_lda_kmeans_4gram_summary, survey_summary2)
bow_lda_kmeans_4gram_rouge

In [None]:
tfidf_lda_kmeans_4gram_rouge = get_ngram_rouge(4, tfidf_lda_kmeans_4gram_summary, survey_summary2)
tfidf_lda_kmeans_4gram_rouge

In [None]:
wemb_lda_kmeans_4gram_rouge = get_ngram_rouge(4, wemb_lda_kmeans_4gram_summary, survey_summary2)
wemb_lda_kmeans_4gram_rouge

### 5-GRAM

#### LSA

In [None]:
bow_lsa_5gram_rouge = get_ngram_rouge(5, bow_lsa_5gram_summary, survey_summary2)
bow_lsa_5gram_rouge

In [None]:
tfidf_lsa_5gram_rouge = get_ngram_rouge(5, tfidf_lsa_5gram_summary, survey_summary2)
tfidf_lsa_5gram_rouge

In [None]:
wemb_lsa_5gram_rouge = get_ngram_rouge(5, wemb_lsa_5gram_summary, survey_summary2)
wemb_lsa_5gram_rouge

In [None]:
bow_lsa_kmeans_5gram_rouge = get_ngram_rouge(5, bow_lsa_kmeans_5gram_summary, survey_summary2)
bow_lsa_kmeans_5gram_rouge

In [None]:
tfidf_lsa_kmeans_5gram_rouge = get_ngram_rouge(5, tfidf_lsa_kmeans_5gram_summary, survey_summary2)
tfidf_lsa_kmeans_5gram_rouge

In [None]:
wemb_lsa_kmeans_5gram_rouge = get_ngram_rouge(5, wemb_lsa_kmeans_5gram_summary, survey_summary2)
wemb_lsa_kmeans_5gram_rouge

#### LDA

In [None]:
bow_lda_5gram_rouge = get_ngram_rouge(5, bow_lda_5gram_summary, survey_summary2)
bow_lda_5gram_rouge

In [None]:

tfidf_lda_5gram_rouge = get_ngram_rouge(5, tfidf_lda_5gram_summary, survey_summary2)
tfidf_lda_5gram_rouge

In [None]:

wemb_lda_5gram_rouge = get_ngram_rouge(5, wemb_lda_5gram_summary, survey_summary2)
wemb_lda_5gram_rouge

In [None]:
bow_lda_kmeans_5gram_rouge = get_ngram_rouge(5, bow_lda_kmeans_5gram_summary, survey_summary2)
bow_lda_kmeans_5gram_rouge

In [None]:
tfidf_lda_kmeans_5gram_rouge = get_ngram_rouge(5, tfidf_lda_kmeans_5gram_summary, survey_summary2)
tfidf_lda_kmeans_5gram_rouge

In [None]:
wemb_lda_kmeans_5gram_rouge = get_ngram_rouge(5, wemb_lda_kmeans_5gram_summary, survey_summary2)
wemb_lda_kmeans_5gram_rouge

## Comparison of N-GRAM and Feature Extraction Schemes Using ROUGE-N Recall, Precision and F-measure Scores

### LSA

In [None]:
# ROUGE score objects of the plain LSA summaries, grouped by feature-extraction
# scheme and ordered from 1-gram to 5-gram.
_lsa_rouge_by_scheme = (
    ("BOW", (bow_lsa_1gram_rouge, bow_lsa_2gram_rouge, bow_lsa_3gram_rouge, bow_lsa_4gram_rouge, bow_lsa_5gram_rouge)),
    ("TF-IDF", (tfidf_lsa_1gram_rouge, tfidf_lsa_2gram_rouge, tfidf_lsa_3gram_rouge, tfidf_lsa_4gram_rouge, tfidf_lsa_5gram_rouge)),
    ("W-Emb", (wemb_lsa_1gram_rouge, wemb_lsa_2gram_rouge, wemb_lsa_3gram_rouge, wemb_lsa_4gram_rouge, wemb_lsa_5gram_rouge)),
)

# One row per n-gram size. Columns are grouped by metric (recall, precision,
# F-measure) and, inside each metric, ordered BOW, TF-IDF, W-Emb.
lsa_comparision_df = pd.DataFrame({
    "N-Gram Range": ["1-Gram", "2-Gram", "3-Gram", "4-Gram", "5-Gram"],
    **{
        f"{scheme}_{metric.capitalize()}": [getattr(score, metric) for score in scores]
        for metric in ("recall", "precision", "fmeasure")
        for scheme, scores in _lsa_rouge_by_scheme
    },
})

lsa_comparision_df

#### Precision Bar Plot

In [None]:
# Grouped bar chart of ROUGE-N precision for the LSA summaries: one group per
# n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, lsa_comparision_df["BOW_Precision"], color='w', width=barWidth, edgecolor='grey', label='BOW_Precision', hatch="---")
ax.bar(nGramRangeB, lsa_comparision_df["TF-IDF_Precision"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Precision')
ax.bar(nGramRangeC, lsa_comparision_df["W-Emb_Precision"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Precision', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Precision Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Precision scores for Summaries Generated Using LSA", fontweight='bold', fontsize=17)
plt.show()

#### Recall Bar Plot

In [None]:
# Grouped bar chart of ROUGE-N recall for the LSA summaries: one group per
# n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, lsa_comparision_df["BOW_Recall"], color='w', width=barWidth, edgecolor='grey', label='BOW_Recall', hatch="---")
ax.bar(nGramRangeB, lsa_comparision_df["TF-IDF_Recall"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Recall')
ax.bar(nGramRangeC, lsa_comparision_df["W-Emb_Recall"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Recall', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Recall Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Recall scores for Summaries Generated Using LSA", fontweight='bold', fontsize=17)
plt.show()


#### F-Measure Plot

In [None]:
# Grouped bar chart of ROUGE-N F-measure for the LSA summaries: one group per
# n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, lsa_comparision_df["BOW_Fmeasure"], color='w', width=barWidth, edgecolor='grey', label='BOW_Fmeasure', hatch="---")
ax.bar(nGramRangeB, lsa_comparision_df["TF-IDF_Fmeasure"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Fmeasure')
ax.bar(nGramRangeC, lsa_comparision_df["W-Emb_Fmeasure"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Fmeasure', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Fmeasure Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Fmeasure scores for Summaries Generated Using LSA", fontweight='bold', fontsize=17)
plt.show()

### LDA

In [None]:
# ROUGE score objects of the plain LDA summaries, grouped by feature-extraction
# scheme and ordered from 1-gram to 5-gram. Each object is listed exactly once
# and reused for all three metrics; the previous hand-written lists had the LSA
# score (wemb_lsa_1gram_rouge) in the 1-gram slot of the LDA "W-Emb_Recall" column.
_lda_rouge_by_scheme = (
    ("BOW", (bow_lda_1gram_rouge, bow_lda_2gram_rouge, bow_lda_3gram_rouge, bow_lda_4gram_rouge, bow_lda_5gram_rouge)),
    ("TF-IDF", (tfidf_lda_1gram_rouge, tfidf_lda_2gram_rouge, tfidf_lda_3gram_rouge, tfidf_lda_4gram_rouge, tfidf_lda_5gram_rouge)),
    ("W-Emb", (wemb_lda_1gram_rouge, wemb_lda_2gram_rouge, wemb_lda_3gram_rouge, wemb_lda_4gram_rouge, wemb_lda_5gram_rouge)),
)

# One row per n-gram size. Columns are grouped by metric (recall, precision,
# F-measure) and, inside each metric, ordered BOW, TF-IDF, W-Emb.
lda_comparision_df = pd.DataFrame({
    "N-Gram Range": ["1-Gram", "2-Gram", "3-Gram", "4-Gram", "5-Gram"],
    **{
        f"{scheme}_{metric.capitalize()}": [getattr(score, metric) for score in scores]
        for metric in ("recall", "precision", "fmeasure")
        for scheme, scores in _lda_rouge_by_scheme
    },
})

lda_comparision_df

#### Precision Bar Plot

In [None]:
# Grouped bar chart of ROUGE-N precision for the LDA summaries: one group per
# n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, lda_comparision_df["BOW_Precision"], color='w', width=barWidth, edgecolor='grey', label='BOW_Precision', hatch="---")
ax.bar(nGramRangeB, lda_comparision_df["TF-IDF_Precision"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Precision')
ax.bar(nGramRangeC, lda_comparision_df["W-Emb_Precision"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Precision', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Precision Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Precision scores for Summaries Generated Using LDA", fontweight='bold', fontsize=17)
plt.show()

#### Recall Bar Plot

In [None]:
# Grouped bar chart of ROUGE-N recall for the LDA summaries: one group per
# n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, lda_comparision_df["BOW_Recall"], color='w', width=barWidth, edgecolor='grey', label='BOW_Recall', hatch="---")
ax.bar(nGramRangeB, lda_comparision_df["TF-IDF_Recall"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Recall')
ax.bar(nGramRangeC, lda_comparision_df["W-Emb_Recall"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Recall', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Recall Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Recall scores for Summaries Generated Using LDA", fontweight='bold', fontsize=17)
plt.show()


#### F-Measure Plot

In [None]:
# Grouped bar chart of ROUGE-N F-measure for the LDA summaries: one group per
# n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, lda_comparision_df["BOW_Fmeasure"], color='w', width=barWidth, edgecolor='grey', label='BOW_Fmeasure', hatch="---")
ax.bar(nGramRangeB, lda_comparision_df["TF-IDF_Fmeasure"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Fmeasure')
ax.bar(nGramRangeC, lda_comparision_df["W-Emb_Fmeasure"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Fmeasure', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Fmeasure Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N F-measure scores for Summaries Generated Using LDA", fontweight='bold', fontsize=17)

plt.show()

### K-Means

#### LSA

In [None]:
# ROUGE score objects of the K-means + LSA summaries, grouped by
# feature-extraction scheme and ordered from 1-gram to 5-gram.
_klsa_rouge_by_scheme = (
    ("BOW", (
        bow_lsa_kmeans_1gram_rouge,
        bow_lsa_kmeans_2gram_rouge,
        bow_lsa_kmeans_3gram_rouge,
        bow_lsa_kmeans_4gram_rouge,
        bow_lsa_kmeans_5gram_rouge,
    )),
    ("TF-IDF", (
        tfidf_lsa_kmeans_1gram_rouge,
        tfidf_lsa_kmeans_2gram_rouge,
        tfidf_lsa_kmeans_3gram_rouge,
        tfidf_lsa_kmeans_4gram_rouge,
        tfidf_lsa_kmeans_5gram_rouge,
    )),
    ("W-Emb", (
        wemb_lsa_kmeans_1gram_rouge,
        wemb_lsa_kmeans_2gram_rouge,
        wemb_lsa_kmeans_3gram_rouge,
        wemb_lsa_kmeans_4gram_rouge,
        wemb_lsa_kmeans_5gram_rouge,
    )),
)

# One row per n-gram size. Columns are grouped by metric (recall, precision,
# F-measure) and, inside each metric, ordered BOW, TF-IDF, W-Emb.
klsa_comparision_df = pd.DataFrame({
    "N-Gram Range": ["1-Gram", "2-Gram", "3-Gram", "4-Gram", "5-Gram"],
    **{
        f"{scheme}_{metric.capitalize()}": [getattr(score, metric) for score in scores]
        for metric in ("recall", "precision", "fmeasure")
        for scheme, scores in _klsa_rouge_by_scheme
    },
})

klsa_comparision_df

##### Precision Bar Plot

In [None]:
# Grouped bar chart of ROUGE-N precision for the K-means + LSA summaries: one
# group per n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, klsa_comparision_df["BOW_Precision"], color='w', width=barWidth, edgecolor='grey', label='BOW_Precision', hatch="---")
ax.bar(nGramRangeB, klsa_comparision_df["TF-IDF_Precision"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Precision')
ax.bar(nGramRangeC, klsa_comparision_df["W-Emb_Precision"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Precision', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Precision Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Precision scores for Summaries Generated Using K-means & LSA", fontweight='bold', fontsize=15)
plt.show()

##### Recall Bar Plot

In [None]:

# Grouped bar chart of ROUGE-N recall for the K-means + LSA summaries: one
# group per n-gram size, one bar per feature-extraction scheme.
# plt.subplots returns a (Figure, Axes) pair, so unpack it and draw through the
# Axes rather than through pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the left, middle and right bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [x + barWidth for x in nGramRangeA]
nGramRangeC = [x + barWidth for x in nGramRangeB]

#make plots
ax.bar(nGramRangeA, klsa_comparision_df["BOW_Recall"], color='w', width=barWidth, edgecolor='grey', label='BOW_Recall', hatch="---")
ax.bar(nGramRangeB, klsa_comparision_df["TF-IDF_Recall"], color='k', width=barWidth, edgecolor='grey', label='TF-IDF_Recall')
ax.bar(nGramRangeC, klsa_comparision_df["W-Emb_Recall"], color='w', width=barWidth, edgecolor='grey', label='W-Emb_Recall', hatch="///")
ax.legend(loc="best")
# Tick labels sit under the middle bar of each group (1 + r + barWidth).
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Recall Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Recall scores for Summaries Generated Using K-means & LSA", fontweight='bold', fontsize=15)
plt.show()


##### F-Measure Plot

In [None]:
# Grouped bar chart of the ROUGE-N F-measure columns of klsa_comparision_df:
# one group per n-gram order (1..5), one bar per column (BOW, TF-IDF, W-Emb).
# plt.subplots returns a (Figure, Axes) pair, so unpack it rather than binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the first, second and third bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [pos + barWidth for pos in nGramRangeA]
nGramRangeC = [pos + barWidth for pos in nGramRangeB]

# Series are told apart by fill: white with '---' hatch, solid black,
# white with '///' hatch.
ax.bar(nGramRangeA, klsa_comparision_df["BOW_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label='BOW_Fmeasure', hatch="---")
ax.bar(nGramRangeB, klsa_comparision_df["TF-IDF_Fmeasure"], color='k', width=barWidth,
       edgecolor='grey', label='TF-IDF_Fmeasure')
ax.bar(nGramRangeC, klsa_comparision_df["W-Emb_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label='W-Emb_Fmeasure', hatch="///")
ax.legend(loc="best")

# nGramRangeB is the middle bar of each three-bar group, so ticks placed there
# sit under the centre of the group.
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Fmeasure Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Fmeasure scores for Summaries Generated Using K-means & LSA",
             fontweight='bold', fontsize=15)
plt.show()

#### LDA

In [None]:
# ROUGE score objects for the *_lda_kmeans_* runs, grouped by the column prefix
# they are reported under and ordered from 1-gram to 5-gram.
_klda_rouge_by_prefix = {
    "BOW": (
        bow_lda_kmeans_1gram_rouge,
        bow_lda_kmeans_2gram_rouge,
        bow_lda_kmeans_3gram_rouge,
        bow_lda_kmeans_4gram_rouge,
        bow_lda_kmeans_5gram_rouge,
    ),
    "TF-IDF": (
        tfidf_lda_kmeans_1gram_rouge,
        tfidf_lda_kmeans_2gram_rouge,
        tfidf_lda_kmeans_3gram_rouge,
        tfidf_lda_kmeans_4gram_rouge,
        tfidf_lda_kmeans_5gram_rouge,
    ),
    "W-Emb": (
        wemb_lda_kmeans_1gram_rouge,
        wemb_lda_kmeans_2gram_rouge,
        wemb_lda_kmeans_3gram_rouge,
        wemb_lda_kmeans_4gram_rouge,
        wemb_lda_kmeans_5gram_rouge,
    ),
}

# One row per n-gram order. After the label column, the columns are emitted
# metric by metric (Recall, Precision, Fmeasure) and, within each metric,
# prefix by prefix (BOW, TF-IDF, W-Emb), e.g. "BOW_Recall", "TF-IDF_Recall".
klda_comparision_df = pd.DataFrame({
    "N-Gram Range": ["1-Gram", "2-Gram", "3-Gram", "4-Gram", "5-Gram"],
    **{
        f"{prefix}_{suffix}": [getattr(score, field) for score in scores]
        for suffix, field in (
            ("Recall", "recall"),
            ("Precision", "precision"),
            ("Fmeasure", "fmeasure"),
        )
        for prefix, scores in _klda_rouge_by_prefix.items()
    },
})

klda_comparision_df

##### Precision Bar Plot

In [None]:
# Grouped bar chart of the ROUGE-N precision columns of klda_comparision_df:
# one group per n-gram order (1..5), one bar per column (BOW, TF-IDF, W-Emb).
# plt.subplots returns a (Figure, Axes) pair, so unpack it rather than binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the first, second and third bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [pos + barWidth for pos in nGramRangeA]
nGramRangeC = [pos + barWidth for pos in nGramRangeB]

# Series are told apart by fill: white with '---' hatch, solid black,
# white with '///' hatch.
ax.bar(nGramRangeA, klda_comparision_df["BOW_Precision"], color='w', width=barWidth,
       edgecolor='grey', label='BOW_Precision', hatch="---")
ax.bar(nGramRangeB, klda_comparision_df["TF-IDF_Precision"], color='k', width=barWidth,
       edgecolor='grey', label='TF-IDF_Precision')
ax.bar(nGramRangeC, klda_comparision_df["W-Emb_Precision"], color='w', width=barWidth,
       edgecolor='grey', label='W-Emb_Precision', hatch="///")
ax.legend(loc="best")

# nGramRangeB is the middle bar of each three-bar group, so ticks placed there
# sit under the centre of the group.
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N Precision Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Precision scores for Summaries Generated Using K-means & LDA",
             fontweight='bold', fontsize=15)
plt.show()

##### Recall Bar Plot

In [None]:
# Grouped bar chart of the ROUGE-N recall columns of klda_comparision_df:
# one group per n-gram order (1..5), one bar per column (BOW, TF-IDF, W-Emb).
# plt.subplots returns a (Figure, Axes) pair, so unpack it rather than binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the first, second and third bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [pos + barWidth for pos in nGramRangeA]
nGramRangeC = [pos + barWidth for pos in nGramRangeB]

# Series are told apart by fill: white with '---' hatch, solid black,
# white with '///' hatch.
ax.bar(nGramRangeA, klda_comparision_df["BOW_Recall"], color='w', width=barWidth,
       edgecolor='grey', label='BOW_Recall', hatch="---")
ax.bar(nGramRangeB, klda_comparision_df["TF-IDF_Recall"], color='k', width=barWidth,
       edgecolor='grey', label='TF-IDF_Recall')
ax.bar(nGramRangeC, klda_comparision_df["W-Emb_Recall"], color='w', width=barWidth,
       edgecolor='grey', label='W-Emb_Recall', hatch="///")
ax.legend(loc="best")

# nGramRangeB is the middle bar of each three-bar group, so ticks placed there
# sit under the centre of the group.
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
# The plotted columns are recall values, so the y-axis is labelled as recall
# (it previously carried the label copied from the precision plot).
ax.set_ylabel('ROUGE-N Recall Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N Recall scores for Summaries Generated Using K-means & LDA",
             fontweight='bold', fontsize=15)
plt.show()

##### F-Measure Plot

In [None]:
# Grouped bar chart of the ROUGE-N F-measure columns of klda_comparision_df:
# one group per n-gram order (1..5), one bar per column (BOW, TF-IDF, W-Emb).
# plt.subplots returns a (Figure, Axes) pair, so unpack it rather than binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.25

# x positions of the first, second and third bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [pos + barWidth for pos in nGramRangeA]
nGramRangeC = [pos + barWidth for pos in nGramRangeB]

# Series are told apart by fill: white with '---' hatch, solid black,
# white with '///' hatch.
ax.bar(nGramRangeA, klda_comparision_df["BOW_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label='BOW_Fmeasure', hatch="---")
ax.bar(nGramRangeB, klda_comparision_df["TF-IDF_Fmeasure"], color='k', width=barWidth,
       edgecolor='grey', label='TF-IDF_Fmeasure')
ax.bar(nGramRangeC, klda_comparision_df["W-Emb_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label='W-Emb_Fmeasure', hatch="///")
ax.legend(loc="best")

# nGramRangeB is the middle bar of each three-bar group, so ticks placed there
# sit under the centre of the group.
ax.set_xticks(nGramRangeB)
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
# The plotted columns are F-measure values, so the y-axis is labelled as
# F-measure (it previously carried the label copied from the recall plot).
ax.set_ylabel('ROUGE-N F-measure Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of ROUGE-N F-measure scores for Summaries Generated Using K-means & LDA",
             fontweight='bold', fontsize=15)
plt.show()


## Overall (F-Measure)

In [None]:
# F-measure table used for the overall comparison: one column per score family,
# one row per n-gram order (1-gram first, 5-gram last).
fm_comparision_df = pd.DataFrame({
    "LSA_TF-IDF_Fmeasure": [
        score.fmeasure
        for score in (
            tfidf_lsa_1gram_rouge,
            tfidf_lsa_2gram_rouge,
            tfidf_lsa_3gram_rouge,
            tfidf_lsa_4gram_rouge,
            tfidf_lsa_5gram_rouge,
        )
    ],
    "LDA_BOW_Fmeasure": [
        score.fmeasure
        for score in (
            bow_lda_1gram_rouge,
            bow_lda_2gram_rouge,
            bow_lda_3gram_rouge,
            bow_lda_4gram_rouge,
            bow_lda_5gram_rouge,
        )
    ],
    "LSA_KM_TF-IDF_Fmeasure": [
        score.fmeasure
        for score in (
            tfidf_lsa_kmeans_1gram_rouge,
            tfidf_lsa_kmeans_2gram_rouge,
            tfidf_lsa_kmeans_3gram_rouge,
            tfidf_lsa_kmeans_4gram_rouge,
            tfidf_lsa_kmeans_5gram_rouge,
        )
    ],
    "LDA_KM_BOW_Fmeasure": [
        score.fmeasure
        for score in (
            bow_lda_kmeans_1gram_rouge,
            bow_lda_kmeans_2gram_rouge,
            bow_lda_kmeans_3gram_rouge,
            bow_lda_kmeans_4gram_rouge,
            bow_lda_kmeans_5gram_rouge,
        )
    ],
})
fm_comparision_df


In [None]:
# Grouped bar chart of the four F-measure columns of fm_comparision_df:
# one group per n-gram order (1..5), four bars per group.
# plt.subplots returns a (Figure, Axes) pair, so unpack it rather than binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(15, 10))
barWidth = 0.2

# x positions of the first to fourth bar of every group.
nGramRangeA = list(range(1, 6))
nGramRangeB = [pos + barWidth for pos in nGramRangeA]
nGramRangeC = [pos + barWidth for pos in nGramRangeB]
nGramRangeD = [pos + barWidth for pos in nGramRangeC]

# Series are told apart by fill: white with '+++', solid black,
# white with 'xxx', white with '///'.
ax.bar(nGramRangeA, fm_comparision_df["LSA_TF-IDF_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label="LSA_TF-IDF", hatch="+++")
ax.bar(nGramRangeB, fm_comparision_df["LDA_BOW_Fmeasure"], color='k', width=barWidth,
       edgecolor='grey', label="LDA_BOW")
ax.bar(nGramRangeC, fm_comparision_df["LSA_KM_TF-IDF_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label="LSA_KM_TF-IDF", hatch="xxx")
ax.bar(nGramRangeD, fm_comparision_df["LDA_KM_BOW_Fmeasure"], color='w', width=barWidth,
       edgecolor='grey', label="LDA_KM_BOW", hatch="///")
ax.legend()

# With four bars per group the bar centres span [pos, pos + 3 * barWidth], so
# the group centre is pos + 1.5 * barWidth. The previous offset of one barWidth
# (carried over from the three-bar charts) put each label under the second bar.
ax.set_xticks([pos + 1.5 * barWidth for pos in nGramRangeA])
ax.set_xticklabels(['1-Gram', '2-Gram', '3-Gram', '4-Gram', '5-Gram'])
ax.set_xlabel('N-Gram Co-Occurence', fontweight='bold', fontsize=15)
ax.set_ylabel('ROUGE-N F-measure Score', fontweight='bold', fontsize=15)
ax.set_title("Comparision of Higest ROUGE-N F-measure scores for Summaries Generated Using Different Techniques",
             fontweight='bold', fontsize=15)
plt.show()


#### Timing Plots

In [None]:
# Timing table: one column per model variant, one row per n-gram order
# (1-gram first, 5-gram last), filled from the corresponding *_tt variables.
# NOTE(review): despite the name, nothing is averaged in this cell; it only
# collects the per-run *_tt values.
averageTimingDf = pd.DataFrame({
    # BOW
    "BOW_LSA": [
        bow_lsa_1gram_tt, bow_lsa_2gram_tt, bow_lsa_3gram_tt,
        bow_lsa_4gram_tt, bow_lsa_5gram_tt,
    ],
    "BOW_LDA": [
        bow_lda_1gram_tt, bow_lda_2gram_tt, bow_lda_3gram_tt,
        bow_lda_4gram_tt, bow_lda_5gram_tt,
    ],
    # BOW & K-means
    "BOW_KLSA": [
        bow_lsa_kmeans_1gram_tt, bow_lsa_kmeans_2gram_tt, bow_lsa_kmeans_3gram_tt,
        bow_lsa_kmeans_4gram_tt, bow_lsa_kmeans_5gram_tt,
    ],
    "BOW_KLDA": [
        bow_lda_kmeans_1gram_tt, bow_lda_kmeans_2gram_tt, bow_lda_kmeans_3gram_tt,
        bow_lda_kmeans_4gram_tt, bow_lda_kmeans_5gram_tt,
    ],
    # TF-IDF
    "TF_IDF_LSA": [
        tfidf_lsa_1gram_tt, tfidf_lsa_2gram_tt, tfidf_lsa_3gram_tt,
        tfidf_lsa_4gram_tt, tfidf_lsa_5gram_tt,
    ],
    "TF_IDF_LDA": [
        tfidf_lda_1gram_tt, tfidf_lda_2gram_tt, tfidf_lda_3gram_tt,
        tfidf_lda_4gram_tt, tfidf_lda_5gram_tt,
    ],
    # TF-IDF & K-means
    "TF_IDF_KLSA": [
        tfidf_lsa_kmeans_1gram_tt, tfidf_lsa_kmeans_2gram_tt, tfidf_lsa_kmeans_3gram_tt,
        tfidf_lsa_kmeans_4gram_tt, tfidf_lsa_kmeans_5gram_tt,
    ],
    "TF_IDF_KLDA": [
        tfidf_lda_kmeans_1gram_tt, tfidf_lda_kmeans_2gram_tt, tfidf_lda_kmeans_3gram_tt,
        tfidf_lda_kmeans_4gram_tt, tfidf_lda_kmeans_5gram_tt,
    ],
    # W-Emb
    "WEMB_LSA": [
        wemb_lsa_1gram_tt, wemb_lsa_2gram_tt, wemb_lsa_3gram_tt,
        wemb_lsa_4gram_tt, wemb_lsa_5gram_tt,
    ],
    "WEMB_LDA": [
        wemb_lda_1gram_tt, wemb_lda_2gram_tt, wemb_lda_3gram_tt,
        wemb_lda_4gram_tt, wemb_lda_5gram_tt,
    ],
    # W-Emb & K-means
    "WEMB_KLSA": [
        wemb_lsa_kmeans_1gram_tt, wemb_lsa_kmeans_2gram_tt, wemb_lsa_kmeans_3gram_tt,
        wemb_lsa_kmeans_4gram_tt, wemb_lsa_kmeans_5gram_tt,
    ],
    "WEMB_KLDA": [
        wemb_lda_kmeans_1gram_tt, wemb_lda_kmeans_2gram_tt, wemb_lda_kmeans_3gram_tt,
        wemb_lda_kmeans_4gram_tt, wemb_lda_kmeans_5gram_tt,
    ],
})

averageTimingDf

In [None]:
# Summary table with one row per model variant: a display label and the median
# of that variant's column in averageTimingDf (i.e. the median over the five
# n-gram runs). Each pair is (display label, averageTimingDf column name).
_timing_label_to_column = [
    ("BOW_LSA", "BOW_LSA"),
    ("BOW_LDA", "BOW_LDA"),

    ("BOW_LSA_K-means", "BOW_KLSA"),
    ("BOW_LDA_K-means", "BOW_KLDA"),

    ("TF_IDF_LSA", "TF_IDF_LSA"),
    ("TF_IDF_LDA", "TF_IDF_LDA"),

    ("TF_IDF_LSA_K-means", "TF_IDF_KLSA"),
    ("TF_IDF_LDA_K-means", "TF_IDF_KLDA"),

    ("WEMB_LSA", "WEMB_LSA"),
    ("WEMB_LDA", "WEMB_LDA"),

    ("WEMB_LSA_K-means", "WEMB_KLSA"),
    ("WEMB_LDA_K-means", "WEMB_KLDA"),
]

timingsDf = pd.DataFrame({
    "Labels": [label for label, _ in _timing_label_to_column],
    # Median (not mean) of each timing column.
    "Median": [
        averageTimingDf[column].median()
        for _, column in _timing_label_to_column
    ],
})

timingsDf

In [None]:
# Horizontal bar chart of timingsDf: one bar per entry in "Labels", with the
# bar length taken from "Median".
# NOTE(review): the x-axis label states milliseconds; confirm the *_tt timings
# feeding this table are actually recorded in ms.
fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(timingsDf["Labels"], timingsDf["Median"], color="w", hatch="///")
ax.set_xlabel("Time Taken (ms)", fontweight='bold', fontsize=15)
ax.set_ylabel("Model Type", fontweight='bold', fontsize=15)
ax.set_title("End to End Execution Time of All Models", fontweight='bold', fontsize=20)
plt.show()