<a href="https://colab.research.google.com/github/sayarghoshroy/Summarization/blob/master/summarization_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pickle
import spacy
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neural_network import MLPRegressor as mlp

import warnings
warnings.filterwarnings('ignore')

In [95]:
# Mount Google Drive at /content/drive; the dataset path used below lives
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# change to path to dataset
file_name = "/content/drive/My Drive/Summarization_Pickled_Data/cnn_dataset_1000_labelled.pkl"

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open for the lifetime of the kernel.
with open(file_name, 'rb') as dataset_file:
    stories = pickle.load(dataset_file)

In [97]:
# Display the first datapoint to verify that the load succeeded.
# Later cells read the 'story', 'story_text' and 'scores' keys of each datapoint.
print(stories[0])

{'story': ["potomac, maryland (cnn) -- to combat the depression and despair during her 105-day stint in iran's notorious evin prison, haleh esfandiari welcomed all distractions and blocked thoughts of her beloved home and family.", 'haleh esfandiari talks to iranian media in front of evin prison after her august 21 release.', 'the iranian-american scholar, who was charged with espionage and endangering iranian national security during a december visit to her family, wrote a book in her mind, read newspapers, watched television and exercised voraciously.', '"i decided either i am going to succumb to despair or i am going to try to make the best of this condition, and the best of this condition was to have a disciplined day," she said. "so i would exercise for many hours, i would read, i would walk a lot, some three to four hours a day -- even in the room, you know, i would pace up and down timing myself."', 'the 67-year-old grandmother of two said dwelling on her incarceration, and long

In [98]:
# Required spaCy models (source of the glove word vectors used below).
# In case of errors with conda, install spaCy with:
# conda install -c conda-forge spacy
# (this is what worked for the original author)

!python -m spacy download en
!python -m spacy download en_core_web_lg
!python -m spacy link en_core_web_lg en --force

# The forced link above makes the large model the one that the 'en' shortcut
# resolves to, i.e. the default model for English textual data.

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:
# Initializing the processor
# Load the large English model by its package name rather than through the
# 'en' shortcut link: the shortcut only points at the large model if the
# forced `spacy link` step ran last, and shortcut links are not available
# in spaCy v3+. The package name works in both cases.
embedder = spacy.load('en_core_web_lg')

In [0]:
# basic embeddings using averaged glove vectors
# using Spacy's large language model
def get_embedding(text):
    """Return the average of the token vectors of ``text``.

    The text is run through the module-level ``embedder`` pipeline and the
    vectors of all resulting tokens are averaged.

    Parameters
    ----------
    text : str
        Raw text of a sentence or of a whole document.

    Returns
    -------
    numpy.ndarray
        Float array of length 300. If ``text`` yields no tokens (for example
        an empty string) the zero vector is returned instead of the NaN
        vector that a 0/0 division would produce.

    Notes
    -----
    The accumulator is fixed at 300 entries, so the loaded model must
    produce 300-dimensional token vectors; any other width fails on the
    in-place addition.
    """
    tokens = embedder(text)
    total_sum = np.zeros(300)
    for token in tokens:
        total_sum += np.asarray(token.vector)

    count = len(tokens)
    if count == 0:
        # Nothing to average: keep the features finite so that downstream
        # model fitting / prediction does not receive NaNs.
        return total_sum
    return total_sum / count

In [0]:
# creating the inputs and expected outputs
# Each sample is the sentence embedding concatenated with the embedding of
# the document it belongs to; its target is the matching entry of
# data['scores'].
X_train = []
y_train = []

# Only the first 101 stories are used for training.
train_doc_count = 101

for data in stories[:train_doc_count]:
    doc_emb = get_embedding(data['story_text'])
    # use the function of choice to generate the document embedding

    for index, sentence in enumerate(data['story']):
        sent_emb = get_embedding(sentence)
        # use the function of choice to generate the sentence embedding

        X_train.append(np.concatenate((sent_emb, doc_emb)))
        # indexing (rather than zip) keeps a missing score a loud IndexError
        y_train.append(data['scores'][index])

# Plain ndarrays instead of np.matrix: the matrix class is no longer
# recommended by NumPy and newer scikit-learn releases reject it as input.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [0]:
def train(X, y, hidden_layer_sizes=(1024, 2048, 1024, 512, 256), max_iter=100,
          random_state=None):
    """Fit a multi-layer perceptron regressor on ``X`` / ``y`` and return it.

    Parameters
    ----------
    X : array-like
        Feature rows, one per sample.
    y : array-like
        Regression targets, one per sample.
    hidden_layer_sizes : tuple of int, optional
        Width of each hidden layer. Defaults to the architecture this
        notebook has always used.
    max_iter : int, optional
        Upper bound on training iterations (default 100).
    random_state : int or None, optional
        Seed forwarded to the regressor. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    The fitted regressor.
    """
    model = mlp(hidden_layer_sizes=hidden_layer_sizes,
                max_iter=max_iter,
                random_state=random_state)
    model.fit(X, y)
    return model

def get_values(X, model):
    """Return the predictions of ``model`` for the feature rows in ``X``."""
    predictions = model.predict(X)
    return predictions

In [0]:
# Targets are multiplied by 1000 before fitting, so the model's predictions
# come out on that scaled range rather than on the raw score range.
m = train(X_train, 1000 * y_train)

In [0]:
# Hyperparameter for similarity threshold:
# get_top_5 skips a sentence whose similarity to an already selected
# sentence is greater than theta.
theta = 0.95

def similarity(A, B):
    """Cosine similarity between two vectors, returned as a plain float.

    Parameters
    ----------
    A, B : array-like
        Single vectors. 1-D arrays, 1xN arrays and 1xN ``np.matrix`` rows
        are all accepted; each input is flattened before use.

    Returns
    -------
    float
        ``A . B / (|A| * |B|)``. When either vector has zero norm the
        cosine is undefined and 0.0 is returned instead of NaN.
    """
    a = np.asarray(A, dtype=float).ravel()
    b = np.asarray(B, dtype=float).ravel()

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(a @ b / denominator)

def get_top_5(X_doc, y, top_k=5):
    """Greedily pick the highest-scoring, mutually dissimilar sentences.

    Sentences are visited in descending order of ``y``. A sentence is kept
    unless its ``similarity`` to any already kept sentence is greater than
    the module-level threshold ``theta``.

    Parameters
    ----------
    X_doc : 2-D array-like
        One feature row per sentence; row ``i`` belongs to sentence ``i``.
    y : 1-D array-like
        Score of each sentence (higher is better).
    top_k : int, optional
        Maximum number of sentences to return (default 5).

    Returns
    -------
    list of int
        Indices of the selected sentences, best first. Fewer than ``top_k``
        entries are returned when not enough dissimilar sentences exist.
    """
    # NOTE(review): the evaluation cell passes rows that are the sentence
    # embedding concatenated with the document embedding, so every pair of
    # rows shares the document half -- confirm that comparing full rows
    # (rather than only the sentence half) is intended.
    order = np.flip(np.argsort(y))
    sentence_set = []
    for sent_id in order:
        if len(sentence_set) >= top_k:
            # Later candidates can never enter the result; stop scanning.
            break

        candidate = X_doc[sent_id, :]
        is_redundant = any(
            similarity(X_doc[chosen_id, :], candidate) > theta
            for chosen_id in sentence_set
        )
        if not is_redundant:
            # Store plain ints so the result prints cleanly on any NumPy version.
            sentence_set.append(int(sent_id))
    return sentence_set

In [116]:
# evaluation
# testing out each document iteratively
# test set: document 950 onwards

first_test_doc = 950
doc_count = len(stories)

# index of the first document that is NOT tested (exclusive upper bound)
limit = 960

# The model was fit on gold scores multiplied by 1000, so gold scores must be
# put on the same scale before they are compared with predictions.
score_scale = 1000

for doc_id in range(first_test_doc, min(doc_count, limit)):
    X_doc = []
    y_doc = []
    data = stories[doc_id]
    doc_emb = get_embedding(data['story_text'])

    for index, sentence in enumerate(data['story']):
        sent_emb = get_embedding(sentence)

        X_doc.append(np.concatenate((sent_emb, doc_emb)))
        y_doc.append(data['scores'][index])

    # Plain ndarrays instead of np.matrix: newer scikit-learn releases reject
    # np.matrix input, and row indexing then yields ordinary 1-D vectors.
    X_doc = np.asarray(X_doc)
    y_doc = np.asarray(y_doc)

    sentence_predicted_scores = get_values(X_doc, m)

    # L2 distance between predictions and gold scores on the training scale
    loss = np.linalg.norm(sentence_predicted_scores - score_scale * y_doc)

    # Uncomment to view the test_loss on the sample  
    # print(loss)

    print("Document ID:", doc_id, ", Top 5 Sentences:", get_top_5(X_doc, sentence_predicted_scores))

    # Top 10 sentences based on Gold Labels, for comparison with the prediction
    print("Top 10 sentences based on Gold Label", np.flip(np.argsort(y_doc))[0:10])

Document ID: 950 , Top 5 Sentences: [0, 1, 18, 16, 10]
Top 10 sentences based on Gold Label [ 5 23 20  1  8  9 18 25 10  2]
Document ID: 951 , Top 5 Sentences: [1, 12, 9, 7, 11]
Top 10 sentences based on Gold Label [ 0  1 22 19 15 10  4 16 14  6]
Document ID: 952 , Top 5 Sentences: [4, 15, 5]
Top 10 sentences based on Gold Label [ 0 15 14 13 12 11 10  9  8  7]
Document ID: 953 , Top 5 Sentences: [3, 9, 16, 19, 7]
Top 10 sentences based on Gold Label [ 1 10 30 25 15  7 12 16 14  4]
Document ID: 954 , Top 5 Sentences: [1, 0, 5, 2, 4]
Top 10 sentences based on Gold Label [ 0 10  7  1  2  4 11  9  8  6]
Document ID: 955 , Top 5 Sentences: [0, 4, 11, 5, 14]
Top 10 sentences based on Gold Label [ 1 13  0  5 12  3 14 11 10  9]
Document ID: 956 , Top 5 Sentences: [4, 3, 1, 6, 2]
Top 10 sentences based on Gold Label [1 0 6 4 3 5 7 2]
Document ID: 957 , Top 5 Sentences: [24, 15, 10, 7, 2]
Top 10 sentences based on Gold Label [ 3 18 26  4 23  2  0 22  6 19]
Document ID: 958 , Top 5 Sentences: [6,

In [0]:
# ^_^ Thank You