In [11]:
import os
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import *

In [12]:
files = os.listdir("data/story_summary/stories_text_summarization_dataset_train/")

In [13]:
df = pd.DataFrame(columns=['content', 'summary'])

In [14]:
contractions = { 
"ain\'t": "am not / are not / is not / has not / have not",
"aren\'t": "are not",
"can\'t": "cannot",
"can\'t\'ve": "cannot have",
"\'cause": "because",
"could\'ve": "could have",
"couldn\'t": "could not",
"couldn\'t\'ve": "could not have",
"didn\'t": "did not",
"doesn\'t": "does not",
"don\'t": "do not",
"hadn\'t": "had not",
"hadn\'t\'ve": "had not have",
"hasn\'t": "has not",
"haven\'t": "have not",
"he\'d": "he had / he would",
"he\'d\'ve": "he would have",
"he\'ll": "he shall / he will",
"he\'ll\'ve": "he shall have / he will have",
"he\'s": "he has / he is",
"how\'d": "how did",
"how\'d\'y": "how do you",
"how\'ll": "how will",
"how\'s": "how has / how is / how does",
"I\'d": "I had / I would",
"I\'d\'ve": "I would have",
"I\'ll": "I shall / I will",
"I\'ll\'ve": "I shall have / I will have",
"I\'m": "I am",
"I\'ve": "I have",
"isn\'t": "is not",
"it\'d": "it had / it would",
"it\'d\'ve": "it would have",
"it\'ll": "it shall / it will",
"it\'ll\'ve": "it shall have / it will have",
"it\'s": "it has / it is",
"let\'s": "let us",
"ma\'am": "madam",
"mayn\'t": "may not",
"might\'ve": "might have",
"mightn\'t": "might not",
"mightn\'t\'ve": "might not have",
"must\'ve": "must have",
"mustn\'t": "must not",
"mustn\'t\'ve": "must not have",
"needn\'t": "need not",
"needn\'t\'ve": "need not have",
"o\'clock": "of the clock",
"oughtn\'t": "ought not",
"oughtn\'t\'ve": "ought not have",
"shan\'t": "shall not",
"sha\'n\'t": "shall not",
"shan\'t\'ve": "shall not have",
"she\'d": "she had / she would",
"she\'d\'ve": "she would have",
"she\'ll": "she shall / she will",
"she\'ll\'ve": "she shall have / she will have",
"she\'s": "she has / she is",
"should\'ve": "should have",
"shouldn\'t": "should not",
"shouldn\'t\'ve": "should not have",
"so\'ve": "so have",
"so\'s": "so as / so is",
"that\'d": "that would / that had",
"that\'d\'ve": "that would have",
"that\'s": "that has / that is",
"there\'d": "there had / there would",
"there\'d\'ve": "there would have",
"there\'s": "there is",
"they\'d": "they had / they would",
"they\'d\'ve": "they would have",
"they\'ll": "they will",
"they\'ll\'ve": "they shall have / they will have",
"they\'re": "they are",
"they\'ve": "they have",
"to\'ve": "to have",
"wasn\'t": "was not",
"we\'d": "we had / we would",
"we\'d\'ve": "we would have",
"we\'ll": "we will",
"we\'ll\'ve": "we will have",
"we\'re": "we are",
"we\'ve": "we have",
"weren\'t": "were not",
"what\'ll": "what shall / what will",
"what\'ll\'ve": "what shall have / what will have",
"what\'re": "what are",
"what\'s": "what has / what is",
"what\'ve": "what have",
"when\'s": "when has / when is",
"when\'ve": "when have",
"where\'d": "where did",
"where\'s": "where has / where is",
"where\'ve": "where have",
"who\'ll": "who shall / who will",
"who\'ll\'ve": "who shall have / who will have",
"who\'s": "who has / who is",
"who\'ve": "who have",
"why\'s": "why has / why is",
"why\'ve": "why have",
"will\'ve": "will have",
"won\'t": "will not",
"won\'t\'ve": "will not have",
"would\'ve": "would have",
"wouldn\'t": "would not",
"wouldn\'t\'ve": "would not have",
"y\'all": "you all",
"y\'all\'d": "you all would",
"y\'all\'d\'ve": "you all would have",
"y\'all\'re": "you all are",
"y\'all\'ve": "you all have",
"you\'d": "you had / you would",
"you\'d\'ve": "you would have",
"you\'ll": "you shall / you will",
"you\'ll\'ve": "you shall have / you will have",
"you\'re": "you are",
"you\'ve": "you have"
}

In [15]:
contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))
def expand_contractions(s, contractions_dict=contractions):
    def replace(match):
        return contractions_dict[match.group(0)]
    return re.sub(r"[^a-zA-Z0-9 ]", "", contractions_re.sub(replace, s))

def correct_text(passage):
    return [expand_contractions(i) for i in sent_tokenize(passage.lower())]

In [16]:
for i,file_name in enumerate(files[:100]):
    summary, content, next = "", "", False
    with open(f'./data/story_summary/stories_text_summarization_dataset_train/{file_name}', 'r', encoding='utf-8') as file:
        for line in file.readlines():
            if len(line.replace("\n", ""))>0:
                if line.find('@highlight')>=0:
                    next=True
                    continue
                if next:
                    summary += line.replace("@highlight", "").replace("\n", "").strip().lower() + ". "
                else:
                    next=False
                    content += line.replace("\n", "").strip().lower()
        df.loc[i] = {'content':correct_text(content), 'summary':summary}

In [18]:
print("Original Text: ", df.content[0])
print("Original Summary: ", df.summary[0])

Original Text:  ['it has  it is official us president barack obama wants lawmakers to weigh in on whether to use military force in syriaobama sent a letter to the heads of the house and senate on saturday night hours after announcing that he believes military action against syrian targets is the right step to take over the alleged use of chemical weaponsthe proposed legislation from obama asks congress to approve the use of military force to deter disrupt prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction', 'it has  it is a step that is set to turn an international crisis into a fierce domestic political battlethere are key questions looming over the debate what did un weapons inspectors find in syria', 'what happens if congress votes no', 'and how will the syrian government reactin a televised address from the white house rose garden earlier saturday the president said he would take his case to congress not because he has to  but

In [19]:
def create_freq_table(text_string):
    stopwords_list = set(stopwords.words('english'))
    
    words = word_tokenize(text_string)
    ps = PorterStemmer()
    freq_table = {}
    
    for word in words:
        #stem word 
        word = ps.stem(word)
        
        #remove stopwords
        if word in stopwords_list: 
            continue
        elif word in freq_table:
            freq_table[word] += 1
        else:
            freq_table[word] = 1
            
    return freq_table

In [56]:
#extract sentences that are important based on the word frequency
def generate_summary(sent_list):
    article_word_freq = create_freq_table(" ".join(sent_list))
    imp_list = {}
    for i, sentence in enumerate(sent_list):
        value = 0
        words = word_tokenize(sentence)
        for word in words:
            if word in article_word_freq.keys():
                value += article_word_freq[word]
        if(len(sentence)>0):
            imp = value/len(sentence)
        else:
            imp = 0
        imp_list[i] = imp
    temp = pd.DataFrame({'sent':sent_list, 'imp':list(imp_list.values())})
    return ". ".join(temp.sort_values(['imp'], ascending=0)[:5].sent)

In [48]:
df['gen_summary'] = [generate_summary(sent_list) for sent_list in df.content]

In [54]:
print(df.iloc[0].summary + "\n")
print(df.iloc[0].gen_summary)

syrian official: obama climbed to the top of the tree, "doesn't know how to get down". obama sends a letter to the heads of the house and senate. obama to seek congressional approval on military action against syria. aim is to determine whether cw were used, not by whom, says u.n. spokesman. 

no explanation was offered for the discrepancyiran us military action in syria would spark disasteropinion why strikes in syria are a bad idea. the syrian armys status is on maximum readiness and fingers are on the trigger to confront all challenges wael nader alhalqi said during a meeting with a delegation of syrian expatriates from italy according to a banner on syria state tv that was broadcast prior to obamas addressan anchor on syrian state television said obama appeared to be preparing for an aggression on syria based on repeated lies. a top syrian diplomat told the state television network that obama was facing pressure to take military action from israel turkey some arabs and rightwing ex