In [20]:
from os import listdir
import os as os
import shutil
import string
import pickle
# Helper libraries
import collections
import hashlib
import nltk
import json 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize.treebank import TreebankWordTokenizer
from rouge import Rouge


In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    #print(filename)
    file = open(filename,'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
def write_story_highlight_diff_files(filename, story, highlights):
    h_filename = filename.replace(".story", ".highlights")
    print(h_filename)
    file = open(filename,'w')
    story = "@story: \n" + story
    file.write(story)
    file.close()
    h_file = open(h_filename, 'w')
    h_file.write("@abstract: \n")
    highlights = list(map(lambda s: "<s>"+s+"<\s>", highlights))
    h_file.write(' '.join(highlights))
    h_file.close()

In [5]:
def write_story_highlight(filename, story, highlights):
    file = open(filename,'a+')
    file.write("@abstract: \n")
    highlights = list(map(lambda s: "<s>"+s+"<\s>", highlights))
    file.write(' '.join(highlights))                 
    story = "\n@story: \n" + story
    file.write(story)
    file.write("\n\n")
    file.close()

In [6]:
# split a document into news story and highlights
def split_story(doc):
    # find first highlight
    index = doc.find('@highlight')
    # split into story and highlights
    story, highlights = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

In [7]:

# load all stories in a directory
def load_stories(directory):
    stories = list()
    rs_dir = "./processed"
    if  os.path.exists(rs_dir):
        shutil.rmtree(rs_dir)
    os.makedirs(rs_dir)
    list_of_files = listdir(directory)
    count1 = -1
        
    for count,name in enumerate(list_of_files):
        if ".story" not in name:
            continue
        urlhash = os.path.splitext(name)
        filename = directory + '/' + name
        # load document
        # print(filename)
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        if(story.isspace()):
            print(story+" is empty!!!! "+filename)
            continue
        if (count%1000 == 0):
            count1 += 1
            trainName = 'train'+ str(count1).zfill(3) 
        else:
            trainName
        fn = rs_dir + '/' + trainName
        #write_story_highlight(fn, story, highlights)
        # store
        stories.append({'hash':urlhash[0], 'story':story, 'highlights':highlights})
    return stories


In [8]:
# clean a list of lines
def clean_lines(lines):
    cleaned = list()
    # prepare a translation table to remove punctuation
    table = str.maketrans('', '', string.punctuation)
    for line in lines:
        # strip source cnn office if it exists
        index = line.find('(CNN) -- ')
        if index > -1:
            line = line[index+len('(CNN)'):]
        index = line.find('CNN')
        if index > -1:
            line = line[index+len('CNN'):]
        # tokenize on white space
        line = line.split()
        # convert to lower case
        line = [word.lower() for word in line]
        # remove punctuation from each token
        line = [w.translate(table) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
    # remove empty strings
    cleaned = [c for c in cleaned if len(c) > 0]
    return cleaned

In [21]:
def combine_vocab(tokenList, corpus=None):
    token_feed = [utils.canonicalize_word(w) for w in tokenList]
    print(len(token_feed))
    if corpus:
        token_feed.extend([utils.canonicalize_word(w) for w in corpus.words()])
        print(len(token_feed))
    return token_feed




In [22]:
NOUNS = ['NN', 'NNS', 'NNP', 'NNPS']
def rank_sentences(sents, doc_matrix, feature_names, sentence_ordering=0, top_n=4):
    sentences = [nltk.word_tokenize(sent) for sent in sents]
    sentences = [[w for w in sent if nltk.pos_tag([w])[0][1] in NOUNS]
                  for sent in sentences]
    tfidf_sent = [[doc_matrix[feature_names.index(w.lower())]
                   for w in sent if w.lower() in feature_names]
                 for sent in sentences]
    #print(len(sents))
    #print(len(tfidf_sent))
    #print(len(sentences))

    # Calculate Sentence Values
    doc_val = sum(doc_matrix)
    sent_values = [sum(sent) / doc_val for sent in tfidf_sent]
    
    #print(len(sent_values))
    if sentence_ordering == 1:
        #print("Coming to ordering 1")
        # Apply Position Weights
        sent_values = [sent*(i/len(sent_values))
                        for i, sent in enumerate(sent_values)]
    elif sentence_ordering == 2:
        #print("Coming to ordering 2")
        sent_values = [sent*((len(sent_values) - i)/len(sent_values))
                        for i, sent in enumerate(sent_values)]
        

    ranked_sents = [pair for pair in zip(range(len(sent_values)), sent_values)]
    ranked_sents = sorted(ranked_sents, key=lambda x: x[1] *-1)

    return ranked_sents[:top_n]




In [23]:
def get_base_line_rouge_score(test_stories_list, test_highlights_list, n=4):
    t = []
    h = []
    r = []
    for i in range(len(test_stories_list)):
        t.append(' '.join(test_stories_list[i][:n]))
        h.append(' '.join(train_highlights_list[i]))
        rouge = Rouge()
        rg_score = rouge.get_scores(t[i],h[i])
        r.append(rg_score)
    
        #print(rouge.get_scores(t[i],h[i]))
        if(i%400==0):
            print(i)
            print(t[i]+'\n\n'+h[i])
            print("\n\ntfidf_score: ",tfidf_s[i])
    return r

def get_rouge_score(test_stories_list, test_highlights_list, count_vect, tfidf, sentence_ordering=0, n=4):
    t = []
    h = []
    tfidf_s = []
    r = []
    len_stories = len(test_stories_list)
    for i in range(len_stories):
        story_freq_term = count_vect.transform(test_stories_list[i])
        story_tfidf_matrix = tfidf.transform(story_freq_term)
        story_dense = story_tfidf_matrix.todense()
        doc_matrix = story_dense.tolist()[0]
        rank = rank_sentences(test_stories_list[i], doc_matrix, count_vect.get_feature_names(), sentence_ordering, n)
        t.append(' '.join([test_stories_list[i][id[0]] for id in rank]))
        tfidf_s.append([id[1] for id in rank])
        h1 = ' '.join(test_highlights_list[i])
        rouge = Rouge()
        rg_score = rouge.get_scores(t[i],h1)
        r.append(rg_score)

        #print(rouge.get_scores(t[i],h[i]))
        if(i%400==0):
            print(i)
            print(t[i]+'\n\n'+h1)
            print("\n\ntfidf_score: ",tfidf_s[i])
    return r, t, tfidf_s


In [24]:
def makeresult_file(r_scores_tmp, filename):
    scores = [[score_type for rouge_type in score[0].values() for score_type in rouge_type.values()] for score in r_scores_tmp]
    df = pd.DataFrame(scores, columns=[key1+'_'+key2 for dict1 in r_scores_tmp[0] for key1 in dict1.keys() for key2 in dict1[key1]])
    df.to_csv(filename + '.csv')
    
    return df

def makeprocessed_file(t, hash_list, highlight_list):
    for i,story in enumerate(t):
        with open('cnn/stories/processed/'+test_hash_list[i]+'.story', 'w') as pfile:
            pfile.write(string(story))
            pfile.write("\n")
            for highlight in test_highlight_list[i]:
                pfile.write('@highlight\n')
                pfile.write(highlight)
                pfile.write("\n")

In [30]:
def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return constants.UNK_TOKEN

def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]

#
# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

Initially we need to load up the data, the data can be found at CNN data link. We untar it in the same directory : tar xvf cnn_stories.tgz on the command line. After we load up the data, we separate out to the story and highlight portion and store it as a list of dictionaries.

In [9]:
os.getcwd()

'/Users/shrividyamanmohan/Capstone_Project'

In [10]:
# load stories
try:
    file = open('cnn/stories/stories.json')
    stories = json.load(file)
    file.close()
except:
    directory = 'cnn/stories'
    stories = load_stories(directory)
    print('Loaded Stories %d' % len(stories))
    # clean stories
    for i,example in enumerate(stories):
        example['story'] = clean_lines(example['story'].split('\n'))
        example['highlights'] = clean_lines(example['highlights'])
        stories[i] = example
    with open('cnn/stories/stories.json', 'w') as outfile:
        json.dump(stories, outfile)




 is empty!!!! cnn/stories/226ca83313bb4db0917847f80fcf4a2d2af5007d.story


 is empty!!!! cnn/stories/c36fb222cee4c1f4e38cf62ad37e2eb8dd0a85be.story


 is empty!!!! cnn/stories/d4b4ee22583e0490d5e41e93941e8e6ec182d7ab.story


 is empty!!!! cnn/stories/2cb398794fea7b2dd83501c401c034ca73362323.story


 is empty!!!! cnn/stories/4a524c9714a5651a5f02497d23a8164f868babcd.story


 is empty!!!! cnn/stories/6e782862e65a315c67f8de9b6c6d0b4de21a6126.story


 is empty!!!! cnn/stories/7fa8a466d78b5232c91feceae35025b4c190e049.story


 is empty!!!! cnn/stories/84b530bc2b81c7f6b917906e0f6fc6dec87d4e8e.story


 is empty!!!! cnn/stories/abb74e2466084a9968a305ac439229302e5de164.story


 is empty!!!! cnn/stories/ce3271caecf540a64b4ae1b5943194ee1084b712.story


 is empty!!!! cnn/stories/97bb70b3837cff47e258c3ddf3aba494e6f4a139.story


 is empty!!!! cnn/stories/72855376efdaa110e598e17d36aaa5a46ed14425.story


 is empty!!!! cnn/stories/cf0c6788b92a287cccc4523776a442469796c4a8.story


 is empty!!!! cnn/stori

In [11]:
file = open('cnn/stories/stories.json')
stories1 = json.load(file)
file.close()
len(stories1)

92465

Now splitting to an array of story sentences and the corresponding highlight sentences we get :

In [12]:
stories_list = list(map(lambda s: s['story'], stories))
highlights_list = list(map(lambda s: s['highlights'], stories))

In [13]:
mean_story_length = np.mean(list(map(lambda s: len(s), stories_list)))

In [14]:
mean_sentence_length_of_every_story = np.mean(list(map(lambda s: len(s), [s for t in stories_list for s in t])))

In [15]:
print("Mean length of story is {:.3f} sentences and the Mean sentence length of every story is {:.3f} charachters".format(mean_story_length, mean_sentence_length_of_every_story))

Mean length of story is 21.241 sentences and the Mean sentence length of every story is 171.595 charachters


Now separating the datasets to training, dev and test datasets :

In [16]:
def get_train_test_stories(stories, highlights, split=0.8, shuffle=False):
    """Generate train/test split for unsupervised tasks.

    Args:
      stories(list): list of stories
      split (double): fraction to use as training set
      shuffle (int or bool): seed for shuffle of input data, or False to just
      take the training data as the first xx% contiguously.

    Returns:
      train_sentences, test_sentences ( list(list(string)) ): the train and test
      splits
    """
    sentences = np.array(list(stories), dtype=list)
    fmt = (len(sentences), sum(map(len, sentences)))
    print("Loaded {:,} stories ({:g} sentences)".format(*fmt))

    if shuffle:
        rng = np.random.RandomState(shuffle)
        all_sents = list(zip(sentences, highlights))
        rng.shuffle(all_sents)
        sentences, highlights = zip(*all_sents)
       # rng.shuffle(sentences)  # in-place
       # rng.shuffle(highlights)
    split_idx = int(split * len(sentences))
    test_dev_split_idx = int((len(sentences) - split_idx)/2)+ split_idx
    print(split_idx, test_dev_split_idx)
    train_stories = sentences[:split_idx]
    dev_stories = sentences[split_idx:test_dev_split_idx]
    test_stories = sentences[test_dev_split_idx:]
    train_highlights = highlights[:split_idx]
    dev_highlights = highlights[split_idx:test_dev_split_idx]
    test_highlights = highlights[test_dev_split_idx:]
    
    
    fmt = (len(train_stories), sum(map(len, train_stories)))
    print("Training set: {:,} stories ({:,} sentences)".format(*fmt))
    fmt = (len(dev_stories), sum(map(len, dev_stories)))
    print("Dev set: {:,} stories ({:,} sentences)".format(*fmt))
    fmt = (len(test_stories), sum(map(len, test_stories)))
    print("Test set: {:,} stories ({:,} sentences)".format(*fmt))

    return train_stories, dev_stories, test_stories, train_highlights, dev_highlights, test_highlights


In [17]:
train_stories_list, dev_stories_list, test_stories_list, train_highlights_list, dev_highlights_list, test_highlights_list \
= get_train_test_stories(stories_list , highlights_list, split=0.9, shuffle=42)

Loaded 92,465 stories (1.96402e+06 sentences)
83218 87841
Training set: 83,218 stories (1,766,718 sentences)
Dev set: 4,623 stories (99,117 sentences)
Test set: 4,624 stories (98,180 sentences)


In [18]:
train_stories_list[0]

['the end is closer than the beginning',
 'ron rupert grint left harry daniel radcliffe second from right and hermione emma watson in the new film',
 'harry potter and his friends at hogwarts are now in their sixth year of seven at the school theyve seen a lot of changes particularly as the influence of the reawakened voldemort that is he who must not be named has made itself known',
 'and the movie series itself is now nearing its conclusion harry potter and the halfblood prince which comes out wednesday is likewise the sixth movie in the series based on jk rowlings seven harry potter books',
 'there is one benefit to having such history director david yates says pretty much everybody going to see halfblood prince is familiar with the characters whether through the books or the movies',
 'we made a decision we kind of crossed a line actually i think on this movie where we said you know this is the sixth one in the series its the most popular franchise probably in history do we stop an

In [25]:
all_train_sentences = [item for sublist in train_stories_list for item in sublist]
all_dev_sentences = [item for sublist in dev_stories_list for item in sublist]
all_highlights_sentences = [item for sublist in train_highlights_list+dev_highlights_list for item in sublist]

In [26]:
combined_list = all_train_sentences + all_dev_sentences + all_highlights_sentences

In [31]:
count_vect = CountVectorizer(preprocessor=canonicalize_word, stop_words={'English'})
count_vect = count_vect.fit(combined_list)
#freq_term_matrix = count_vect.transform(train_stories_list[0])
freq_term_matrix = count_vect.transform(combined_list)
feature_names = count_vect.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [37]:
 nltk.download('averaged_perceptron_tagger')
  

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shrividyamanmohan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [35]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shrividyamanmohan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [38]:
import time
tfidf = TfidfTransformer(norm = 'l2')
tfidf.fit(freq_term_matrix)
#tfidf.fit(freq_term_matrix)
print(time.ctime())
r_score, t, tfidif_scores = get_rouge_score(test_stories_list, test_highlights_list, count_vect, tfidf, sentence_ordering=1, n=4)
print(time.ctime())

Fri Apr 24 21:25:50 2020
0
chef gordon ramsay writes of seeing his father abuse his mother today ramsay fights domestic abuse domestic violence is not identified solely by violent physical abuse instead it is defined as physical sexual psychological financial or emotional violence that takes place in a relationship intimate or familyoriented eventually this develops into a pattern of coercive and controlling behavior to this day i will never understand why mum stayed with him she deserved so much better and so much more it still pains me to remember how badly he treated her i have four young children of my own and i could never see myself behaving the way my father did when i was a child i want to be a role model for my children and have them look up to me no child should ever have to live in fear in their own home a home should be a place where you feel safe and loved when i was a kid our home was anything but that

gordon ramsay says his father battled alcohol and abused ramsays mom 