## Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import glob
import re
import warnings
import openpyxl
import keras


## Import Data 

In [2]:
# Grab all excel files
filenames = glob.glob("[a-z]*.xlsx")
# Combine files
doc = pd.DataFrame()
for i in range(len(filenames)):
    data=pd.read_excel(filenames[i])
    doc = pd.concat([doc, data])

# Limit to only good records.
doc = doc[doc['comment']=='good']
doc = doc[doc['lid']=='Id1']
# Shuffle the dataframe to randomize the records
#doc = doc.sample(frac = 1)
# Seperate the columns we want to use as lists.
triple = doc['mtriple']
triple = triple.tolist()
lex2 = doc['lex']
lex2 = lex2.tolist()
len(lex2)

469

In [3]:
doc

Unnamed: 0,category,eid,size,otriple,mtriple,lex,comment,lid
1,Airport,Id1,1,,"Aarhus_Airport | cityServed | ""Aarhus, Denmark""","The Aarhus is the airport of Aarhus, Denmark.",good,Id1
4,Airport,Id2,1,,Aarhus_Airport | cityServed | Aarhus,Aarhus airport serves the city of Aarhus.,good,Id1
6,Airport,Id3,1,,Aarhus_Airport | elevationAboveTheSeaLevel_(in...,Aarhus Airport is 25 metres above sea level.,good,Id1
10,Airport,Id4,1,,Aarhus_Airport | location | Tirstrup,Aarhus Airport is located in Tirstrup.,good,Id1
13,Airport,Id5,1,,"Aarhus_Airport | operatingOrganisation | ""Aarh...",Aarhus Airport is operated by Aarhus Lufthavn ...,good,Id1
...,...,...,...,...,...,...,...,...
360,ComicsCharacter,Id94,1,,The_Arrow_(comicsCharacter) | creator | Paul_G...,Paul Gustavson created The Arrow.,good,Id1
363,ComicsCharacter,Id95,1,,Tim_Brooke-Taylor | birthPlace | Buxton,Tim Brooke-Taylor was born in Buxton.,good,Id1
366,ComicsCharacter,Id96,1,,Tim_Brooke-Taylor | birthPlace | Derbyshire,Tim Brooke Taylor was born in Derbyshire.,good,Id1
369,ComicsCharacter,Id97,1,,Tim_Brooke-Taylor | birthPlace | England,Tim Brooke Taylor was born in England.,good,Id1


## Preprocess Data 

In [4]:
# Seperates the sentence (3 triples) by the seperator | 
# and gets rid of unnecessary spaces.
mtriple_arr = []
for mtriple in triple:
    str1 = mtriple
    ind1 = str1.find('|')
    trip1 = str1[0:ind1-1]
    str2 = str1[ind1+2:]
    ind2 = str2.find('|')
    trip2 = str2[:ind2-1]
    trip3 = str2[ind2+2:]

    # Fixes the second triple that sometimes is combined ex: isPartOf.
    res = [idx for idx in range(len(trip2)) if trip2[idx].isupper()] 
    if len(res) == 0:
        trip2b = trip2
    else:
        trip2b = ''
    x=0
    while x < len(res):
        if x == 0:
            trip2b += trip2[:res[x]] + ' '
        else:
            trip2b += trip2[res[x-1]:res[x]] + ' '
        if (x+1) == len(res):
            trip2b += trip2[res[x]:]        
        x+=1
    trip2 = trip2b

    # Gets rid of unwanted characters in the triples
    trip1 = re.sub(r'[,|""_:@#?!&$]', ' ', trip1)
    trip2 = re.sub(r'[,|""_:@#?!&$]', ' ', trip2)
    trip3 = re.sub(r'[,|""_:@#?!&$]', ' ', trip3)

    # converts triples to lower case since we trained on lowercase lex column.
    trip1 = trip1.lower()
    trip2 = trip2.lower()
    trip3 = trip3.lower()
    mtriple_arr.append([trip1,trip2,trip3])
mtriple_arr

[['aarhus airport', 'city served', ' aarhus  denmark '],
 ['aarhus airport', 'city served', 'aarhus'],
 ['aarhus airport', 'elevation above the sea level (in metres)', '25.0'],
 ['aarhus airport', 'location', 'tirstrup'],
 ['aarhus airport', 'operating organisation', ' aarhus lufthavn a/s '],
 ['aarhus airport', 'operating organisation', 'aktieselskab'],
 ['aarhus airport', 'runway length', '2776.0'],
 ['aarhus airport', 'runway length', '2777.0'],
 ['aarhus airport', 'runway name', ' 10l/28r '],
 ['aarhus airport', 'runway name', ' 10r/28l '],
 ['abilene  texas', 'country', 'united states'],
 ['abilene  texas', 'is part of', 'jones county  texas'],
 ['abilene  texas', 'is part of', 'taylor county  texas'],
 ['abilene  texas', 'is part of', 'texas'],
 ['abilene regional airport', '1st runway  length feet', '3678'],
 ['abilene regional airport', '1st runway  surface type', 'asphalt'],
 ['abilene regional airport', '3rd runway  length feet', '7202'],
 ['abilene regional airport', ' i c a

In [5]:
# Remove unwanted characters and make everything lowercase
lex = []
for i in range(len(lex2)):
    x = re.sub(r'[,_:@#?!&$]', ' ', lex2[i])
    x = x.lower()
    lex.append(x)
lex

['the aarhus is the airport of aarhus  denmark.',
 'aarhus airport serves the city of aarhus.',
 'aarhus airport is 25 metres above sea level.',
 'aarhus airport is located in tirstrup.',
 'aarhus airport is operated by aarhus lufthavn a/s.',
 'aktieselskab is the operating organisation for aarhus airport.',
 'aarhus airport runway length is 2776.0.',
 'the runway length at aarhus airport is 2777.0 meters.',
 'aarhus airport runway name is 10l/28r.',
 'the runway name at aarhus airport is "10r/28l".',
 'abilene  texas is in the united states.',
 'abilene  texas is part of jones county  texas.',
 'abilene  texas is part of taylor county texas.',
 'abilene  texas is part of texas.',
 'the length of the 1st runway at abilene regional airport is 3678 feet.',
 'the first runway at abilene regional airport is made from asphalt.',
 'the third runway at abilene regional airport is 7 202 feet long.',
 'abilene regional airport icao location identifier is kabi.',
 'abilene regional airport eleva

In [6]:
# Split the dataset into training and validation
stop = int(len(mtriple_arr)*0.8)
xtrain = mtriple_arr[0:stop]
ytrain = lex[:stop]
xtest = mtriple_arr[stop:]
ytest = lex[stop:]
print('The size of our training data is: ' + str(len(xtrain)) + ', while our testing data has this many records: ' + str(len(xtest)))

The size of our training data is: 375, while our testing data has this many records: 94


## Train Model 

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

In [8]:
tokenizer = Tokenizer() #instantiating the tokenizer
corpus=[]
sentence = ytrain
for i in range(len(sentence)):
    c = sentence[i].lower() #converting the sentence to lowercase
    corpus.append(c)
tokenizer.fit_on_texts(corpus) #creates tokens for each words 
total_words = len(tokenizer.word_index) + 1 #calculating total number of words in the initial sentence
corpus

['the aarhus is the airport of aarhus  denmark.',
 'aarhus airport serves the city of aarhus.',
 'aarhus airport is 25 metres above sea level.',
 'aarhus airport is located in tirstrup.',
 'aarhus airport is operated by aarhus lufthavn a/s.',
 'aktieselskab is the operating organisation for aarhus airport.',
 'aarhus airport runway length is 2776.0.',
 'the runway length at aarhus airport is 2777.0 meters.',
 'aarhus airport runway name is 10l/28r.',
 'the runway name at aarhus airport is "10r/28l".',
 'abilene  texas is in the united states.',
 'abilene  texas is part of jones county  texas.',
 'abilene  texas is part of taylor county texas.',
 'abilene  texas is part of texas.',
 'the length of the 1st runway at abilene regional airport is 3678 feet.',
 'the first runway at abilene regional airport is made from asphalt.',
 'the third runway at abilene regional airport is 7 202 feet long.',
 'abilene regional airport icao location identifier is kabi.',
 'abilene regional airport eleva

In [9]:
input_sequences = [] #training features (x) will be a list

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0] #converts each sentence as its tokenized equivalent
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1] #generating n gram sequences
        input_sequences.append(n_gram_sequence) #appending each n gram sequence to the list of our features (xs)

In [10]:
max_sequence_len = max([len(x) for x in input_sequences]) #calculating the length of the longest sequence
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) #pre-pading each value of the input_sequence
xs, labels = input_sequences[:,:-1],input_sequences[:,-1] #creating xs and their labels using numpy slicing
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words) #creating one hot encoding values
ys

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [11]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional,Masking

model = Sequential() #creating a sequential model
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1)) #adding an embedding layer with 64 as the embedding dimension
model.add(Masking(mask_value=0.0))
model.add(Bidirectional(LSTM(64,return_sequences=True))) #adding 64 LSTM units
model.add(Bidirectional(LSTM(64,dropout=0.1)))
#model.add(Bidirectional(LSTM(64s,dropout=0.1)))
model.add(Dense(total_words, activation='relu')) #creating a dense layer with 54 output units (total_words) with softmax activation

In [12]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) #compiling the model with adam optimiser
history = model.fit(xs, ys, epochs=10, verbose=1) #training for 30 epochs

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Use the Model to Generate Text 

In [15]:
pred_arr = []
for mtrip in xtest:
    pred_sentence = ''
    x = 0
    while x < 3:
        trip = mtrip[x]
        if x == 2:
            trip = trip
        else:
            next_words = 1
            for _ in range(next_words):
                token_list = tokenizer.texts_to_sequences([trip])[0] 
                token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
                predicted = model.predict(token_list, verbose=0)


                output_word = "" 
                for word, index in tokenizer.word_index.items():
                    if index == predicted.all():
                        output_word = word
                        break
                trip += " " + output_word    
        pred_sentence += trip + ' '
        x += 1
    pred_arr.append(pred_sentence)

In [16]:
pred_arr

['amazing-man (comicscharacter) is creator is bill everett ',
 "april o'neil is creator is peter laird ",
 "arion (comicscharacter) is alternative name is  ahri'ahn  ",
 'arion (comicscharacter) is creator is jan duursema ',
 'arion (comicscharacter) is creator is paul kupperberg ',
 'asterix (comicscharacter) is alternative name is  astérix  ',
 'asterix (comicscharacter) is creator is albert uderzo ',
 'aurakles is alternative name is  aurakles  ',
 'aurakles is creator is dick dillin ',
 'aurakles is creator is len wein ',
 'auron (comicscharacter) is creator is karl kesel ',
 'auron (comicscharacter) is creator is walt simonson ',
 'auron (comicscharacter) is full name is  lambien  ',
 'bbc is founded by is john reith  1st baron reith ',
 'bbc is key person is rona fairhead ',
 'bbc is key person is tony hall  baron hall of birkenhead ',
 'bbc is location city is broadcasting house ',
 'bbc is location city is london ',
 'bbc is product is bbc radio ',
 'balder (comicscharacter) is

# Calculate Accuracy

In [17]:
from nltk.translate.bleu_score import sentence_bleu,corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge import Rouge
rouge = Rouge()
bleu_acc = []
rouge_acc = []
c = 0
cc = SmoothingFunction()
while c < len(ytest):
    bleu_acc.append(sentence_bleu([ytest[c]], pred_arr[c], smoothing_function=cc.method7))
    rouge_acc.append(rouge.get_scores(ytest[c], pred_arr[c]))
    c += 1
print(rouge_acc)

[[{'rouge-1': {'r': 0.8333333333333334, 'p': 0.5555555555555556, 'f': 0.6666666618666668}, 'rouge-2': {'r': 0.16666666666666666, 'p': 0.1111111111111111, 'f': 0.1333333285333335}, 'rouge-l': {'r': 0.3333333333333333, 'p': 0.2222222222222222, 'f': 0.2666666618666667}}], [{'rouge-1': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.39999999520000007}, 'rouge-2': {'r': 0.16666666666666666, 'p': 0.125, 'f': 0.14285713795918387}, 'rouge-l': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.39999999520000007}}], [{'rouge-1': {'r': 0.6666666666666666, 'p': 0.36363636363636365, 'f': 0.4705882307266437}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.6666666666666666, 'p': 0.36363636363636365, 'f': 0.4705882307266437}}], [{'rouge-1': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.39999999520000007}, 'rouge-2': {'r': 0.16666666666666666, 'p': 0.125, 'f': 0.14285713795918387}, 'rouge-l': {'r': 0.5, 'p': 0.3333333333333333, 'f': 0.39999999520000007}}], [{'rouge-1': {'r': 0.8333333333333334, 'p': 0.5

In [18]:
rouge = Rouge()
rouge.get_scores(pred_arr, ytest, avg=True)

{'rouge-1': {'r': 0.5518344539621134,
  'p': 0.6915818519541921,
  'f': 0.6082645024002363},
 'rouge-2': {'r': 0.2617464391400561,
  'p': 0.2735838629455649,
  'f': 0.2635855725157066},
 'rouge-l': {'r': 0.4533708903389753,
  'p': 0.5658918209450121,
  'f': 0.4989037347544414}}

In [19]:

overall_bleu= sum(bleu_acc)/len(bleu_acc)
print("Overall Bleu score: ",overall_bleu)

# Calculating F-score, precision and recall from Rouge score

rouge = Rouge()
rouge.get_scores(pred_arr, ytest, avg=True)

Overall Bleu score:  0.7323223560476824


{'rouge-1': {'r': 0.5518344539621134,
  'p': 0.6915818519541921,
  'f': 0.6082645024002363},
 'rouge-2': {'r': 0.2617464391400561,
  'p': 0.2735838629455649,
  'f': 0.2635855725157066},
 'rouge-l': {'r': 0.4533708903389753,
  'p': 0.5658918209450121,
  'f': 0.4989037347544414}}

In [20]:
y_test=str()
for i in range(len(ytest)):
    a=str(ytest[i])
    y_test=y_test+a+' '

In [21]:
y_test

'bill everett is the creator of the comic character  amazing-man. the character  april o\'neal  was created by peter laird. the comic character  arion  is also known by he name ahri\'ahn. the comic character  arion  was created by jan duursema. paul kupperberg is the creator of the comic character  arion. the alternative name of asterix (comics character) is "astérix". albert uderzo is the creator of the comic character  asterix. aurakles is also known as \'\'aurakles\'\'. dick dillin is the creator of aurakles. the character  aurakles  was created by len wein. the creator of auron (comics character) is karl kesel. the comic character  auron  was created by walt simonson. the full name of auron (comics character) is "lambien". bbc was founded by john reith 1st baron reith. rona fairhead is the key person for bbc. a key person at the bbc is baron hall of birkenhead  tony hall. the location of the bbc is the broadcasting house in london. bbc is located in london. one of the products of b

In [22]:
#Fteching the data from y-predicted (pred_arr)
final=str()
for i in pred_arr:
    text=(i.capitalize()).strip()
    final= final+text
    final=final+". "
final    

"Amazing-man (comicscharacter) is creator is bill everett. April o'neil is creator is peter laird. Arion (comicscharacter) is alternative name is  ahri'ahn. Arion (comicscharacter) is creator is jan duursema. Arion (comicscharacter) is creator is paul kupperberg. Asterix (comicscharacter) is alternative name is  astérix. Asterix (comicscharacter) is creator is albert uderzo. Aurakles is alternative name is  aurakles. Aurakles is creator is dick dillin. Aurakles is creator is len wein. Auron (comicscharacter) is creator is karl kesel. Auron (comicscharacter) is creator is walt simonson. Auron (comicscharacter) is full name is  lambien. Bbc is founded by is john reith  1st baron reith. Bbc is key person is rona fairhead. Bbc is key person is tony hall  baron hall of birkenhead. Bbc is location city is broadcasting house. Bbc is location city is london. Bbc is product is bbc radio. Balder (comicscharacter) is alternative name is  balder odinson. Balder (comicscharacter) is creator is stan

In [23]:
# Removing Square Brackets and Extra Spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', final)
article_text = re.sub(r'\s+', ' ', final)

In [24]:
# Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', final )
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

In [25]:
#Converting Text To Sentences
import nltk
sentence_list = nltk.sent_tokenize(article_text)
sentence_list

['Amazing-man (comicscharacter) is creator is bill everett.',
 "April o'neil is creator is peter laird.",
 "Arion (comicscharacter) is alternative name is ahri'ahn.",
 'Arion (comicscharacter) is creator is jan duursema.',
 'Arion (comicscharacter) is creator is paul kupperberg.',
 'Asterix (comicscharacter) is alternative name is astérix.',
 'Asterix (comicscharacter) is creator is albert uderzo.',
 'Aurakles is alternative name is aurakles.',
 'Aurakles is creator is dick dillin.',
 'Aurakles is creator is len wein.',
 'Auron (comicscharacter) is creator is karl kesel.',
 'Auron (comicscharacter) is creator is walt simonson.',
 'Auron (comicscharacter) is full name is lambien.',
 'Bbc is founded by is john reith 1st baron reith.',
 'Bbc is key person is rona fairhead.',
 'Bbc is key person is tony hall baron hall of birkenhead.',
 'Bbc is location city is broadcasting house.',
 'Bbc is location city is london.',
 'Bbc is product is bbc radio.',
 'Balder (comicscharacter) is alternati

In [26]:
#Find Weighted Frequency of Occurrence
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [27]:
#Find Weighted Frequency of Occurrence
maximum_frequncy = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [28]:
#Calculating Sentence Scores
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [29]:
#Getting the Summary of top 10 sentences
import heapq
summary_sentences = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)
print(summary)

Dane whitman black knight (comicscharacter) is creator is john buscema. Dane whitman black knight (comicscharacter) is creator is roy thomas. Dane whitman black knight (comicscharacter) is alternative name is dane whitman. The arrow (comicscharacter) is creator is paul gustavson. Ballistic (comicscharacter) is creator is doug moench. Bolt (comicscharacter) is creator is gary cohn (comics). Arion (comicscharacter) is creator is jan duursema. Arion (comicscharacter) is creator is paul kupperberg. Auron (comicscharacter) is creator is walt simonson. Balder (comicscharacter) is creator is stan lee.


In [30]:
summary_sentences

['Dane whitman black knight (comicscharacter) is creator is john buscema.',
 'Dane whitman black knight (comicscharacter) is creator is roy thomas.',
 'Dane whitman black knight (comicscharacter) is alternative name is dane whitman.',
 'The arrow (comicscharacter) is creator is paul gustavson.',
 'Ballistic (comicscharacter) is creator is doug moench.',
 'Bolt (comicscharacter) is creator is gary cohn (comics).',
 'Arion (comicscharacter) is creator is jan duursema.',
 'Arion (comicscharacter) is creator is paul kupperberg.',
 'Auron (comicscharacter) is creator is walt simonson.',
 'Balder (comicscharacter) is creator is stan lee.']