In [859]:
import pandas as pd

names =  pd.read_csv('stringMatch.csv')
names.head(10)

Unnamed: 0,Wonders
0,UNESCO World Heritage Site
1,New7Wonder
2,Peru
3,Colosseum
4,Taj Mahal
5,Great Pyramid of Giza
6,Machu Picchu
7,Petra
8,"United Nations Educational, Scientific and Cultural Organization"
9,1931 AD


In [860]:
#Code for creating unigram
import re

def unigrams(string, n=1):
    string = string.lower()
    unigrams = zip(*[string[i:] for i in range(n)])
    return [''.join(unigram) for unigram in unigrams]

In [861]:
#Vectorising, TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

wonder_names = names['Wonders']
vectorizer = TfidfVectorizer(min_df=2, analyzer=unigrams)
tf_idf_matrix = vectorizer.fit_transform(wonder_names)

# Cosine Similarity

In [862]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def cosine_sim(A, B, ntop, lower_bound=0):
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [863]:
# Only entities with a similarity above 0.86 are added to matches
matches = cosine_sim(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.86)

In [864]:
def get_matches_df(sparse_matrix, name_vector, top=1):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side, 'right_side': right_side, 'similairity': similairity})

In [865]:
#determine the shape of the dataframe
matches_df.shape

(61, 3)

In [866]:
#since the total number of rows in the generated match is 61, displaying top 61 rows of the dataframe
matches_df = get_matches_df(matches, wonder_names, top=61)
matches_df

Unnamed: 0,left_side,right_side,similairity
0,UNESCO World Heritage Site,Unesco world heritage site,1.000000
1,UNESCO World Heritage Site,UNESCO World Heritage Site,1.000000
2,New7Wonder,New7Wonder,1.000000
3,New7Wonder,New7Wonders,0.976562
4,Peru,Peru,1.000000
...,...,...,...
56,1931 BC,1931 A.D.,0.922880
57,1931 BC,1931 AD,0.922880
58,Roman Colosseum,Roman Colosseum,1.000000
59,Roman Colosseum,Colosseum,0.887435


# Edit Distance

In [870]:
def editDist(str1, str2):
    mat = [[0 for j in range(len(str1) + 1)] for i in range(len(str2) + 1)]
    for i in range(len(str2) + 1):
        mat[i][0] = i
    for j in range(len(str1) + 1):
        mat[0][j] = j
    for i in range(1, len(str2) + 1):
        for j in range(1, len(str1) + 1):
            if str1[j-1] == str2[i-1]:
                mat[i][j] = mat[i-1][j-1]
            else:
                mat[i][j] = min([mat[i-1][j], mat[i][j-1], mat[i-1][j-1]]) + 1

    return mat[len(str2)][len(str1)]

In [871]:
editDist('Peru', 'Peruu')

1

# Jaccard Similarity

In [869]:
def jaccard_distance(Str1, Str2):
    Str1 = Str1.lower()
    Str2 = Str2.lower()
    Str1 = set(Str1)
    Str2 = set(Str2)
    return 1.0 * len(Str1&Str2)/len(Str1|Str2)

In [812]:
left_side =[]
right_side =[]
similarity=[]

In [834]:
Str1 = '1931 AD'
Str2 = '1931 BC'
left_side.append(Str1)
right_side.append(Str2)
sim = jaccard_distance(Str1, Str2)
similarity.append(sim)

jacDf = pd.DataFrame({'left_side':left_side, 'right_side':right_side, 'similarity':similarity})
jacDf

Unnamed: 0,left_side,right_side,similarity
0,UNESCO World Heritage Site,Unesco world heritage site,1.0
1,UNESCO World Heritage Site,Heritage Site,0.5625
2,New7Wonder,New7Wonders,0.875
3,New7Wonder,Seven Wonder,0.6
4,Peru,Peruu,1.0
5,Peru,Peroo,0.6
6,Colosseum,Coloseum,1.0
7,Colosseum,Roman Colosseum,0.636364
8,Colosseum,Kolosium,0.555556
9,Taj Mahal,Taj Mehel,0.875


In [None]:
#Jaccard distance without lowercase
def jaccard_distance(Str1, Str2):
    Str1 = set(Str1)
    Str2 = set(Str2)
    return 1.0 * len(Str1&Str2)/len(Str1|Str2)

In [None]:
left_side =[]
right_side =[]
similarity=[]

In [714]:
Str1 = 'UNESCO World Heritage Site'
Str2 = 'Unesco world heritage site'
left_side.append(Str1)
right_side.append(Str2)
sim = jaccard_distance(Str1, Str2)
similarity.append(sim)

jacDf = pd.DataFrame({'left_side':left_side, 'right_side':right_side, 'similarity':similarity})
jacDf

Unnamed: 0,left_side,right_side,similarity
0,UNESCO World Heritage Site,Unesco world heritage site,0.478261
1,UNESCO World Heritage Site,Heritage site,0.421053
2,New7Wonder,New7Wonders,0.9
3,New7Wonder,Seven Wonder,0.5
4,Peru,Peruu,1.0
5,Peru,Peroo,0.6
6,Colosseum,Coloseum,1.0
7,Colosseum,Roman Colosseum,0.636364
8,Colosseum,Kolosium,0.555556
9,Taj Mahal,Taj Mehel,0.875


# NL to SPARQL

In [890]:
ques = 'What type of structures are in both lists?'

In [891]:
#Importing libraries
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from contractions import CONTRACTION_MAP
import unicodedata

#Reference for contractions: https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/notebooks/Ch07_Analyzing_Movie_Reviews_Sentiment/contractions.py

In [892]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [893]:
def remove_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]', '', text)
    return text

In [894]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [895]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [896]:
nlp = spacy.load('en', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()

#Removing negation words from the list of stop words since they might be useful in providing meaning to a sentence
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

#Function to remove stop words
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [897]:
#Creating the text normalizer

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters    
        if special_char_removal:
            doc = remove_special_characters(doc) 
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [898]:
corpus = normalize_corpus([ques])
print(corpus)

['type structure list']


In [899]:
listToStr = ''.join([str(elem) for elem in corpus]) 
sentence = listToStr
sentence_nlp = nlp(corpus)

TypeError: 'Series' object is not callable

In [882]:
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
posDF = pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])
posDF.head(15)

Unnamed: 0,Word,POS tag,Tag type
0,type,NN,NOUN
1,structure,NN,NOUN
2,list,NN,NOUN


In [883]:
#Printing all the named entities from the pre-processed corpus
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

[]


In [754]:
#Finding the most frequent named entities from the corpus
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text 
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None

entity_frame = pd.DataFrame(named_entities, 
                            columns=['Entity Name', 'Entity Type'])
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0
Entity Name,new7wonder
Entity Type,ORG
Frequency,1


In [None]:
sparqlTemplate = 'SELECT DISTINCT ?obj WHERE{?obj rdfs:subClassOf dbo:', entity, '}'

In [768]:
strng = top_entities['Entity Name']
if

0    new7wonder
Name: Entity Name, dtype: object


# Named Entity Recognition using spaCy

In [909]:
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar

In [910]:
nlp1 = spacy.load('en')

In [916]:
docx1 = nlp1(u"What is PETRA?")

In [917]:
for token in docx1.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

PETRA 8 13 ORG


In [923]:
docx2 = nlp1(u"What is new7wonder?")

In [924]:
for token in docx2.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

new7wonder 8 18 ORG


In [952]:
docx3 = nlp1(u"Machu Picchu is a new7wonder")

In [953]:
for token in docx3.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Machu Picchu 0 12 PERSON
new7wonder 18 28 ORG


In [954]:
# training data
TRAIN_DATA = [
    ('What is PETRA?', {
        'entities': [(8, 13, 'ORG')]
    }),
     ('What is new7wonder?', {
        'entities': [(8, 18, 'ORG')]
    }),
    ('Machu Picchu is a new7wonder', {
        'entities': [(0, 12, 'DATE'), (18,28, 'ORG')]
    })
]

In [955]:
## plac is wrapper for argparser 
model=None
output_dir=("E:\\books\\Masters\\Winter 2020\\Semantic web\\Assignment4")
n_iter=100

In [956]:
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

Created blank 'en' model


In [937]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

In [938]:
# add labels
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

    # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 17.64it/s]
100%|██████████| 3/3 [00:00<00:00, 25.64it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 10.519949734210968}
{'ner': 9.505746245384216}


100%|██████████| 3/3 [00:00<00:00, 21.90it/s]
100%|██████████| 3/3 [00:00<00:00, 22.32it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 8.513011455535889}
{'ner': 7.225867927074432}


100%|██████████| 3/3 [00:00<00:00, 20.67it/s]
100%|██████████| 3/3 [00:00<00:00, 21.58it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.602010704576969}
{'ner': 4.563530772924423}


100%|██████████| 3/3 [00:00<00:00, 22.15it/s]
100%|██████████| 3/3 [00:00<00:00, 20.98it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.018974218517542}
{'ner': 3.7643272548448294}


100%|██████████| 3/3 [00:00<00:00, 21.03it/s]
100%|██████████| 3/3 [00:00<00:00, 23.44it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.1503054714994505}
{'ner': 4.075289020845958}


100%|██████████| 3/3 [00:00<00:00, 23.16it/s]
100%|██████████| 3/3 [00:00<00:00, 21.96it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.8199425548373256}
{'ner': 2.448312916152645}


100%|██████████| 3/3 [00:00<00:00, 15.83it/s]
100%|██████████| 3/3 [00:00<00:00, 18.99it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.5634515690617263}
{'ner': 1.5135208462270384}


100%|██████████| 3/3 [00:00<00:00, 22.90it/s]
100%|██████████| 3/3 [00:00<00:00, 17.24it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.9946748834117898}
{'ner': 1.4347214532608632}


100%|██████████| 3/3 [00:00<00:00, 17.72it/s]
100%|██████████| 3/3 [00:00<00:00, 20.13it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.6735565650587887}
{'ner': 2.131584698545339}


100%|██████████| 3/3 [00:00<00:00, 18.29it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 17.09it/s]

{'ner': 1.794965092146981}


100%|██████████| 3/3 [00:00<00:00, 16.24it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.11it/s]

{'ner': 1.0968297285614441}


100%|██████████| 3/3 [00:00<00:00, 14.02it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.20it/s]

{'ner': 1.7647076307729916}


100%|██████████| 3/3 [00:00<00:00, 12.00it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.56it/s]

{'ner': 2.0083974343646025}


100%|██████████| 3/3 [00:00<00:00, 10.99it/s]
 33%|███▎      | 1/3 [00:00<00:00,  7.87it/s]

{'ner': 1.9751311678103065}


100%|██████████| 3/3 [00:00<00:00,  9.71it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.59it/s]

{'ner': 1.9310954722029745}


100%|██████████| 3/3 [00:00<00:00, 10.40it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.17it/s]

{'ner': 1.8431424433239088}


100%|██████████| 3/3 [00:00<00:00, 12.23it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 13.51it/s]

{'ner': 1.7694497069743564}


100%|██████████| 3/3 [00:00<00:00, 13.39it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.27it/s]

{'ner': 1.7501262317671427}


100%|██████████| 3/3 [00:00<00:00, 14.78it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 13.92it/s]

{'ner': 0.5662785050316256}


100%|██████████| 3/3 [00:00<00:00, 14.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.46it/s]

{'ner': 1.8769131582907534}


100%|██████████| 3/3 [00:00<00:00, 11.16it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.05it/s]

{'ner': 1.6166326645483888}


100%|██████████| 3/3 [00:00<00:00, 11.24it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.34it/s]

{'ner': 1.2805431135077352}


100%|██████████| 3/3 [00:00<00:00, 12.09it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.83it/s]

{'ner': 0.29877360177426593}


100%|██████████| 3/3 [00:00<00:00, 11.67it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.77it/s]

{'ner': 0.2851876875497479}


100%|██████████| 3/3 [00:00<00:00, 12.71it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 13.29it/s]

{'ner': 0.9689675260808367}


100%|██████████| 3/3 [00:00<00:00, 13.13it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 14.08it/s]

{'ner': 0.5538867777908726}


100%|██████████| 3/3 [00:00<00:00, 13.80it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.11it/s]

{'ner': 0.01675331839097124}


100%|██████████| 3/3 [00:00<00:00, 13.31it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.83it/s]

{'ner': 0.019692075293314227}


100%|██████████| 3/3 [00:00<00:00, 11.76it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.90it/s]

{'ner': 0.020014571590125537}


100%|██████████| 3/3 [00:00<00:00, 10.71it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.02844390243461594}


100%|██████████| 3/3 [00:00<00:00, 10.71it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.58it/s]

{'ner': 0.0005308625970182852}


100%|██████████| 3/3 [00:00<00:00, 10.38it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.31it/s]

{'ner': 0.07256120460534993}


100%|██████████| 3/3 [00:00<00:00,  9.90it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.41789308789382346}


100%|██████████| 3/3 [00:00<00:00, 10.10it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.61it/s]

{'ner': 0.002229329110893863}


100%|██████████| 3/3 [00:00<00:00,  9.23it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.319469238068388e-06}


100%|██████████| 3/3 [00:00<00:00,  9.90it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0001906107174250758}


100%|██████████| 3/3 [00:00<00:00, 10.20it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0063078603553574615}


100%|██████████| 3/3 [00:00<00:00, 10.27it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.23it/s]

{'ner': 0.0027398794036735674}


100%|██████████| 3/3 [00:00<00:00,  8.97it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.62it/s]

{'ner': 9.264450175583023e-05}


100%|██████████| 3/3 [00:00<00:00,  9.15it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.71it/s]

{'ner': 0.013554013669328806}


100%|██████████| 3/3 [00:00<00:00,  9.15it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.80it/s]

{'ner': 0.0017440765682517451}


100%|██████████| 3/3 [00:00<00:00,  9.38it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.99it/s]

{'ner': 0.030839589662063356}


100%|██████████| 3/3 [00:00<00:00, 10.79it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.83it/s]

{'ner': 6.50936254640651e-05}


100%|██████████| 3/3 [00:00<00:00,  9.91it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.87it/s]

{'ner': 2.0329810691145046e-08}


100%|██████████| 3/3 [00:00<00:00, 10.71it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.34it/s]

{'ner': 2.6682281873343523e-06}


100%|██████████| 3/3 [00:00<00:00, 11.46it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.17it/s]

{'ner': 0.00012835411082091902}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.42it/s]

{'ner': 4.280381670769583e-07}


100%|██████████| 3/3 [00:00<00:00, 12.61it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.98it/s]

{'ner': 0.0017831746464077296}


100%|██████████| 3/3 [00:00<00:00, 12.00it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 13.07it/s]

{'ner': 1.269045283588088e-06}


100%|██████████| 3/3 [00:00<00:00, 12.58it/s]
 33%|███▎      | 1/3 [00:00<00:00,  7.94it/s]

{'ner': 1.2733744225903684e-05}


100%|██████████| 3/3 [00:00<00:00,  8.65it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.978682125499758e-08}


100%|██████████| 3/3 [00:00<00:00, 10.27it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.30it/s]

{'ner': 0.000160566204816193}


100%|██████████| 3/3 [00:00<00:00, 10.99it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00013261705783286452}


100%|██████████| 3/3 [00:00<00:00,  9.46it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.30it/s]

{'ner': 2.089833947879089e-08}


100%|██████████| 3/3 [00:00<00:00, 11.58it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.65it/s]

{'ner': 6.980313576336384e-05}


100%|██████████| 3/3 [00:00<00:00, 12.34it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.67it/s]

{'ner': 8.313317830694845e-07}


100%|██████████| 3/3 [00:00<00:00, 13.09it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.37it/s]

{'ner': 6.851163015344787e-07}


100%|██████████| 3/3 [00:00<00:00, 15.38it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 14.42it/s]

{'ner': 0.0002039254004548868}


100%|██████████| 3/3 [00:00<00:00, 14.70it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.92it/s]

{'ner': 4.673994735106857e-06}


100%|██████████| 3/3 [00:00<00:00, 12.56it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.50it/s]

{'ner': 3.638702567523725e-05}


100%|██████████| 3/3 [00:00<00:00, 14.49it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.37it/s]

{'ner': 0.0011394481980668658}


100%|██████████| 3/3 [00:00<00:00, 11.78it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 14.35it/s]

{'ner': 3.9386960092638135e-07}


100%|██████████| 3/3 [00:00<00:00, 14.45it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.93it/s]

{'ner': 2.322214939486693e-09}


100%|██████████| 3/3 [00:00<00:00, 11.60it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.05it/s]

{'ner': 1.3503939263037524e-08}


100%|██████████| 3/3 [00:00<00:00, 12.87it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 14.31it/s]

{'ner': 4.337596208034536e-07}


100%|██████████| 3/3 [00:00<00:00, 13.17it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.30it/s]

{'ner': 0.1370976427542702}


100%|██████████| 3/3 [00:00<00:00, 11.21it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.19it/s]

{'ner': 2.092861570656763e-09}


100%|██████████| 3/3 [00:00<00:00, 10.95it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.04it/s]

{'ner': 4.552034278280682e-06}


100%|██████████| 3/3 [00:00<00:00, 12.13it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 14.74it/s]

{'ner': 8.776306432394832e-09}


100%|██████████| 3/3 [00:00<00:00, 14.24it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.10it/s]

{'ner': 3.481637619778767e-08}


100%|██████████| 3/3 [00:00<00:00, 15.51it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 17.01it/s]

{'ner': 1.128918535127421e-09}


100%|██████████| 3/3 [00:00<00:00, 15.42it/s]
100%|██████████| 3/3 [00:00<00:00, 15.31it/s]

{'ner': 0.0019569669711922287}



 67%|██████▋   | 2/3 [00:00<00:00, 15.28it/s]

{'ner': 1.6623586703030998e-09}


100%|██████████| 3/3 [00:00<00:00, 15.06it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.96it/s]

{'ner': 1.5979339515958455e-06}


100%|██████████| 3/3 [00:00<00:00, 15.80it/s]
100%|██████████| 3/3 [00:00<00:00, 18.04it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.9143722996368323e-11}
{'ner': 0.00025663790794617055}


100%|██████████| 3/3 [00:00<00:00, 19.36it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 16.14it/s]

{'ner': 2.4844124128445597e-07}


100%|██████████| 3/3 [00:00<00:00, 15.71it/s]
100%|██████████| 3/3 [00:00<00:00, 17.19it/s]


{'ner': 0.00035659604990664407}


100%|██████████| 3/3 [00:00<00:00, 15.46it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 18.02it/s]

{'ner': 8.43935855711609e-09}
{'ner': 2.2087449284432753e-08}


100%|██████████| 3/3 [00:00<00:00, 16.30it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 16.53it/s]

{'ner': 9.263891139127933e-07}


100%|██████████| 3/3 [00:00<00:00, 16.15it/s]
100%|██████████| 3/3 [00:00<00:00, 17.45it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.119733336178322e-07}
{'ner': 5.604234716878771e-09}


100%|██████████| 3/3 [00:00<00:00, 15.97it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 17.71it/s]

{'ner': 1.1232133650178968e-08}


100%|██████████| 3/3 [00:00<00:00, 17.24it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 15.20it/s]

{'ner': 9.737425998238515e-08}


100%|██████████| 3/3 [00:00<00:00, 16.27it/s]
100%|██████████| 3/3 [00:00<00:00, 17.19it/s]


{'ner': 1.7299674949466717e-07}
{'ner': 2.4070847134731487e-11}


100%|██████████| 3/3 [00:00<00:00, 16.04it/s]
100%|██████████| 3/3 [00:00<00:00, 16.64it/s]

{'ner': 7.098212330378699e-07}



100%|██████████| 3/3 [00:00<00:00, 16.59it/s]


{'ner': 3.6623691020883946e-07}
{'ner': 9.431171367936098e-11}


100%|██████████| 3/3 [00:00<00:00, 17.37it/s]

{'ner': 4.305321517632475e-07}





In [939]:
# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('2007', 'DATE')]
Tokens [('What', '', 2), ('is', '', 2), ('2007', 'DATE', 3), ('?', '', 2)]
Entities [('PETRA', 'ORG')]
Tokens [('What', '', 2), ('is', '', 2), ('PETRA', 'ORG', 3), ('?', '', 2)]
Entities [('new7wonder', 'ORG')]
Tokens [('What', '', 2), ('is', '', 2), ('new7wonder', 'ORG', 3), ('?', '', 2)]


In [940]:
# save model to output directory
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to E:\books\Masters\Winter 2020\Semantic web\Assignment4


In [941]:
# test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from E:\books\Masters\Winter 2020\Semantic web\Assignment4
Entities [('2007', 'DATE')]
Tokens [('What', '', 2), ('is', '', 2), ('2007', 'DATE', 3), ('?', '', 2)]
Entities [('PETRA', 'ORG')]
Tokens [('What', '', 2), ('is', '', 2), ('PETRA', 'ORG', 3), ('?', '', 2)]
Entities [('new7wonder', 'ORG')]
Tokens [('What', '', 2), ('is', '', 2), ('new7wonder', 'ORG', 3), ('?', '', 2)]


Giving exactly same results hence working perfectly!