In [859]:
import pandas as pd

# Load the candidate strings to match; the 'Wonders' column is the one the later cells use.
names =  pd.read_csv('stringMatch.csv')
# Preview the first ten rows.
names.head(10)

Unnamed: 0,Wonders
0,UNESCO World Heritage Site
1,New7Wonder
2,Peru
3,Colosseum
4,Taj Mahal
5,Great Pyramid of Giza
6,Machu Picchu
7,Petra
8,"United Nations Educational, Scientific and Cultural Organization"
9,1931 AD


In [860]:
#Code for creating lower-cased character n-grams (single characters, i.e. unigrams, when n=1)
import re

def unigrams(string, n=1):
    """Split ``string`` into lower-cased character n-grams.

    Args:
        string: Text to split.
        n: Number of consecutive characters per gram (1 yields single characters).

    Returns:
        List of the overlapping n-character substrings, in order of appearance.
        The list is empty when ``string`` is shorter than ``n`` or ``n`` is 0.
    """
    lowered = string.lower()
    # n copies of the text, each shifted one character further; zipping them
    # column-wise lines up the characters of every sliding window.
    shifted = (lowered[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

In [861]:
#Vectorising, TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Strings that will be compared against each other.
wonder_names = names['Wonders']
# The custom analyzer makes every lower-cased character a feature; min_df=2 drops
# features that occur in fewer than two strings.
vectorizer = TfidfVectorizer(min_df=2, analyzer=unigrams)
# Sparse TF-IDF matrix with one row per entry of wonder_names.
tf_idf_matrix = vectorizer.fit_transform(wonder_names)

# Cosine Similarity

In [862]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def cosine_sim(A, B, ntop, lower_bound=0):
    """Multiply two sparse matrices through ``sparse_dot_topn`` and return a CSR result.

    Args:
        A: Left sparse matrix (converted to CSR); its row count is the result's row count.
        B: Right sparse matrix (converted to CSR); its column count is the result's column count.
        ntop: Per-row capacity of the output buffers (``rows * ntop`` slots are allocated).
        lower_bound: Threshold forwarded to ``sparse_dot_topn``.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(A.shape[0], B.shape[1])`` built from
        the buffers filled by ``sparse_dot_topn``.

    NOTE(review): presumably the extension keeps, for each row, at most ``ntop``
    products greater than ``lower_bound``, and the values are cosine similarities
    only when the rows of ``A`` / columns of ``B`` are L2-normalised - confirm
    against the sparse_dot_topn documentation.
    """
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32
    capacity = n_rows * ntop

    # Output buffers in CSR layout; the extension writes into them in place.
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    def as_index(array):
        # The extension is handed int32 index arrays.
        return np.asarray(array, dtype=index_type)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        as_index(left.indptr), as_index(left.indices), left.data,
        as_index(right.indptr), as_index(right.indices), right.data,
        ntop, lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [863]:
# Only entities with a similarity above 0.86 are added to matches
# (10 is the ntop argument of cosine_sim, 0.86 its lower_bound).
matches = cosine_sim(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.86)

In [864]:
def get_matches_df(sparse_matrix, name_vector, top=1):
    """Turn the non-zero entries of a similarity matrix into a table of name pairs.

    Args:
        sparse_matrix: SciPy sparse matrix whose entry (i, j) is the similarity
            between ``name_vector[i]`` and ``name_vector[j]``.
        name_vector: Sequence (list, array or pandas Series) of names, ordered
            like the rows/columns of ``sparse_matrix``. Names are looked up by
            position.
        top: Maximum number of matches to return, in row-major order. A falsy
            value (``None`` or ``0``) returns every match. Values larger than
            the number of available matches are clamped instead of raising.

    Returns:
        ``pandas.DataFrame`` with columns ``left_side``, ``right_side`` and
        ``similairity`` (spelling kept for backward compatibility).

    Raises:
        ValueError: If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("top must be non-negative")

    # Take rows, columns and values from the same COO view so they stay aligned
    # even if the matrix stores explicit zeros (which are skipped, as before).
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    rows = coo.row[stored]
    cols = coo.col[stored]
    scores = coo.data[stored]

    available = rows.size
    nr_matches = min(top, available) if top else available

    names = np.asarray(name_vector, dtype=object)
    picked = slice(0, nr_matches)

    return pd.DataFrame({'left_side': names[rows[picked]],
                         'right_side': names[cols[picked]],
                         'similairity': scores[picked].astype(float)})

In [865]:
#determine the shape of the full match table; top=None makes get_matches_df return every match
#(computed directly from `matches`, so this cell does not depend on a variable defined further down)
get_matches_df(matches, wonder_names, top=None).shape

(61, 3)

In [866]:
#display every generated match (61 rows for this dataset); top=None asks get_matches_df
#for all of them instead of hard-coding the row count
matches_df = get_matches_df(matches, wonder_names, top=None)
matches_df

Unnamed: 0,left_side,right_side,similairity
0,UNESCO World Heritage Site,Unesco world heritage site,1.000000
1,UNESCO World Heritage Site,UNESCO World Heritage Site,1.000000
2,New7Wonder,New7Wonder,1.000000
3,New7Wonder,New7Wonders,0.976562
4,Peru,Peru,1.000000
...,...,...,...
56,1931 BC,1931 A.D.,0.922880
57,1931 BC,1931 AD,0.922880
58,Roman Colosseum,Roman Colosseum,1.000000
59,Roman Colosseum,Colosseum,0.887435


# Edit Distance

In [870]:
def editDist(str1, str2):
    """Return the edit (Levenshtein) distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Minimum number of single-character edits that turn one string into the other.
    """
    # previous[j] holds the distance between the first j characters of str1 and
    # the prefix of str2 handled so far (initially the empty prefix).
    previous = list(range(len(str1) + 1))
    for i, ch2 in enumerate(str2, start=1):
        current = [i]
        for j, ch1 in enumerate(str1, start=1):
            if ch1 == ch2:
                # Matching characters cost nothing extra.
                current.append(previous[j - 1])
            else:
                current.append(min(previous[j], current[j - 1], previous[j - 1]) + 1)
        previous = current

    return previous[len(str1)]

In [871]:
# Sanity check: 'Peruu' is 'Peru' plus one inserted character, so the distance is 1.
editDist('Peru', 'Peruu')

1

# Jaccard Similarity

In [869]:
def jaccard_distance(Str1, Str2):
    """Return the case-insensitive Jaccard similarity of the characters in two strings.

    Despite the name, the value is a similarity: 1.0 means both strings use
    exactly the same set of characters, 0.0 means they share none.

    Args:
        Str1: First string.
        Str2: Second string.

    Returns:
        ``|chars(Str1) & chars(Str2)| / |chars(Str1) | chars(Str2)|`` after
        lower-casing both strings. Two empty strings are treated as identical
        (1.0) instead of dividing by zero.
    """
    chars1 = set(Str1.lower())
    chars2 = set(Str2.lower())
    union = chars1 | chars2
    if not union:
        # Both inputs are empty: identical by convention, and avoids 0/0.
        return 1.0
    return 1.0 * len(chars1 & chars2) / len(union)

In [812]:
# Columns of the Jaccard results table: left string, right string, score.
left_side =[]
right_side =[]
similarity=[]

In [834]:
# Pairs scored with jaccard_distance. Listing them explicitly keeps the cell
# idempotent: re-running it rebuilds the same table instead of appending another
# row to global lists after hand-editing the two strings.
jaccard_pairs = [
    ('UNESCO World Heritage Site', 'Unesco world heritage site'),
    ('UNESCO World Heritage Site', 'Heritage Site'),
    ('New7Wonder', 'New7Wonders'),
    ('New7Wonder', 'Seven Wonder'),
    ('Peru', 'Peruu'),
    ('Peru', 'Peroo'),
    ('Colosseum', 'Coloseum'),
    ('Colosseum', 'Roman Colosseum'),
    ('Colosseum', 'Kolosium'),
    ('Taj Mahal', 'Taj Mehel'),
    ('1931 AD', '1931 BC'),
]

left_side = [left for left, _ in jaccard_pairs]
right_side = [right for _, right in jaccard_pairs]
similarity = [jaccard_distance(left, right) for left, right in jaccard_pairs]

jacDf = pd.DataFrame({'left_side':left_side, 'right_side':right_side, 'similarity':similarity})
jacDf

Unnamed: 0,left_side,right_side,similarity
0,UNESCO World Heritage Site,Unesco world heritage site,1.0
1,UNESCO World Heritage Site,Heritage Site,0.5625
2,New7Wonder,New7Wonders,0.875
3,New7Wonder,Seven Wonder,0.6
4,Peru,Peruu,1.0
5,Peru,Peroo,0.6
6,Colosseum,Coloseum,1.0
7,Colosseum,Roman Colosseum,0.636364
8,Colosseum,Kolosium,0.555556
9,Taj Mahal,Taj Mehel,0.875


In [None]:
#Jaccard similarity without lower-casing (case-sensitive variant; replaces the jaccard_distance defined earlier)
def jaccard_distance(Str1, Str2):
    """Return the case-sensitive Jaccard similarity of the characters in two strings.

    Despite the name, the value is a similarity: 1.0 means both strings use
    exactly the same set of characters, 0.0 means they share none. Upper- and
    lower-case letters count as different characters.

    Args:
        Str1: First string.
        Str2: Second string.

    Returns:
        ``|chars(Str1) & chars(Str2)| / |chars(Str1) | chars(Str2)|``. Two empty
        strings are treated as identical (1.0) instead of dividing by zero.
    """
    chars1 = set(Str1)
    chars2 = set(Str2)
    union = chars1 | chars2
    if not union:
        # Both inputs are empty: identical by convention, and avoids 0/0.
        return 1.0
    return 1.0 * len(chars1 & chars2) / len(union)

In [None]:
# Columns of the case-sensitive Jaccard results table: left string, right string, score.
left_side =[]
right_side =[]
similarity=[]

In [714]:
# Pairs scored with jaccard_distance. Listing them explicitly keeps the cell
# idempotent: re-running it rebuilds the same table instead of appending another
# row to global lists after hand-editing the two strings.
jaccard_pairs = [
    ('UNESCO World Heritage Site', 'Unesco world heritage site'),
    ('UNESCO World Heritage Site', 'Heritage site'),
    ('New7Wonder', 'New7Wonders'),
    ('New7Wonder', 'Seven Wonder'),
    ('Peru', 'Peruu'),
    ('Peru', 'Peroo'),
    ('Colosseum', 'Coloseum'),
    ('Colosseum', 'Roman Colosseum'),
    ('Colosseum', 'Kolosium'),
    ('Taj Mahal', 'Taj Mehel'),
]

left_side = [left for left, _ in jaccard_pairs]
right_side = [right for _, right in jaccard_pairs]
similarity = [jaccard_distance(left, right) for left, right in jaccard_pairs]

jacDf = pd.DataFrame({'left_side':left_side, 'right_side':right_side, 'similarity':similarity})
jacDf

Unnamed: 0,left_side,right_side,similarity
0,UNESCO World Heritage Site,Unesco world heritage site,0.478261
1,UNESCO World Heritage Site,Heritage site,0.421053
2,New7Wonder,New7Wonders,0.9
3,New7Wonder,Seven Wonder,0.5
4,Peru,Peruu,1.0
5,Peru,Peroo,0.6
6,Colosseum,Coloseum,1.0
7,Colosseum,Roman Colosseum,0.636364
8,Colosseum,Kolosium,0.555556
9,Taj Mahal,Taj Mehel,0.875


# NL to SPARQL

In [872]:
# Natural-language question that the following cells normalize and analyse.
ques = 'What type of structures are in both lists?'

In [873]:
#Importing libraries
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from contractions import CONTRACTION_MAP
import unicodedata

#Reference for contractions: https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/notebooks/Ch07_Analyzing_Movie_Reviews_Sentiment/contractions.py

In [874]:
def remove_accented_chars(text):
    """Strip accents and other non-ASCII characters from ``text``.

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; encoding to ASCII with ``ignore`` then drops the marks
    (and any other character that has no ASCII form).

    Args:
        text: Input string.

    Returns:
        The ASCII-only version of ``text``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [875]:
def remove_special_characters(text):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters deleted.
    """
    # The range must be A-Z: the previous A-z also spanned the ASCII characters
    # between 'Z' and 'a' ([ \ ] ^ _ `), so those were never removed.
    # A raw string keeps \s from being treated as an invalid string escape.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [876]:
def lemmatize_text(text):
    """Replace every token of ``text`` by its lemma, joined with single spaces.

    Uses the module-level ``nlp`` pipeline. Tokens whose lemma is the
    placeholder ``'-PRON-'`` keep their original text.

    Args:
        text: Input string.

    Returns:
        Space-joined string of lemmas.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

In [877]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction (e.g. ``"can't"``) to
            its expansion (e.g. ``"cannot"``). Matching is case-insensitive.

    Returns:
        ``text`` with every known contraction replaced by its expansion (the
        first character of the original match is kept, so capitalisation
        survives) and all remaining ``'`` characters removed.
    """
    def expand_match(contraction):
        match = contraction.group(0)
        # Try the exact spelling first, then the lower-cased one.
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion for this spelling: leave the text untouched
            # instead of failing on a None lookup.
            return match
        return match[0] + expanded_contraction[1:]

    expanded_text = text
    # An empty mapping would compile to '()', which matches the empty string
    # everywhere; skip the substitution in that case.
    if contraction_mapping:
        # Regex alternation takes the first alternative that matches, so longer
        # contractions must come first ("can't've" before "can't"). Keys are
        # escaped so they are always matched literally.
        ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE|re.DOTALL)
        expanded_text = contractions_pattern.sub(expand_match, text)

    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [878]:
# spaCy English pipeline used by lemmatize_text and by the POS / entity cells.
# NOTE(review): the 'en' shortcut and the parse/tag/entity keywords look like the
# spaCy 2.x API - confirm against the installed spaCy version.
nlp = spacy.load('en', parse=True, tag=True, entity=True)
# Tokenizer used by remove_stopwords.
tokenizer = ToktokTokenizer()

#Removing negation words from the list of stop words since they might be useful in providing meaning to a sentence
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

#Function to remove stop words
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of ``text`` that appears in the module-level ``stopword_list``.

    Args:
        text: Input string; it is split with the module-level ``tokenizer``.
        is_lower_case: Set to True when ``text`` is already lower-cased, so the
            tokens are compared as-is; otherwise each token is lower-cased for
            the comparison only.

    Returns:
        The surviving tokens (original casing) joined with single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        word = raw_token.strip()
        probe = word if is_lower_case else word.lower()
        if probe not in stopword_list:
            kept.append(word)
    return ' '.join(kept)

In [879]:
#Creating the text normalizer

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize every document of ``corpus`` and return the cleaned documents.

    Steps run in this order; each flagged step can be switched off:
    accent removal, contraction expansion, lower-casing, newline collapsing
    (always), lemmatization, special-character removal, whitespace collapsing
    (always), stop-word removal.

    Args:
        corpus: Iterable of strings.
        html_stripping: Accepted for interface compatibility; currently not
            used by this implementation.
        contraction_expansion: Expand contractions via ``expand_contractions``.
        accented_char_removal: Strip accents via ``remove_accented_chars``.
        text_lower_case: Lower-case the text.
        text_lemmatization: Lemmatize via ``lemmatize_text``.
        special_char_removal: Remove special characters via
            ``remove_special_characters``.
        stopword_removal: Remove stop words via ``remove_stopwords``.
        remove_digits: Accepted for interface compatibility; currently not
            used by this implementation.

    Returns:
        List with one normalized string per input document.
    """
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # Replace each run of newline characters by one space. Inside a
        # character class '|' is a literal, so the previous pattern
        # '[\r|\n|\r\n]+' also turned pipe characters into spaces.
        doc = re.sub(r'[\r\n]+', ' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters
        if special_char_removal:
            doc = remove_special_characters(doc)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [884]:
# Run the question through normalize_corpus with every default step enabled;
# the result is a one-element list.
corpus = normalize_corpus([ques])
print(corpus)

['type structure list']


In [None]:
# Join the normalized documents into one string (space-separated so words of
# different documents do not run together).
listToStr = ' '.join([str(elem) for elem in corpus])
sentence = listToStr
# The spaCy pipeline takes a string, so parse the joined sentence rather than
# the corpus list.
sentence_nlp = nlp(sentence)

In [882]:
# One row per token of the parsed sentence: the token, its fine-grained tag
# (tag_) and its coarse part-of-speech label (pos_).
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
posDF = pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])
posDF.head(15)

Unnamed: 0,Word,POS tag,Tag type
0,type,NN,NOUN
1,structure,NN,NOUN
2,list,NN,NOUN


In [883]:
#Printing all the named entities from the pre-processed corpus
# Each entry is (token, entity label); tokens without an entity label are skipped.
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

[]


In [754]:
#Finding the most frequent named entities from the corpus
# Collect named entities document by document: consecutive tokens that carry an
# entity label are merged into one entity name, tagged with the label of the
# last token in the run.
named_entities = []
for document in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    for word in nlp(document):
        term = word.text 
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None
    # An entity that runs up to the last token has no following non-entity
    # token to trigger the append above, so flush it here.
    if temp_named_entity:
        named_entities.append(temp_named_entity)

entity_frame = pd.DataFrame(named_entities, 
                            columns=['Entity Name', 'Entity Type'])
# Count each (name, type) pair and order the pairs from most to least frequent.
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
# Show at most the first 15 entities, one per column.
top_entities.T.iloc[:,:15]

Unnamed: 0,0
Entity Name,new7wonder
Entity Type,ORG
Frequency,1


In [None]:
# SPARQL query template. Fill in the class with sparqlTemplate.format(entity=...);
# the doubled braces are literal '{' / '}' for str.format.
sparqlTemplate = 'SELECT DISTINCT ?obj WHERE{{?obj rdfs:subClassOf dbo:{entity}}}'

In [768]:
# Entity names found in the corpus, most frequent first.
strng = top_entities['Entity Name']
strng

0    new7wonder
Name: Entity Name, dtype: object
