In [1]:
import pandas as pd
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.knowledge_graph import KnowledgeGraph

In [2]:

# Knowledge-graph handle for the DBLP graph URI, together with the namespace
# prefixes that the query-building calls refer to (swrc:, dc:, dcterm:, ...).
graph = KnowledgeGraph(
    prefixes=dict(
        xsd="http://www.w3.org/2001/XMLSchema#",
        swrc="http://swrc.ontoware.org/ontology#",
        rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        dc="http://purl.org/dc/elements/1.1/",
        dcterm="http://purl.org/dc/terms/",
        dblprc="http://dblp.l3s.de/d2r/resource/conferences/",
    ),
    graph_uri='http://dblp.l3s.de',
)

# HTTP client for the SPARQL endpoint; results are requested as pandas DataFrames.
output_format = HttpClientDataFormat.PANDAS_DF
port = 8890
endpoint = 'http://10.161.202.101:8890/sparql/'
client = HttpClient(
    return_format=output_format,
    port=port,
    endpoint_url=endpoint,
)

In [3]:
# Every swrc:InProceedings entity, expanded with its author, issue date,
# conference series and title; the result is cached for reuse below.
dataset = (
    graph.entities('swrc:InProceedings', entities_col_name='paper')
    .expand(
        src_col_name='paper',
        predicate_list=[
            ('dc:creator', 'author'),
            ('dcterm:issued', 'date'),
            ('swrc:series', 'conference'),
            ('dc:title', 'title'),
        ],
    )
    .cache()
)

# Authors with at least 20 papers in VLDB or SIGMOD dated 2000 or later.
authors = (
    dataset
    .filter({'date': ['>= 2000'], 'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})
    .group_by(['author'])
    .count('paper', 'papers_count')
    .filter({'papers_count': ['>= 20']})
)

# Titles of papers dated 2005 or later written by those authors.
titles = (
    dataset
    .join(authors, 'author')
    .filter({'date': ['>= 2005']})
    .select_cols(['title'])
)

In [4]:
# Show the SPARQL text that the `titles` frame compiles to.
sparql_text = titles.to_sparql()
print("Sparql Query = \n{}".format(sparql_text))

Sparql Query = 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX swrc: <http://swrc.ontoware.org/ontology#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterm: <http://purl.org/dc/terms/>
PREFIX dblprc: <http://dblp.l3s.de/d2r/resource/conferences/>
SELECT ?title 
FROM <http://dblp.l3s.de>
WHERE {
	?paper rdf:type swrc:InProceedings .
	?paper dc:creator ?author .
	?paper dcterm:issued ?date .
	?paper swrc:series ?conference .
	?paper dc:title ?title .
	FILTER (  (year(xsd:dateTime(?date)) >= 2005 ) ) 

		{
		SELECT ?author  (COUNT( ?paper) AS ?papers_count) 
		WHERE {
			?paper rdf:type swrc:InProceedings .
			?paper dc:creator ?author .
			?paper dcterm:issued ?date .
			?paper swrc:series ?conference .
			?paper dc:title ?title .
			FILTER (  (year(xsd:dateTime(?date)) >= 2000 ) &&  (?conference IN (dblprc:vldb, dblprc:sigmod) ) ) 
			} GROUP BY ?author 
		HAVING ( ( COUNT( ?paper) >= 20 ) )
		
		}
	}



In [5]:
# Run the `titles` query through the HTTP client and keep the result in `df`;
# `output_format` was set to HttpClientDataFormat.PANDAS_DF in the setup cell.
df = titles.execute(client, return_format=output_format)

time of the query preparation 0.0005693435668945312


In [6]:
# Quick look at the result: column index, (rows, columns) shape, first ten rows.
for summary in (df.columns, df.shape, df.head(10)):
    print(summary)

Index(['title'], dtype='object')
(4684, 1)
                                               title
0  A framework for using reference ontologies as ...
1  Regular Paths in SparQL: Querying the NCI Thes...
2  Automatic XQuery Generation and Generalized Vi...
3  Generating Application Ontologies from Referen...
4  Laziness is a Virtue: Motion Stitching Using E...
5  FMDistance: A Fast and Effective Distance Func...
6  Scalable modeling of real graphs using Kroneck...
7  Probabilistic Tensor Analysis with Akaike and ...
8  SLL: Running My Web Services on Your WS Platfo...
9  Opinion Fraud Detection in Online Reviews by N...


In [7]:
# Build a normalised `clean_title` column from the raw `title` column.

# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the titles uncleaned.
df['clean_title'] = df['title'].str.replace("[^a-zA-Z#]", " ", regex=True)
# Drop short words (three characters or fewer); str(x) guards against
# non-string cells such as missing values.
df['clean_title'] = df['clean_title'].apply(
    lambda x: ' '.join(w for w in str(x).split() if len(w) > 3))
# Make all text lowercase.
df['clean_title'] = df['clean_title'].str.lower()
print(df.head())

                                               title  \
0  A framework for using reference ontologies as ...   
1  Regular Paths in SparQL: Querying the NCI Thes...   
2  Automatic XQuery Generation and Generalized Vi...   
3  Generating Application Ontologies from Referen...   
4  Laziness is a Virtue: Motion Stitching Using E...   

                                         clean_title  
0  framework using reference ontologies foundatio...  
1            regular paths sparql querying thesaurus  
2  automatic xquery generation generalized visual...  
3  generating application ontologies from referen...  
4  laziness virtue motion stitching using effort ...  


In [8]:
import nltk
nltk.download('stopwords')

# Using the stopwords.
from nltk.corpus import stopwords
# English stop words (whitespace-stripped) plus the extra word 'based'.
stop_words = stopwords.words('english')
stop_words = [x.strip() for x in stop_words] + ['based']
# Set copy used only for fast membership tests; `stop_words` stays a list.
stop_word_set = set(stop_words)

# tokenization
tokenized_doc = df['clean_title'].apply(lambda x: x.split())
# remove stop-words
tokenized_doc = tokenized_doc.apply(
    lambda x: [item for item in x if item not in stop_word_set])
# de-tokenization: iterate the values in row order instead of looking them up
# by label, so this does not depend on `df` having a default 0..n-1 index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['clean_title'] = detokenized_doc

print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amohamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               title  \
0  A framework for using reference ontologies as ...   
1  Regular Paths in SparQL: Querying the NCI Thes...   
2  Automatic XQuery Generation and Generalized Vi...   
3  Generating Application Ontologies from Referen...   
4  Laziness is a Virtue: Motion Stitching Using E...   

                                         clean_title  
0  framework using reference ontologies foundatio...  
1            regular paths sparql querying thesaurus  
2  automatic xquery generation generalized visual...  
3  generating application ontologies reference on...  
4  laziness virtue motion stitching using effort ...  


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# TF-IDF weighting over the cleaned titles.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,  # keep top 1000 terms
    max_df=0.5,
    smooth_idf=True,
)

# Document-term matrix: one row per title.
X = vectorizer.fit_transform(df['clean_title'])

# Truncated SVD represents documents and terms as vectors over 20 components;
# fit() returns the fitted estimator, so construction and fitting are chained.
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)

len(svd_model.components_)

20

In [10]:
# Vocabulary terms in column order of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# For each SVD component, print its 7 highest-weighted terms.
for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
    # Each term is followed by a space, matching the original output format.
    string = "Topic " + str(i) + ": " + "".join(t[0] + " " for t in sorted_terms)
    print(string)

Topic 0: data mining query processing management streams networks 
Topic 1: data management streams quality integration cleaning structured 
Topic 2: query processing queries stream continuous optimization distributed 
Topic 3: mining search graphs large graph databases scale 
Topic 4: search efficient databases keyword similarity database queries 
Topic 5: queries database management systems using large answering 
Topic 6: database management systems information query search applications 
Topic 7: large graphs scale search efficient processing social 
Topic 8: databases large probabilistic graphs information scale uncertain 
Topic 9: graph information detection processing approach extraction using 
Topic 10: information efficient extraction mining queries management graphs 
Topic 11: efficient time databases computation distributed series detection 
Topic 12: management processing distributed stream mining queries systems 
Topic 13: time series databases real distributed detection sys