# Topic Modeling on DBLP

In [11]:
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.knowledge_graph import KnowledgeGraph

## Choose the graph and define the SPARQL endpoint URI

In [12]:
# Namespace prefixes registered with the knowledge graph; the query cells
# refer to predicates and resources through these short names.
dblp_prefixes = dict(
    xsd="http://www.w3.org/2001/XMLSchema#",
    swrc="http://swrc.ontoware.org/ontology#",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    dc="http://purl.org/dc/elements/1.1/",
    dcterm="http://purl.org/dc/terms/",
    dblprc="http://dblp.l3s.de/d2r/resource/conferences/",
)
graph = KnowledgeGraph(graph_uri='http://dblp.l3s.de', prefixes=dblp_prefixes)

# SPARQL endpoint settings and the HTTP client used to run queries.
# PANDAS_DF is requested as the return format for query results.
endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890
output_format = HttpClientDataFormat.PANDAS_DF
client = HttpClient(
    endpoint_url=endpoint,
    port=port,
    return_format=output_format,
)

## Build a dataframe of paper titles from the graph

In [13]:
# All 'swrc:InProceedings' entities, each expanded with its author, issue
# date, conference series and title.
paper_attributes = [
    ('dc:creator', 'author'),
    ('dcterm:issued', 'date'),
    ('swrc:series', 'conference'),
    ('dc:title', 'title'),
]
dataset = graph.entities('swrc:InProceedings', entities_col_name='paper')
dataset = dataset.expand(src_col_name='paper', predicate_list=paper_attributes)
dataset = dataset.cache()

# Authors with at least 20 papers in the VLDB or SIGMOD series dated 2000 or later.
venue_and_date = {
    'date': ['>= 2000'],
    'conference': ['IN (dblprc:vldb, dblprc:sigmod)'],
}
authors = (
    dataset.filter(venue_and_date)
    .group_by(['author'])
    .count('paper', 'papers_count')
    .filter({'papers_count': ['>= 20']})
)

# Join the papers with those authors on 'author', keep the rows dated 2010 or
# later, and project down to the title column.
titles = (
    dataset.join(authors, 'author')
    .filter({'date': ['>= 2010']})
    .select_cols(['title'])
)

## Execute RDFframes code to get the result in a dataframe

In [5]:
# Run the `titles` query through `client` and preview the first ten rows.
df = titles.execute(client, return_format=output_format)
first_rows = df.head(10)
print(first_rows)

                                               title
0  Opinion Fraud Detection in Online Reviews by N...
1  WindMine: Fast and Effective Mining of Web-cli...
2           Query Log Attack on Encrypted Databases.
3     TPC-BiH: A Benchmark for Bitemporal Databases.
4  Recommending People in Developers' Collaborati...
5  Discovering Subsumption Relationships for Web-...
6  Location based Social Network analysis using T...
7      Cost and Quality Trade-Offs in Crowdsourcing.
8  Randomly Partitioned Encryption for Cloud Data...
9                        Structured Data on the Web.


## Clean the data

In [6]:
# Replace every character other than ASCII letters and '#' with a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the titles unchanged.
df['clean_title'] = df['title'].str.replace("[^a-zA-Z#]", " ", regex=True)
# Drop short words (three characters or fewer).
df['clean_title'] = df['clean_title'].apply(
    lambda x: ' '.join([w for w in str(x).split() if len(w) > 3]))
# Make all text lowercase.
df['clean_title'] = df['clean_title'].str.lower()
print(df.head())

                                               title  \
0  Opinion Fraud Detection in Online Reviews by N...   
1  WindMine: Fast and Effective Mining of Web-cli...   
2           Query Log Attack on Encrypted Databases.   
3     TPC-BiH: A Benchmark for Bitemporal Databases.   
4  Recommending People in Developers' Collaborati...   

                                         clean_title  
0  opinion fraud detection online reviews network...  
1     windmine fast effective mining click sequences  
2                   query attack encrypted databases  
3                     benchmark bitemporal databases  
4  recommending people developers collaboration n...  


In [7]:
import nltk
nltk.download('stopwords')

# Using the stopwords.
from nltk.corpus import stopwords
# NLTK's English stop-word list, whitespace-stripped, plus the extra word 'based'.
stop_words = stopwords.words('english')
stop_words = [x.strip() for x in stop_words] + ['based']
# Set copy of the list so each token is checked in constant time.
stop_word_set = set(stop_words)

# tokenization: split each cleaned title on whitespace
tokenized_doc = df['clean_title'].apply(lambda x: x.split())
# remove stop-words
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [item for item in tokens if item not in stop_word_set])
# de-tokenization: iterate over the values rather than indexing by label, so
# the result does not depend on df having a default 0..n-1 index
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['clean_title'] = detokenized_doc

print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amohamed/nltk_data...


                                               title  \
0  Opinion Fraud Detection in Online Reviews by N...   
1  WindMine: Fast and Effective Mining of Web-cli...   
2           Query Log Attack on Encrypted Databases.   
3     TPC-BiH: A Benchmark for Bitemporal Databases.   
4  Recommending People in Developers' Collaborati...   

                                         clean_title  
0  opinion fraud detection online reviews network...  
1     windmine fast effective mining click sequences  
2                   query attack encrypted databases  
3                     benchmark bitemporal databases  
4  recommending people developers collaboration n...  


[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# TF-IDF settings, gathered in one mapping before building the vectorizer.
tfidf_options = dict(
    stop_words='english',
    max_features=1000,  # keep top 1000 terms
    max_df=0.5,
    smooth_idf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Document-term matrix computed from the cleaned titles.
X = vectorizer.fit_transform(df['clean_title'])

# Truncated SVD with 20 components; each row of components_ is later printed
# as one topic. random_state is pinned so repeated runs use the same seed.
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

svd_model.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
             random_state=122, tol=0.0)

In [9]:
# Vocabulary terms of the fitted vectorizer; they are paired positionally with
# each component's weights below.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Number of highest-weighted terms printed per topic.
n_top_terms = 7

for i, comp in enumerate(svd_model.components_):
    # Pair every term with its weight in this component, heaviest first.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:n_top_terms]
    # Each term is followed by a single space, so every printed line ends with
    # a trailing space.
    print("Topic " + str(i) + ": " + "".join(t[0] + " " for t in sorted_terms))

Topic 0: data mining networks management query social processing 
Topic 1: networks social information heterogeneous large mining graph 
Topic 2: query processing graph large graphs efficient mining 
Topic 3: query processing database social networks queries memory 
Topic 4: mining query crowd search frequent processing exploration 
Topic 5: large graphs social query search scale processing 
Topic 6: search database efficient keyword memory time social 
Topic 7: database mining social memory systems analytics service 
Topic 8: graph social event pattern analytics online streams 
Topic 9: queries using answering knowledge time event network 
Topic 10: time analytics real series information processing event 
Topic 11: databases probabilistic detection knowledge analysis time network 
Topic 12: search knowledge using extraction analysis social entity 
Topic 13: detection community event efficient memory streams aware 
Topic 14: multi learning classification feature detection label selecti