In [1]:
import sys, os
from collections import defaultdict
sys.path.append(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/src/')
from cimple_querying import build_SPARQL_wrapper, request, get_predicates_recordType, \
            get_all_statements_with_predicate, get_all_subjects_objects_pairs_from_property_and_subject

wrapper=build_SPARQL_wrapper('https://data.cimple.eu/sparql')

In [2]:
# uses namespaces as keys 
concept_types=defaultdict(list)
for statement in request(wrapper,'select distinct ?Concept where {[] a ?Concept} LIMIT 100'):
    if statement['Concept']['value'][:18]=='http://schema.org/':
        concept_types['schema'].append(statement['Concept']['value'].split('/')[-1])
    elif statement['Concept']['value'][:18]=='http://www.w3.org/':
        concept_types['w3'].append(statement['Concept']['value'])
    elif statement['Concept']['value'][:16]=='http://purl.org/':
        concept_types['dc'].append(statement['Concept']['value'])
    elif statement['Concept']['value'][:26]=='http://www.openlinksw.com/':
        concept_types['openlinksw'].append(statement['Concept']['value'])
    else:
        concept_types['others'].append(statement['Concept']['value'])


In [3]:
concept_types['schema']

['Review',
 'NewsArticle',
 'Organization',
 'SocialMediaPosting',
 'Rating',
 'ClaimReview',
 'Claim']

## Counts on records

In [14]:
def countquery(recordtype):
    count_query="""SELECT ?s
    WHERE |
    ?s a schema:{}.
    ¨""".format(recordtype).replace('|','{').replace('¨','}')
    return len(request(wrapper, count_query))
for i in concept_types['schema']:
    print(i,' ->', countquery(i),'records')

Review  -> 1983 records
NewsArticle  -> 10000 records
Organization  -> 120 records
SocialMediaPosting  -> 10000 records
Rating  -> 10000 records
ClaimReview  -> 10000 records
Claim  -> 10000 records


In [4]:
def extract_predicates(record_type, as_subject=True):
    cimple_predicates=[]
    schema_predicates=[]
    other=[]
    for statement in request(wrapper, get_predicates_recordType(record_type, as_subject=as_subject)):
        if statement['predicate']['value'][:31]=='http://data.cimple.eu/ontology#':
            cimple_predicates.append(statement['predicate']['value'])
        elif statement['predicate']['value'][:18]=='http://schema.org/':
            schema_predicates.append(statement['predicate']['value'])
        else:
            other.append(statement['predicate']['value'])
    other=[]
    return cimple_predicates, schema_predicates, other


## predicates for record types def as subjects

In [16]:
recordType_schema=defaultdict(dict)
for type in concept_types['schema']:
    r=extract_predicates(type, as_subject=True)
    recordType_schema[type]['cimple_predicates']=r[0]
    recordType_schema[type]['schema_predicates']=r[1]
    recordType_schema[type]['others']=r[2]

In [17]:
print('schema_predicates')
for type in recordType_schema.keys():
    print(type, ' -> ', recordType_schema[type]['schema_predicates'])


schema_predicates
Review  ->  ['http://schema.org/mentions', 'http://schema.org/reviewRating', 'http://schema.org/itemReviewed', 'http://schema.org/reviewBody', 'http://schema.org/isBasedOnURL']
NewsArticle  ->  ['http://schema.org/articleBody', 'http://schema.org/headline', 'http://schema.org/mentions', 'http://schema.org/dateCreated', 'http://schema.org/author']
Organization  ->  ['http://schema.org/name', 'http://schema.org/url']
SocialMediaPosting  ->  ['http://schema.org/text', 'http://schema.org/mentions', 'http://schema.org/dateCreated']
Rating  ->  ['http://schema.org/name', 'http://schema.org/sameAs', 'http://schema.org/ratingValue', 'http://schema.org/author']
ClaimReview  ->  ['http://schema.org/url', 'http://schema.org/headline', 'http://schema.org/alternativeHeadline', 'http://schema.org/mentions', 'http://schema.org/reviewRating', 'http://schema.org/dateCreated', 'http://schema.org/author', 'http://schema.org/datePublished', 'http://schema.org/inLanguage', 'http://schema.

In [18]:
print('cimple_predicates')
for type in recordType_schema.keys():
    print(type, ' -> ', recordType_schema[type]['cimple_predicates'])

cimple_predicates
Review  ->  ['http://data.cimple.eu/ontology#normalizedReviewRating', 'http://data.cimple.eu/ontology#readability_score']
NewsArticle  ->  ['http://data.cimple.eu/ontology#hasPoliticalLeaning', 'http://data.cimple.eu/ontology#hasSentiment', 'http://data.cimple.eu/ontology#readability_score', 'http://data.cimple.eu/ontology#hasEmotion', 'http://data.cimple.eu/ontology#promotesConspiracy', 'http://data.cimple.eu/ontology#mentionsConspiracy']
Organization  ->  []
SocialMediaPosting  ->  ['http://data.cimple.eu/ontology#related', 'http://data.cimple.eu/ontology#hasPoliticalLeaning', 'http://data.cimple.eu/ontology#hasSentiment', 'http://data.cimple.eu/ontology#readability_score', 'http://data.cimple.eu/ontology#hasEmotion', 'http://data.cimple.eu/ontology#promotesConspiracy', 'http://data.cimple.eu/ontology#mentionsConspiracy']
Rating  ->  []
ClaimReview  ->  ['http://data.cimple.eu/ontology#normalizedReviewRating', 'http://data.cimple.eu/ontology#hasPoliticalLeaning', 'h

## predicates for record types def as objects

In [19]:
recordType_schema_objects=defaultdict(dict)
for type in concept_types['schema']:
    r=extract_predicates(type, as_subject=False)
    recordType_schema_objects[type]['cimple_predicates']=r[0]
    recordType_schema_objects[type]['schema_predicates']=r[1]
    recordType_schema_objects[type]['others']=r[2]

In [20]:
for type in recordType_schema_objects.keys():
    print(type, ' -> ', recordType_schema_objects[type]['schema_predicates'])

Review  ->  []
NewsArticle  ->  []
Organization  ->  ['http://schema.org/author']
SocialMediaPosting  ->  ['http://schema.org/itemReviewed']
Rating  ->  ['http://schema.org/sameAs', 'http://schema.org/reviewRating']
ClaimReview  ->  []
Claim  ->  ['http://schema.org/itemReviewed']


In [21]:
for type in recordType_schema_objects.keys():
    print(type, ' -> ', recordType_schema_objects[type]['cimple_predicates'])

Review  ->  []
NewsArticle  ->  []
Organization  ->  []
SocialMediaPosting  ->  []
Rating  ->  ['http://data.cimple.eu/ontology#normalizedReviewRating']
ClaimReview  ->  ['http://data.cimple.eu/ontology#related']
Claim  ->  []


## Count of ClaimReview predicates

In [23]:
for predicate in recordType_schema['ClaimReview']['schema_predicates']:
    print(predicate, 'has count ->', len(get_all_subjects_objects_pairs_from_property_and_subject(wrapper, 'ClaimReview', predicate)))

http://schema.org/url has count -> 10000
http://schema.org/headline has count -> 5069
http://schema.org/alternativeHeadline has count -> 5069
http://schema.org/mentions has count -> 10000
http://schema.org/reviewRating has count -> 10000
http://schema.org/dateCreated has count -> 10000
http://schema.org/author has count -> 10000
http://schema.org/datePublished has count -> 10000
http://schema.org/inLanguage has count -> 10000
http://schema.org/itemReviewed has count -> 10000


In [24]:
for predicate in recordType_schema['ClaimReview']['cimple_predicates']:
    print(predicate, 'has count ->', len(get_all_subjects_objects_pairs_from_property_and_subject(wrapper, 'ClaimReview', predicate)))

http://data.cimple.eu/ontology#normalizedReviewRating has count -> 10000
http://data.cimple.eu/ontology#hasPoliticalLeaning has count -> 5069
http://data.cimple.eu/ontology#hasSentiment has count -> 5069
http://data.cimple.eu/ontology#readability_score has count -> 5069
http://data.cimple.eu/ontology#hasEmotion has count -> 3900
http://data.cimple.eu/ontology#mentionsConspiracy has count -> 391
http://data.cimple.eu/ontology#promotesConspiracy has count -> 22
