In [44]:
import json
# import SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
from tqdm import tqdm
# import rdflib
import math
import ast

In [5]:
# functions for SPARQL queries to pandas dataframes 

url = 'https://api.cooperationdatabank.org/datasets/coda/databank/services/databank/sparql'

#  SPARQL queries for more then 10,000 triples
#  set num_iter to amount of expected returned triples (e.g. 20 for 20,000 triples) 
def get_sparql_dataframe(service, query):
    sparql = SPARQLWrapper(service)
    out = []
    num_iter = 20
    
    for i in range(num_iter):
        sparql.setQuery(query + " OFFSET " + str(i) + "000")
        sparql.setReturnFormat(JSON)
        result = sparql.query()
        processed_results = json.load(result.response)
        cols = processed_results['head']['vars']

        for row in processed_results['results']['bindings']:
            item = []
            for c in cols:
                item.append(row.get(c, {}).get('value'))
            out.append(item)
        
    return pd.DataFrame(out, columns=cols)


#  SPARQL queries for less then 10,000 triples 
def get_sparql_dataframe_small(service, query):
    sparql = SPARQLWrapper(service)
    out = []
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()
    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)
        
    return pd.DataFrame(out, columns=cols)

#  SPARQL construct queries for less then 10,000 triples 
def get_sparql_dataframe_construct(service, query):
    sparql = SPARQLWrapper(service)
    out = []
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()
    processed_results = json.load(result.response)
    cols = ['s', 'p', 'o']

    for row in processed_results:
        item = []
        for c in cols:
            item.append(row.get(c))
        out.append(item)
        
    return pd.DataFrame(out, columns=cols)


# 1. Data Selection

In [6]:
#  Get all useful observations for preprocessing

query = """
PREFIX property: <https://data.cooperationdatabank.org/vocab/prop/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX id: <https://data.cooperationdatabank.org/id/>
PREFIX class: <https://data.cooperationdatabank.org/vocab/class/>

SELECT * WHERE {
  
  ?obs rdf:type class:Observation .
  ?obs property:eSmeasure <https://data.cooperationdatabank.org/id/esmeasure/d> . 
  ?obs property:dependentVariable ?dependent . 
  ?obs property:eSEstimate ?ES .
  ?obs property:effectSizeSampleSize ?N . 

  ?obs property:treatment ?t1, ?t2 . 
  ?study property:reportsEffect ?obs . 
  ?t1 property:betweenOrWithinParticipantsDesign ?design . 


OPTIONAL {
  ?t1 property:nCondition ?n1 . 
  ?t2 property:nCondition ?n2 .

  ?t1 property:sDforCondition ?sd1 . 
  ?t2 property:sDforCondition ?sd2 .
  } 

OPTIONAL { 
  ?obs property:effectSizeLowerLimit ?ESLower . 
  ?obs property:effectSizeUpperLimit ?ESUpper .  
}

OPTIONAL {
  ?paper property:study ?study . 
  ?paper property:doi ?doi . }
  
  FILTER (?t1 < ?t2) 
} 
"""

observations = get_sparql_dataframe_small(url, query)

In [7]:
#  inspect dataframe 

pd.set_option('max_colwidth', 100)
pd.set_option('max_row', 100)
observations 

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.24393,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28,29,3.36,3.7,0.676655,1.8112,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.08443,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28,29,3.36,4.08,0.528299,1.64057,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.0872995,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29,29,3.7,4.08,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96,96,3.288,4.281,-0.60883,-0.0393351,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96,48,3.91,3.52,-0.0326996,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4767,https://data.cooperationdatabank.org/id/ENG02674_1.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.11775,80.0,https://data.cooperationdatabank.org/id/ENG02674_1.1.1,https://data.cooperationdatabank.org/id/ENG02674_1.1.2,https://data.cooperationdatabank.org/id/ENG02674_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.617696,0.382197,https://data.cooperationdatabank.org/id/ENG02674,http://dx.doi.org/10.1111/j.1813-6982.2007.00127.x
4768,https://data.cooperationdatabank.org/id/ENG02682_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.158607,40.0,https://data.cooperationdatabank.org/id/ENG02682_1.2.1,https://data.cooperationdatabank.org/id/ENG02682_1.2.2,https://data.cooperationdatabank.org/id/ENG02682_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.589394,0.906608,https://data.cooperationdatabank.org/id/ENG02682,http://dx.doi.org/10.1016/j.jtrangeo.2013.05.012
4769,https://data.cooperationdatabank.org/id/ENG02690_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.141421,50.0,https://data.cooperationdatabank.org/id/ENG02690_1.2.1,https://data.cooperationdatabank.org/id/ENG02690_1.2.2,https://data.cooperationdatabank.org/id/ENG02690_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.696475,0.413633,https://data.cooperationdatabank.org/id/ENG02690,http://dx.doi.org/10.1177/0146167210391103
4770,https://data.cooperationdatabank.org/id/ENG02506_1.5.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/withdrawals,-1.29771,11.0,https://data.cooperationdatabank.org/id/ENG02506_1.5.1,https://data.cooperationdatabank.org/id/ENG02506_1.5.2,https://data.cooperationdatabank.org/id/ENG02506_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-2.59807,0.00265933,https://data.cooperationdatabank.org/id/ENG02506,http://dx.doi.org/10.1016/j.ecolecon.2008.01.015


In [8]:
#  set dtypes

observations['independentProperties'] = np.nan
observations = observations.astype('object')
observations.dtypes

obs                      object
dependent                object
ES                       object
N                        object
t1                       object
t2                       object
study                    object
design                   object
n1                       object
n2                       object
sd1                      object
sd2                      object
ESLower                  object
ESUpper                  object
paper                    object
doi                      object
independentProperties    object
dtype: object

In [9]:
def find_differences(t1,t2): 
    t1 = t1.reset_index(drop=True)
    t2 = t2.reset_index(drop=True)
    
    df = pd.concat([t1, t2], keys=['t1', 't2'])
    df_gpby = df.groupby(list(df.columns))
    idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
    
    return df.reindex(idx).sort_index()


#  Filter out non-relevant predicates
def filter_predicates(some_list):
    
    #  These predicates usually differ for all treatments that have them 
    filter_outs = ['http://www.w3.org/2000/01/rdf-schema#label',
                   'https://data.cooperationdatabank.org/vocab/prop/meanContributionOrWithdrawalForCondition',
                   'https://data.cooperationdatabank.org/vocab/prop/nCondition',
                   'https://data.cooperationdatabank.org/vocab/prop/sDforCondition', 
                   'https://data.cooperationdatabank.org/vocab/prop/proportionOfCooperationCondition', 
                   'https://data.cooperationdatabank.org/vocab/prop/individualDifferenceLevel',
                   'https://data.cooperationdatabank.org/vocab/prop/nbOfLevels'] 
    
    for uri in filter_outs: 
        if uri in some_list: 
            some_list.remove(uri)
    
    return some_list 

#  Find all the predicates on which two treatments differ 
def find_differing_independents(obs): 
    outcome = pd.DataFrame()
    loop = tqdm(total = len(obs), position=0, leave=False)
    
    for i, row in obs.iterrows(): 
        row = observations.loc[i]
        observation = row['obs']
        treatments = {'t1': outcome, 't2': outcome}

        for treatment in treatments: 
            query = """
                        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                        PREFIX class: <https://data.cooperationdatabank.org/vocab/class/>

                        SELECT ?p ?o WHERE {
                         <""" + str(row[treatment]) + """> ?p ?o .                   
                        } 
                        """

            treatments[treatment] = get_sparql_dataframe_small(url, query)
        
        #  Show percentage bar
        loop.set_description("Loading...".format(i))
        loop.update(1)

        df_differences = find_differences(treatments['t1'].sort_values(by=['p']) , treatments['t2'].sort_values(by=['p']))
        df_differences = df_differences.reset_index(drop=True)
        only_in_one_t = df_differences['p'].drop_duplicates(keep=False).index
        different_predicates = df_differences['p'].drop(only_in_one_t).unique().tolist()
        independent_variables = filter_predicates(different_predicates)
        
        if independent_variables == []: 
            obs = obs.drop([i])
        else: 
            obs.at[i, 'independentProperties'] = independent_variables

    loop.close()
    return obs 
observations = find_differing_independents(observations)

                                                                                                                       

In [11]:
#  inspect observations table again 
observations = observations.reset_index(drop=True)
pd.set_option('max_rows', 100)
pd.set_option('max_colwidth', 100)
observations

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi,independentProperties
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.24393,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28,29,3.36,3.7,0.676655,1.8112,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality]
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.08443,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28,29,3.36,4.08,0.528299,1.64057,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality]
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.0872995,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29,29,3.7,4.08,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/positionInGame]
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96,96,3.288,4.281,-0.60883,-0.0393351,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/expectationsLevel]
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96,48,3.91,3.52,-0.0326996,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3246,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.719742,126.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.297905,1.14158,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType]
3247,https://data.cooperationdatabank.org/id/ENG02668_1.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.0193469,107.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.43813,0.399437,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType]
3248,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.0690061,37.0,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1,https://data.cooperationdatabank.org/id/ENG02672_1a.1.2,https://data.cooperationdatabank.org/id/ENG02672_1a,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.888749,0.750737,https://data.cooperationdatabank.org/id/ENG02672,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType]
3249,https://data.cooperationdatabank.org/id/ENG02674_1.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.11775,80.0,https://data.cooperationdatabank.org/id/ENG02674_1.1.1,https://data.cooperationdatabank.org/id/ENG02674_1.1.2,https://data.cooperationdatabank.org/id/ENG02674_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.617696,0.382197,https://data.cooperationdatabank.org/id/ENG02674,http://dx.doi.org/10.1111/j.1813-6982.2007.00127.x,"[https://data.cooperationdatabank.org/vocab/prop/endowmentSize, https://data.cooperationdatabank..."


In [12]:
#  get all observations that only differ on one property 
listje = []

def unique(trends): 
    output = []
    for x in trends:
        if x not in output:
            output.append(x)
    return output
 
for index, row in observations.iterrows(): 
    if len(row['independentProperties']) == 1: 
        listje.append(row['obs'])
        
unique_obs = unique(listje)
len(unique_obs)

2491

In [13]:
#  get the data table with the observations that differ on only one property
observations_unique = observations[observations['obs'].isin(unique_obs)]

In [15]:
# Store if you don't continue right away, to not have to run slow code again 
%store observations_unique

Stored 'observations_with_unique_independents' (DataFrame)


In [16]:
%store -r observations_unique
observations_unique

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi,independentProperties
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.24393,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28,29,3.36,3.7,0.676655,1.8112,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality]
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.08443,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28,29,3.36,4.08,0.528299,1.64057,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality]
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.0872995,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29,29,3.7,4.08,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/positionInGame]
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96,96,3.288,4.281,-0.60883,-0.0393351,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/expectationsLevel]
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96,48,3.91,3.52,-0.0326996,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3244,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.100519,48.0,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2,https://data.cooperationdatabank.org/id/ENG02667_1b.2.3,https://data.cooperationdatabank.org/id/ENG02667_1b,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.552567,0.753605,https://data.cooperationdatabank.org/id/ENG02667,http://dx.doi.org/10.2466/pr0.1975.36.2.371,[https://data.cooperationdatabank.org/vocab/prop/iteratedPreprogrammedCooperationRate]
3245,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.739089,127.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.31896,1.15922,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType]
3246,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.719742,126.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.297905,1.14158,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType]
3247,https://data.cooperationdatabank.org/id/ENG02668_1.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.0193469,107.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.43813,0.399437,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType]


In [17]:
observations_unique[["ES", "N", "n1", "n2", "sd1", "sd2", "ESLower", "ESUpper"]] = observations_unique[["ES", 
                           "N", "n1", "n2", "sd1", "sd2", "ESLower", "ESUpper"]].astype('float')

In [18]:
observations_unique.dtypes

obs                       object
dependent                 object
ES                       float64
N                        float64
t1                        object
t2                        object
study                     object
design                    object
n1                       float64
n2                       float64
sd1                      float64
sd2                      float64
ESLower                  float64
ESUpper                  float64
paper                     object
doi                       object
independentProperties     object
dtype: object

In [20]:
observations_unique['ESLower'].isnull().sum()

13

# Code for Calculating the standard error 
(not used for the current approach)

In [None]:
# Calculate SE for observations that contain extra metrics 
def calculate_SE(obs):
    for index, row in obs.iterrows(): 
        N = float(row['N'])
        ES = float(row['ES'])
        
      # Values from CoDa treatment 1 
        n1 = float(row['n1'])
        s1 = float(row['sd1'])

        # Values from CoDa treatment 2 
        n2 = float(row['n2'])
        s2 = float(row['sd2']) 
        
        if math.isnan(n1) or math.isnan(s1) or math.isnan(n2) or math.isnan(s2): 
            continue 
        else:    

            #  Between participants design 
            if row['design'] == 'https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1':         

                # Hedges' pooled and weighted standard deviation 
                SD_pooled = math.sqrt((((n1-1)*(s1**2)) + ((n2-1)*(s2**2)))/(n1+n2-2))

                #  standard error for treatment 1 and 2 respectively 
                SE1 = s1 / math.sqrt(n1)
                SE2 = s2 / math.sqrt(n2)

                #  standard error for difference of the means 
                SE_group_mean_difference = math.sqrt((SE1**2)+(SE2**2))

                #  standard error for the effect size 
                SE_effect_size = SE_group_mean_difference/SD_pooled


            #  Within participants design 
            elif row['design'] == 'https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/2':

                #  NOT SURE ABOUT THIS, approximation of the sd of differences 
                SD_difference = math.sqrt((((n1-1)*(s1**2)) + ((n2-1)*(s2**2))) / (n1 + n2 - 2)) 
                SE_difference = SD_difference / math.sqrt(N)

                # Standard deviation of the first treatment is used, don't have argumentation for this yet 
                SE_effect_size = SE_difference / s1


            obs.at[index, 'SE'] = SE_effect_size

    return obs

In [None]:
obs = calculate_SE(observations_with_unique_independents) 

# 2. Create ComplEx Input
## 2.1 Find IV Values 

In [23]:
obs = observations_with_unique_independents.reset_index(drop=True)
obs['categoryT1'] = np.nan
obs['categoryT2'] = np.nan

In [24]:
obs[["categoryT1", "categoryT2"]] = obs[["categoryT1", "categoryT2"]].astype('object')

In [25]:
obs

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi,independentProperties,categoryT1,categoryT2
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.243930,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,3.700,0.676655,1.811200,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],,
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.084430,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,4.080,0.528299,1.640570,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],,
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.087300,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29.0,29.0,3.700,4.080,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/positionInGame],,
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,96.0,3.288,4.281,-0.608830,-0.039335,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/expectationsLevel],,
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,48.0,3.910,3.520,-0.032700,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2486,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.100519,48.0,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2,https://data.cooperationdatabank.org/id/ENG02667_1b.2.3,https://data.cooperationdatabank.org/id/ENG02667_1b,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.552567,0.753605,https://data.cooperationdatabank.org/id/ENG02667,http://dx.doi.org/10.2466/pr0.1975.36.2.371,[https://data.cooperationdatabank.org/vocab/prop/iteratedPreprogrammedCooperationRate],,
2487,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.739089,127.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.318960,1.159220,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],,
2488,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.719742,126.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.297905,1.141580,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],,
2489,https://data.cooperationdatabank.org/id/ENG02668_1.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.019347,107.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.438130,0.399437,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],,


In [27]:
def get_category_query(treatment, independent):
    query = """
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX class: <https://data.cooperationdatabank.org/vocab/class/>

            SELECT ?o WHERE {
             <""" + treatment + """> <""" + independent + """> ?o .                   
            } 
            """
    
    cat_df = get_sparql_dataframe_small(url, query)
    cat = cat_df['o'][0]
    return cat

#  Get the IV values 
def get_categories(obs): 
    loop = tqdm(total = len(obs), position=0, leave=False)
    
    for i, row in obs.iterrows(): 
        
        observation = row['obs']
        t1 = row['t1']
        t2 = row['t2']
#         independent = ast.literal_eval(row['independentProperties'])[0]
        independent = row['independentProperties'][0]
        try: 
            catT1 = get_category_query(t1, independent)
            catT2 = get_category_query(t2, independent)

            obs.at[i, 'categoryT1'] = catT1
            obs.at[i, 'categoryT2'] = catT2
            
        except: 
            continue
        #  Show percentage bar
        loop.set_description("Loading...".format(i))
        loop.update(1)

    loop.close()
    return obs 
obs = get_categories(obs)

                                                                                                                       

In [28]:
obs

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi,independentProperties,categoryT1,categoryT2
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.243930,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,3.700,0.676655,1.811200,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.084430,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,4.080,0.528299,1.640570,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.087300,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29.0,29.0,3.700,4.080,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/positionInGame],1,2
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,96.0,3.288,4.281,-0.608830,-0.039335,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/expectationsLevel],https://data.cooperationdatabank.org/id/expectationslevel/low,https://data.cooperationdatabank.org/id/expectationslevel/high
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,48.0,3.910,3.520,-0.032700,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2486,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.100519,48.0,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2,https://data.cooperationdatabank.org/id/ENG02667_1b.2.3,https://data.cooperationdatabank.org/id/ENG02667_1b,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.552567,0.753605,https://data.cooperationdatabank.org/id/ENG02667,http://dx.doi.org/10.2466/pr0.1975.36.2.371,[https://data.cooperationdatabank.org/vocab/prop/iteratedPreprogrammedCooperationRate],0.5,1.0
2487,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.739089,127.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.318960,1.159220,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],https://data.cooperationdatabank.org/id/svotype/prosocial,https://data.cooperationdatabank.org/id/svotype/individualist
2488,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.719742,126.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.297905,1.141580,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],https://data.cooperationdatabank.org/id/svotype/prosocial,https://data.cooperationdatabank.org/id/svotype/competitor
2489,https://data.cooperationdatabank.org/id/ENG02668_1.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.019347,107.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.438130,0.399437,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],https://data.cooperationdatabank.org/id/svotype/individualist,https://data.cooperationdatabank.org/id/svotype/competitor


In [69]:
# obs[["IV_new"]] = obs[["IV_new"]].astype('object')

In [30]:
#  take out observation that don't have a IV value) 

obs = obs.dropna(subset=['categoryT1', 'categoryT2']).reset_index(drop=True)
obs

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi,independentProperties,categoryT1,categoryT2
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.243930,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,3.700,0.676655,1.811200,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.084430,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,4.080,0.528299,1.640570,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.087300,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29.0,29.0,3.700,4.080,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/positionInGame],1,2
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,96.0,3.288,4.281,-0.608830,-0.039335,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/expectationsLevel],https://data.cooperationdatabank.org/id/expectationslevel/low,https://data.cooperationdatabank.org/id/expectationslevel/high
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,48.0,3.910,3.520,-0.032700,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,[https://data.cooperationdatabank.org/vocab/prop/sequentiality],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.100519,48.0,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2,https://data.cooperationdatabank.org/id/ENG02667_1b.2.3,https://data.cooperationdatabank.org/id/ENG02667_1b,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.552567,0.753605,https://data.cooperationdatabank.org/id/ENG02667,http://dx.doi.org/10.2466/pr0.1975.36.2.371,[https://data.cooperationdatabank.org/vocab/prop/iteratedPreprogrammedCooperationRate],0.5,1.0
2476,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.739089,127.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.318960,1.159220,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],https://data.cooperationdatabank.org/id/svotype/prosocial,https://data.cooperationdatabank.org/id/svotype/individualist
2477,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.719742,126.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.297905,1.141580,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],https://data.cooperationdatabank.org/id/svotype/prosocial,https://data.cooperationdatabank.org/id/svotype/competitor
2478,https://data.cooperationdatabank.org/id/ENG02668_1.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.019347,107.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.438130,0.399437,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,[https://data.cooperationdatabank.org/vocab/prop/sVOType],https://data.cooperationdatabank.org/id/svotype/individualist,https://data.cooperationdatabank.org/id/svotype/competitor


In [36]:
type(obs['independentProperties'].unique())

TypeError: unhashable type: 'list'

In [38]:
obs.to_csv('data/observations_3.csv')
obs = pd.read_csv('data/observations_3.csv', index_col = 0)

In [47]:
#  bin continous IV values that create to much hypothesis numbers  

for independent in obs['independentProperties'].unique():
    independent_name = ast.literal_eval(independent)[0]
    catsT1 = obs.loc[obs['independentProperties'] == independent]['categoryT1']
    catsT2 = obs.loc[obs['independentProperties'] == independent]['categoryT2']
    ind_df = obs.loc[obs['independentProperties'] == independent]
                  
    values = np.concatenate((catsT1, catsT2), axis=None)
    cats_unique_len = len(np.unique(values))
    if cats_unique_len > 6:
        try:
            catsT1.astype('float')
            catsfloats = values.astype('float')
        except:
            continue
           
        intervals = pd.qcut(catsfloats, 4, duplicates='drop').categories
        print(intervals)
        for cat in catsT1.unique(): 
            index = np.where(intervals.contains(float(cat)) == True)[0][0]
            obs_indexes = ind_df.loc[ind_df['categoryT1'] == cat].index
            intervals[index]
            obs.at[obs_indexes,'categoryT1'] = intervals[index]
            
        for cat in catsT2.unique(): 
            index = np.where(intervals.contains(float(cat)) == True)[0][0]
            obs_indexes = ind_df.loc[ind_df['categoryT2'] == cat].index
            intervals[index]
            obs.at[obs_indexes,'categoryT2'] = intervals[index]   

## 2.2 Make IV with Hypothesis Number 

In [48]:
def make_final_IV(obs):    
    lists = []
    sets = []
    loop = tqdm(total = len(obs), position=0, leave=False)
    IVs_and_cats = pd.DataFrame(columns = ['IV', 'catT1', 'catT2', 'number'])
    
    for i, row in obs.iterrows(): 
        IV = ast.literal_eval(row['independentProperties'])[0]
        catT1 = row['categoryT1']
        catT2 = row['categoryT2']
        ES = row['ES']
        ESUp = row['ESUpper']
        ESLo = row['ESLower']
        check_list = [IV, catT1, catT2]
        check_set = set([IV, catT1, catT2])
        
        if check_set not in sets and check_list not in lists:
            lists.append(check_list)
            sets.append(check_set)

            try:
                num = IVs_and_cats['IV'].value_counts()[IV]+1
            except:
                num = 1
            
            df_row = pd.DataFrame([[IV, catT1, catT2, num]], columns = ['IV', 'catT1', 'catT2', 'number'])
            IVs_and_cats = IVs_and_cats.append(df_row, ignore_index=True)
            
            obs.at[i, 'IV_new'] = IV + str('_H') +str(num)  
        
        elif check_set in sets and check_list in lists: 
            
            index = IVs_and_cats.loc[(IVs_and_cats['IV'] == IV) & 
                                     (IVs_and_cats['catT1'] == catT1) &
                                     (IVs_and_cats['catT2'] == catT2)]['number']
            obs.at[i, 'IV_new'] = IV + str('_H') + str(index.values[0])  
            
        elif check_set in sets and check_list not in lists: 
            obs.at[i, 'categoryT1'] = catT2
            obs.at[i, 'categoryT2'] = catT1
            obs.at[i, 'ES'] = (ES * -1)
            obs.at[i, 'ESUpper'] = (ESUp * -1)
            obs.at[i, 'ESLower'] = (ESLo * -1)
            
            index = IVs_and_cats.loc[(IVs_and_cats['IV'] == IV) & 
                                     (IVs_and_cats['catT1'] == catT2) &
                                     (IVs_and_cats['catT2'] == catT1)]['number']
            obs.at[i, 'IV_new'] = IV + str('_H') + str(index.values[0])             
            
        #  Show percentage bar
        loop.set_description("Loading...".format(i))
        loop.update(1)

    loop.close()
    display(IVs_and_cats)
    return obs
obs = make_final_IV(obs)

                                                                                                                       

Unnamed: 0,IV,catT1,catT2,number
0,https://data.cooperationdatabank.org/vocab/prop/sequentiality,https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking,1
1,https://data.cooperationdatabank.org/vocab/prop/positionInGame,1,2,1
2,https://data.cooperationdatabank.org/vocab/prop/expectationsLevel,https://data.cooperationdatabank.org/id/expectationslevel/low,https://data.cooperationdatabank.org/id/expectationslevel/high,1
3,https://data.cooperationdatabank.org/vocab/prop/religiousLevel,https://data.cooperationdatabank.org/id/religiouslevel/high,https://data.cooperationdatabank.org/id/religiouslevel/low,1
4,https://data.cooperationdatabank.org/vocab/prop/blockOfCooperation,1,2,1
...,...,...,...,...
377,https://data.cooperationdatabank.org/vocab/prop/emotionValence,https://data.cooperationdatabank.org/id/emotionvalence/positive,https://data.cooperationdatabank.org/id/emotionvalence/negative,2
378,https://data.cooperationdatabank.org/vocab/prop/emotionValence,https://data.cooperationdatabank.org/id/emotionvalence/neutral,https://data.cooperationdatabank.org/id/emotionvalence/negative,3
379,https://data.cooperationdatabank.org/vocab/prop/nationality,GBR,DEU,24
380,https://data.cooperationdatabank.org/vocab/prop/conflictIndexLevel,https://data.cooperationdatabank.org/id/conflictindexlevel/low,https://data.cooperationdatabank.org/id/conflictindexlevel/medium,2


In [49]:
obs

Unnamed: 0,obs,dependent,ES,N,t1,t2,study,design,n1,n2,sd1,sd2,ESLower,ESUpper,paper,doi,independentProperties,categoryT1,categoryT2,IV_new
0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.243930,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,3.700,0.676655,1.811200,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,['https://data.cooperationdatabank.org/vocab/prop/sequentiality'],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking,https://data.cooperationdatabank.org/vocab/prop/sequentiality_H1
1,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,1.084430,57.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,28.0,29.0,3.360,4.080,0.528299,1.640570,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,['https://data.cooperationdatabank.org/vocab/prop/sequentiality'],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking,https://data.cooperationdatabank.org/vocab/prop/sequentiality_H1
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.087300,58.0,https://data.cooperationdatabank.org/id/ENG00008_1.4.2,https://data.cooperationdatabank.org/id/ENG00008_1.4.3,https://data.cooperationdatabank.org/id/ENG00008_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,29.0,29.0,3.700,4.080,-0.602256,0.427657,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,['https://data.cooperationdatabank.org/vocab/prop/positionInGame'],1,2,https://data.cooperationdatabank.org/vocab/prop/positionInGame_H1
3,https://data.cooperationdatabank.org/id/ENG00008_2.1.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,-0.324082,192.0,https://data.cooperationdatabank.org/id/ENG00008_2.1.1,https://data.cooperationdatabank.org/id/ENG00008_2.1.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,96.0,3.288,4.281,-0.608830,-0.039335,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,['https://data.cooperationdatabank.org/vocab/prop/expectationsLevel'],https://data.cooperationdatabank.org/id/expectationslevel/low,https://data.cooperationdatabank.org/id/expectationslevel/high,https://data.cooperationdatabank.org/vocab/prop/expectationsLevel_H1
4,https://data.cooperationdatabank.org/id/ENG00008_2.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/contributions,0.315689,144.0,https://data.cooperationdatabank.org/id/ENG00008_2.2.1,https://data.cooperationdatabank.org/id/ENG00008_2.2.2,https://data.cooperationdatabank.org/id/ENG00008_2,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,96.0,48.0,3.910,3.520,-0.032700,0.664078,https://data.cooperationdatabank.org/id/ENG00008,http://dx.doi.org/10.1016/j.jesp.2004.09.004,['https://data.cooperationdatabank.org/vocab/prop/sequentiality'],https://data.cooperationdatabank.org/id/sequentiality/simultaneous,https://data.cooperationdatabank.org/id/sequentiality/sequential_turn-taking,https://data.cooperationdatabank.org/vocab/prop/sequentiality_H1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.100519,48.0,https://data.cooperationdatabank.org/id/ENG02667_1b.2.2,https://data.cooperationdatabank.org/id/ENG02667_1b.2.3,https://data.cooperationdatabank.org/id/ENG02667_1b,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.552567,-0.753605,https://data.cooperationdatabank.org/id/ENG02667,http://dx.doi.org/10.2466/pr0.1975.36.2.371,['https://data.cooperationdatabank.org/vocab/prop/iteratedPreprogrammedCooperationRate'],"(0.962, 2.46]","(0.1, 0.5]",https://data.cooperationdatabank.org/vocab/prop/iteratedPreprogrammedCooperationRate_H3
2476,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.2.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.739089,127.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.318960,-1.159220,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,['https://data.cooperationdatabank.org/vocab/prop/sVOType'],https://data.cooperationdatabank.org/id/svotype/individualist,https://data.cooperationdatabank.org/id/svotype/prosocial,https://data.cooperationdatabank.org/vocab/prop/sVOType_H1
2477,https://data.cooperationdatabank.org/id/ENG02668_1.2.1.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,0.719742,126.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.1,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,0.297905,1.141580,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,['https://data.cooperationdatabank.org/vocab/prop/sVOType'],https://data.cooperationdatabank.org/id/svotype/prosocial,https://data.cooperationdatabank.org/id/svotype/competitor,https://data.cooperationdatabank.org/vocab/prop/sVOType_H10
2478,https://data.cooperationdatabank.org/id/ENG02668_1.2.2.3.d,https://data.cooperationdatabank.org/id/dependentvariable/cooperation,-0.019347,107.0,https://data.cooperationdatabank.org/id/ENG02668_1.2.2,https://data.cooperationdatabank.org/id/ENG02668_1.2.3,https://data.cooperationdatabank.org/id/ENG02668_1,https://data.cooperationdatabank.org/id/betweenorwithinparticipantsdesign/1,,,,,-0.438130,0.399437,https://data.cooperationdatabank.org/id/ENG02668,http://dx.doi.org/NA,['https://data.cooperationdatabank.org/vocab/prop/sVOType'],https://data.cooperationdatabank.org/id/svotype/individualist,https://data.cooperationdatabank.org/id/svotype/competitor,https://data.cooperationdatabank.org/vocab/prop/sVOType_H4


## 2.3 Binning Effect Sizes

In [50]:
#  Binning effect sizes 

bins = [-np.inf, -0.5, -0.2, 0.2, 0.5, np.inf]
labels = ['LargeMediumNegativeES',
'SmallNegativeES',
'NullFinding',  
'SmallPositiveES',
'LargeMediumPositiveES'] 
categories = pd.cut(obs['ES'], bins, labels = labels)
obs['ESType'] = categories

In [51]:
# retrieval = pd.read_csv('data/observations_with_intervals_final.csv', index_col=0)
# retrieval

In [None]:
# indexes = retrieval['ESLower'].index[retrieval['ESLower'].isnull()]
# for index in indexes: 
#     print(retrieval.at[index, 'SE'])

# 3. From data table to triple

In [59]:
from validator_collection import validators, checkers

def make_effect_query(obs, IV, IV_H, catT1, catT2, es_type, effect = 'noEffect'): 
    if checkers.is_url(catT1) and checkers.is_url(catT2) == True: 
        value_lineT1 = '?t1 <' + IV + '> <' + catT1 + '> . '
        value_lineT2 = '?t2 <' + IV + '> <' + catT2 + '> . '
    else:         
        value_lineT1 = '?t1 <' + IV + '> "' + str(catT1) + '" .'
        value_lineT2 = '?t2 <' + IV + '> "' + str(catT2) + '" .'

    cp_effect = '<' + IV_H + '>' + ' cp:hasNoEffectOn ?dependentVariable .'
    
    if effect == 'positive': 
        cp_effect = '<' + IV_H + '>' + ' cp:hasPositiveEffectOn ?dependentVariable .'

    elif effect == 'negative': 
        cp_effect = '<' + IV_H + '>' + ' cp:hasNegativeEffectOn ?dependentVariable .'
        
        
    query = """ 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX cp: <https://data.cooperationdatabank.org/vocab/prop/>
PREFIX class: <https://data.cooperationdatabank.org/vocab/class/>

CONSTRUCT {
  ?study cp:reportsEffect """ + '<' + obs + '>' + """ . 
  """ + '<' + obs + '>' + """ cp:dependentVariable ?dependentVariable .
  """ + '<' + obs + '>' + """ cp:ESType  class:""" + es_type + """ . 
  """ + '<' + obs + '>' + """ cp:treatment ?t1, ?t2 .
  """ + '<' + IV + '>' + """ rdfs:subPropertyOf ?superProperty . 
  
  """ + value_lineT1 + value_lineT2 + cp_effect + '<' + IV + '>' + """ rdfs:range ?class . 
  ?class rdfs:subClassOf ?superClass . 
  ?superClass rdfs:subClassOf class:IndependentVariable . 
   """ + '<' + IV_H + '>' + """ rdfs:subPropertyOf  """ + '<' + IV + '>' + """ . 
} 

WHERE {
  ?study cp:reportsEffect """ + '<' + obs + '>' + """ . 
  """ + '<' + obs + '>' + """ rdf:type class:Observation .
  """ + '<' + obs + '>' + """ cp:eSmeasure <https://data.cooperationdatabank.org/id/esmeasure/d> . 
  """ + '<' + obs + '>' + """ cp:dependentVariable ?dependentVariable .
  """ + '<' + obs + '>' + """ cp:treatment ?t1, ?t2 . 
""" + '<' + IV + '>' + """ rdfs:subPropertyOf ?superProperty . 
  
  OPTIONAL { 
  """ + value_lineT1 + value_lineT2 + """
  }
  OPTIONAL {
    """ + '<' + IV + '>' + """ rdfs:range ?class . 
    ?class rdfs:subClassOf ?superClass . 
    ?superClass rdfs:subClassOf class:IndependentVariable . 
  }
FILTER (?t1 < ?t2)
} """
    return query

In [53]:
def zero_is_between(start, stop): 
    if start <= 0 <= stop: 
        return True
    else: 
        return False 
    
    
#  if 0 is between lower and upper bounds then the effect is not significant 
def type_of_effect(effect_size, lower, upper):
    effect = ''
    if math.isnan(lower) or math.isnan(upper):
        if effect_size > -0.2 and effect_size < 0.2:
            effect = 'noEffect'
            
        elif effect_size > 0.2: 
            effect = 'positive'
            
        elif effect_size < -0.2: 
            effect = 'negative'
    else: 
        # not significant effect sizes 
        if zero_is_between(lower, upper): 
            effect = 'noEffect'

        # significant effect sizes 
        elif not zero_is_between(lower, upper):
            
            #  positive effect sizes 
            if float(effect_size) > 0:
                effect = 'positive'

            #  negative effect sizes 
            elif float(effect_size) < 0: 
                effect = 'negative'
    
    return effect

In [60]:
def table_to_triples(table):
    #  Initiate empty triples_df 
    tripels_df = pd.DataFrame(columns=['s', 'p', 'o'])
    
    loop = tqdm(total = len(table), position=0, leave=False)

    for index, row in table.iterrows():        
        # Get stuff from df row
        observation = row['obs']
        lower = float(row['ESLower'])
        upper = float(row['ESUpper'])
        es_type = row['ESType']
        IV = ast.literal_eval(row['independentProperties'])[0]
        IV_H = row['IV_new']
        cat1 = row['categoryT1']
        cat2 = row['categoryT2']
        effect_size = row['ES']
#         standard_error = row['SE']
        
        #  if lower or upper are not in the data, calculate them 
        if math.isnan(lower) or math.isnan(upper): 
            try: 
                lower = effect_size - (1.96 * standard_error)
                upper = effect_size + (1.96 * standard_error)
    
                #  Retrieve type of effect (negative, noEffect, positive)
                effect = type_of_effect(effect_size, lower, upper)
            except: 
                effect = type_of_effect(effect_size, lower, upper)
        else: 
            effect = type_of_effect(effect_size, lower, upper)
        
        
        #  make query 
        query = make_effect_query(observation, IV, IV_H, cat1, cat2, es_type, effect = effect) 
        
        #  get df with tripels 
        new_triples = get_sparql_dataframe_construct(url, query)

        #  append triples to the full df
        tripels_df = tripels_df.append(new_triples)
                
        #  Show percentage bar to see how far allong we are 
        loop.set_description("Loading...".format(index))
        loop.update(1)
    
    loop.close()
    return tripels_df.reset_index(drop=True)

triples = table_to_triples(obs)

                                                                                                                       

In [68]:
triples

Unnamed: 0,s,p,o
0,https://data.cooperationdatabank.org/vocab/class/SequentialityVariable,http://www.w3.org/2000/01/rdf-schema#subClassOf,https://data.cooperationdatabank.org/vocab/class/IndependentVariable
1,https://data.cooperationdatabank.org/vocab/prop/sequentiality_H1,https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn,https://data.cooperationdatabank.org/id/dependentvariable/contributions
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/vocab/prop/sequentiality,https://data.cooperationdatabank.org/id/sequentiality/simultaneous
3,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/vocab/prop/ESType,https://data.cooperationdatabank.org/vocab/class/LargeMediumPositiveES
4,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/vocab/prop/dependentVariable,https://data.cooperationdatabank.org/id/dependentvariable/contributions
...,...,...,...
29754,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1.2.d,https://data.cooperationdatabank.org/vocab/prop/ESType,https://data.cooperationdatabank.org/vocab/class/NullFinding
29755,https://data.cooperationdatabank.org/vocab/class/PersonalityVariable,http://www.w3.org/2000/01/rdf-schema#subClassOf,https://data.cooperationdatabank.org/vocab/class/IndependentVariable
29756,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1,https://data.cooperationdatabank.org/vocab/prop/sVOType,https://data.cooperationdatabank.org/id/svotype/individualist
29757,https://data.cooperationdatabank.org/id/ENG02672_1a,https://data.cooperationdatabank.org/vocab/prop/reportsEffect,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1.2.d


In [71]:
triples.to_csv('data/triples.csv')
triples = pd.read_csv('data/triples.csv', index_col=0)

In [72]:
triples

Unnamed: 0,s,p,o
0,https://data.cooperationdatabank.org/vocab/class/SequentialityVariable,http://www.w3.org/2000/01/rdf-schema#subClassOf,https://data.cooperationdatabank.org/vocab/class/IndependentVariable
1,https://data.cooperationdatabank.org/vocab/prop/sequentiality_H1,https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn,https://data.cooperationdatabank.org/id/dependentvariable/contributions
2,https://data.cooperationdatabank.org/id/ENG00008_1.4.1,https://data.cooperationdatabank.org/vocab/prop/sequentiality,https://data.cooperationdatabank.org/id/sequentiality/simultaneous
3,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/vocab/prop/ESType,https://data.cooperationdatabank.org/vocab/class/LargeMediumPositiveES
4,https://data.cooperationdatabank.org/id/ENG00008_1.4.1.2.d,https://data.cooperationdatabank.org/vocab/prop/dependentVariable,https://data.cooperationdatabank.org/id/dependentvariable/contributions
...,...,...,...
29754,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1.2.d,https://data.cooperationdatabank.org/vocab/prop/ESType,https://data.cooperationdatabank.org/vocab/class/NullFinding
29755,https://data.cooperationdatabank.org/vocab/class/PersonalityVariable,http://www.w3.org/2000/01/rdf-schema#subClassOf,https://data.cooperationdatabank.org/vocab/class/IndependentVariable
29756,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1,https://data.cooperationdatabank.org/vocab/prop/sVOType,https://data.cooperationdatabank.org/id/svotype/individualist
29757,https://data.cooperationdatabank.org/id/ENG02672_1a,https://data.cooperationdatabank.org/vocab/prop/reportsEffect,https://data.cooperationdatabank.org/id/ENG02672_1a.1.1.2.d


In [73]:
def convert_to_array(df):
    listje = []
    loop = tqdm(total = len(df), position=0, leave=False)
    for index, row in df.iterrows():
        listje.append([row['s'], row['p'], row['o']])  
            
        #  Show percentage bar to see how far allong we are 
        loop.set_description("Loading...".format(index))
        loop.update(1)

    array = np.asarray(listje)
    return array

In [75]:
triples_arr = convert_to_array(triples)

Loading...:  23%|██████████████▋                                                 | 6826/29759 [00:07<00:24, 942.26it/s]

KeyboardInterrupt: 

In [76]:
triples_arr1 = triples.to_numpy()


In [None]:
print(len(triples_arr))
triples_arr


In [None]:
full_CoDa_data_df = triples

# Create seen, unseen triples

In [None]:
#  Filter out out the effect triples 

df_hasPositiveEffectOn_only = full_CoDa_data_df[(full_CoDa_data_df.p == 'https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn')]
df_hasNoEffectOn_only = full_CoDa_data_df[(full_CoDa_data_df.p == 'https://data.cooperationdatabank.org/vocab/prop/hasNoEffectOn')]
df_hasNegativeEffectOn_only = full_CoDa_data_df[(full_CoDa_data_df.p == 'https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn')]

In [None]:
print(len(df_hasPositiveEffectOn_only))
print(len(df_hasNoEffectOn_only))
print(len(df_hasNegativeEffectOn_only))

In [None]:
#  Put all effect triples together 

effect_triples_df = pd.concat([df_hasPositiveEffectOn_only, df_hasNegativeEffectOn_only])
print(len(effect_triples_df))
effect_triples_df

In [None]:
#  Count and check unique entities and relationships 

s_unique = list(effect_triples_df["s"].unique())
p_unique = list(effect_triples_df["p"].unique())
o_unique = list(effect_triples_df["o"].unique())

print(len(s_unique), s_unique)
print(len(p_unique), p_unique)
print(len(o_unique), o_unique)

In [None]:
#  Create all possible effect triples using the seen effect triples 

df_unseen = []

def make_doubles(s_all,p_all,o_all):  
    doubles = []
    for s in s_all: 
        triple = []
        iterations_p = len(p_all)   
   
        while iterations_p: 
            triple = []
            iterations_p-=1
            triple.append(s)
            triple.append(p_all[iterations_p])
            doubles.append(triple)
    return doubles 


def make_triples(s_p, o_all): 
    list_with_triples = []
    iterations_o = len(o_all)
    for double in s_p: 
        triple = [double]
        iterations_o = len(o_all)
        while iterations_o:
            iterations_o-=1
            triple_finish = double + [o_all[iterations_o]]
            list_with_triples.append(triple_finish)
    return list_with_triples


doubles = make_doubles(s_unique, p_unique, o_unique)

yay_triples = make_triples(doubles, o_unique)
print(len(yay_triples))

In [None]:
#  See all the possible triples in a df for inspection  

all_triples = pd.DataFrame(yay_triples, columns=list('spo'))
all_triples

In [None]:
#  drop duplicates and turn into array for seen triples input in ComplEx

seen_triples = effect_triples_df.drop_duplicates()
len(seen_triples)

In [None]:
def triple_filter(df1, df2): 
    unseen_triples = df1
    for index, row in df2.iterrows():
        match = df1[(df1==row).all(axis=1)]
        index = match.index.astype(int)
        unseen_triples = unseen_triples.drop(match.index, axis=0)

    return unseen_triples  

#  Filter out all unseen triples using the triple filter 
unseen_triples = triple_filter(all_triples, seen_triples)
len(unseen_triples)

In [None]:
unseen_triples_arr = unseen_triples

In [None]:
%store triples_arr
%store unseen_triples_arr

In [None]:
np.save('triples2244.npy', triples_arr)