In [47]:
import pandas as pd
import seaborn as sns
import re
from glob import glob
from collections import Counter

In [38]:
out_files = glob('../data/out/**/*.gz')

In [40]:
triples = []
for out_file in out_files:
    _, _, _, year, source_file_name = out_file.split('/')
    trip_type = source_file_name.replace('dpef.html-', '').split('.nq')[0]
    with open(out_file, 'r') as f:
        for l in f:
            trip = l[:-1].split(',')
            trip.append(year)
            trip.append(source_file_name)
            trip.append(trip_type)
            triples.append(trip)

In [41]:
total_triples = len(triples)

### Extraction Error Analysis

In [42]:
# % of triples with ,s
clean_triples = [t for t in triples if len(t) == 7]
clean_triples_count = len(clean_triples)
n, total_triples, (total_triples - clean_triples_count) / total_triples * 100

(15474, 1083689, 1.63783151808314)

`1.63 %` of triples have an avoidable error

### Clean Dataset

In [43]:
df = pd.DataFrame(clean_triples, columns=['s', 'p', 'o', 'url', 'year', 'source', 'tipe'])

In [44]:
df['simple_p'] = df.p.str.extract('.*/([a-zA-Z]+)>')[0]

In [45]:
df['domain'] = df.url.str.extract('<http.*://(.*?)/.*>')[0]

### Counts by year

#### Totals

In [46]:
df.groupby('year').s.count().rename('triples_per_year')

year
2019    734797
2020     62743
2021    268400
Name: triples_per_year, dtype: int64

#### Broken down by predicate

In [58]:
total_p_counts = df.simple_p.value_counts().rename('total')
def ordered_p_counts(df):
    counts = df.simple_p.value_counts()
    ordered_counts = [counts.get(p) for p in total_p_counts.index]
    return pd.Series(ordered_counts, index=total_p_counts.index)

pd.concat([df.groupby('year').apply(ordered_p_counts), total_p_counts.to_frame().transpose()], axis=0).transpose()

Unnamed: 0,2019,2020,2021,total
timeRequired,162328.0,20189.0,82512.0,265029.0
typicalAgeRange,121006.0,15433.0,50650.0,187089.0
learningResourceType,112100.0,6119.0,21451.0,139670.0
educationalUse,102015.0,4909.0,12793.0,119717.0
isBasedOnUrl,74643.0,3168.0,12154.0,89965.0
educationalRole,72428.0,3681.0,8854.0,84963.0
interactivityType,30633.0,4184.0,18287.0,53104.0
alignmentType,18022.0,1273.0,10388.0,29683.0
targetName,14269.0,1017.0,10246.0,25532.0
educationalAlignment,13416.0,1611.0,10435.0,25462.0


#### Broken down by provider

In [59]:
total_domain_counts = df.domain.value_counts().rename('total')
def ordered_domain_counts(df):
    counts = df.domain.value_counts()
    ordered_counts = [counts.get(d) for d in total_domain_counts.index]
    return pd.Series(ordered_counts, index=total_domain_counts.index)

pd.concat([df.groupby('year').apply(ordered_domain_counts), 
           total_domain_counts.to_frame().transpose()
          ], axis=0).transpose().head(15)

Unnamed: 0,2019,2020,2021,total
www.litcharts.com,101990.0,3360.0,4112.0,109462.0
www.math-drills.com,56784.0,1360.0,2896.0,61040.0
www.mathslibres.com,28288.0,1008.0,2768.0,32064.0
www.tabletennis365.com,21638.0,596.0,2284.0,24518.0
www.mateslibres.com,19136.0,1232.0,3936.0,24304.0
tabletennis365.com,18464.0,,496.0,18960.0
www.commonsense.org,16112.0,166.0,1693.0,17971.0
www.pbslearningmedia.org,,,17154.0,17154.0
www.ck12.org,16154.0,152.0,760.0,17066.0
www.getabstract.com,14064.0,368.0,426.0,14858.0


In [41]:
top

simple_p,isBasedOnUrl,timeRequired,typicalAgeRange,learningResourceType,alignmentType,educationalAlignment,educationalRole,targetName,interactivityType,targetURL,educationalUse,educationalFramework,useRightsURL,educationalLevel,targetDescription,aggregateRating,sum
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
www.litcharts.com,,,,6.0,,,115864.0,,,,,,,,,,115870.0
www.tabletennis365.com,43873.0,,,,,,,,,,,,,,,,43873.0
www.math-drills.com,,,,,,,23034.0,,,,15356.0,,,,,,38390.0
www.ck12.org,,,11202.0,6290.0,,,,,10698.0,,,,8630.0,,,,36820.0
tabletennis365.com,34606.0,,,,,,,,,,,,,,,,34606.0
www.getabstract.com,,16063.0,,16063.0,,,,,,,,,,,,,32126.0
www.commonsense.org,,52.0,,72.0,7740.0,7740.0,7518.0,6224.0,,,1455.0,989.0,,,,,31790.0
www.sondakika.com,,12677.0,12677.0,,,,,,,,,,,,,,25354.0
www.domkino.tv,,,22330.0,,,,,,,,,,,,,,22330.0
www.haberler.com,,10287.0,10287.0,,,,,,,,,,,,,,20574.0
