## This notebook contains the code to extract media frames from the KYM raw data

In [9]:
import json, csv
from csv import writer, reader

In [10]:
def save2json(filename, dump):
    """Serialise ``dump`` as JSON into ``filename``.

    The file is opened in write mode, so existing content is replaced.
    The output is pretty-printed with an indent of 6 spaces.

    Args:
        filename: Path of the file to write.
        dump: Any object ``json.dump`` can serialise.
    """
    # The context manager closes the handle even when serialisation fails,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, "w") as out_file:
        json.dump(dump, out_file, indent=6)

In [11]:
def save2csv(filename, header, data):
    """Write ``header`` and then every row of ``data`` to a CSV file.

    Fields are comma-separated. A field is quoted only when it needs to be
    (``QUOTE_MINIMAL``), and the quote character is ``|``.

    Args:
        filename: Path of the CSV file to (over)write.
        header: Sequence of column names, written as the first row.
        data: Iterable of rows, each a sequence of field values.
    """
    with open(filename, 'w', newline='') as handle:
        out = csv.writer(
            handle,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
        )
        out.writerow(header)
        out.writerows(data)

In [39]:
# Load the raw KYM dump. The context manager closes the file handle as soon
# as parsing is done instead of leaving it open for the life of the kernel.
with open('data/raw/kym.raw.json') as raw_file:
    raw_data = json.load(raw_file)

## Clean KYM Extracted Data

The code below cleans the KYM raw data, keeping only those entries that are categorised as Memes (see Infobox below). It also removes the structured data attached to each page (the `meta` and `ld` fields) and stores it separately.

![infobox](images/kym-infobox.png)

In [13]:
raw_media_frames = []  # entries kept as media frames, with 'meta'/'ld' removed
metas = []             # 'meta' values popped from the kept entries
lds = []               # 'ld' values popped from the kept entries
# Note: metas/lds only receive a value when the key is present, so their
# indices do not necessarily line up with raw_media_frames.

for m in raw_data:
    # Drop every entry whose URL contains one of these fragments.
    # NOTE(review): this is a plain substring test on the whole URL, so an
    # entry whose slug merely contains e.g. "type" or "event" is dropped as
    # well -- confirm that this is intended.
    if any(
        fragment in m['url']
        for fragment in ("sites", "culture", "subculture", "event", "people", "type")
    ):
        continue

    # Give every known section a single 'fulltext' string joined from its
    # 'text' fragments. Each section is guarded by its own 'text' key; the
    # previous code guarded 'subsection' with content['usage'], which raised
    # KeyError whenever 'usage' was missing.
    if 'content' in m:
        content = m['content']
        for section in ('about', 'origin', 'spread', 'subsection'):
            if section in content and 'text' in content[section]:
                content[section]['fulltext'] = "".join(content[section]['text'])

    # Move the structured blobs out of the entry (this mutates raw_data's
    # items in place) and collect them separately.
    if 'meta' in m:
        metas.append(m.pop('meta', None))
    if 'ld' in m:
        lds.append(m.pop('ld', None))
    raw_media_frames.append(m)

In [14]:
# Persist the 'meta' values collected from the kept entries.
save2json(
    filename='data/kym.media.frames.meta.json',
    dump=metas,
)

In [15]:
# Persist the 'ld' values collected from the kept entries.
# NOTE(review): the file name says "ls" while the data is `lds`/'ld' --
# possibly a typo; confirm before renaming, other code may read this path.
save2json(
    filename='data/kym.media.frames.ls.json',
    dump=lds,
)

In [16]:
# Persist the kept entries themselves.
save2json(
    filename='data/kym.media.frames.json',
    dump=raw_media_frames,
)

In [41]:
# Two-column index of the kept entries: the title, and the URL (written
# under the 'meme' header).
save2csv(
    filename='data/kym.media.frames.csv',
    header=['title', 'meme'],
    data=[[frame['title'], frame['url']] for frame in raw_media_frames],
)

## Extract those Media Frames that have a corresponding seed in KYM

In [17]:
def get_media_frame_from_seeds(seed,memefile,outfile):
    """Select the entries of ``memefile`` whose URL is listed in ``seed``.

    Args:
        seed: Path of a CSV file; the second column of each row is compared
            with the ``'url'`` of every entry.
        memefile: Path of a JSON file holding a list of entries, each with a
            ``'url'`` key.
        outfile: Path of the JSON file the matching entries are written to.

    The output keeps the order of the seed rows; for a given row, matching
    entries keep the order they have in ``memefile``. A URL listed several
    times in ``seed`` yields its entries several times.
    """
    with open(memefile) as memes:
        data = json.load(memes)

    # Index the entries by URL once, so each seed row is a dictionary lookup
    # instead of a scan over every entry.
    by_url = {}
    for entry in data:
        by_url.setdefault(entry['url'], []).append(entry)

    dump = []
    # newline='' is what the csv module asks for when reading files.
    with open(seed, newline='') as csvfile:
        for row in reader(csvfile):
            # csv.reader yields [] for blank lines; rows with fewer than two
            # columns have no URL to match and are skipped rather than
            # raising IndexError.
            if len(row) < 2:
                continue
            dump.extend(by_url.get(row[1], []))
    save2json(outfile,dump)

In [18]:
# Keep only the media frames whose URL appears in the seed mapping file.
get_media_frame_from_seeds(
    seed='data/seeds.to.kym.media.frames.csv',
    memefile='data/kym.media.frames.json',
    outfile='data/kym.seed.media.frames.json',
)

## Extract the related media frames
- siblings (rdfs:seeAlso)
- parent (skos:broader)
- children (skos:narrower)



In [29]:
def extract_X(infile, outfile,x):
    """Write one ``(url, value)`` CSV row per value stored under key ``x``.

    Args:
        infile: Iterable of entry dicts (despite the name, not a file).
            Entries without key ``x`` are skipped.
        outfile: Path of the CSV file to write; its header is ``['meme', x]``.
        x: Key to extract. A list value produces one row per item; any other
            value produces a single row.
    """
    xs = []
    for m in infile:
        if x not in m:
            continue
        value = m[x]
        # Grow the list in place: rebuilding it with `xs = xs + [...]` copies
        # every row collected so far on each step.
        if isinstance(value, list):
            xs.extend([m['url'], s] for s in value)
        else:
            xs.append([m['url'], value])
    save2csv(outfile,['meme',x],xs)

### For all Media Frames

In [28]:
# 'siblings' relation for every kept media frame.
extract_X(
    infile=raw_media_frames,
    outfile='data/kym.siblings.media.frames.csv',
    x='siblings',
)

In [26]:
# 'parent' relation for every kept media frame.
extract_X(
    infile=raw_media_frames,
    outfile='data/kym.parent.media.frames.csv',
    x='parent',
)

In [27]:
# 'children' relation for every kept media frame.
extract_X(
    infile=raw_media_frames,
    outfile='data/kym.children.media.frames.csv',
    x='children',
)

### For Seeds only

In [25]:
# Same three relation extractions, restricted to the seed media frames
# that were saved to disk.
with open('data/kym.seed.media.frames.json') as inF:
    data2filter = json.load(inF)
    for relation, target in (
        ('siblings', 'data/kym.seed.siblings.media.frames.csv'),
        ('parent', 'data/kym.seed.parent.media.frames.csv'),
        ('children', 'data/kym.seed.children.media.frames.csv'),
    ):
        extract_X(data2filter, target, relation)

## Extract the taxonomy of media frames

In [30]:
def extract_types(infile, outfile):
    """Write one ``(url, type)`` CSV row per type listed in an entry's details.

    Args:
        infile: Iterable of entry dicts (despite the name, not a file).
            Entries without ``m['details']['type']`` are skipped.
        outfile: Path of the CSV file to write; its header is
            ``['meme', 'type']``.
    """
    xs = []
    for m in infile:
        if 'details' not in m or 'type' not in m['details']:
            continue
        types = m['details']['type']
        # As in extract_X, a non-list value is treated as one type. Iterating
        # a bare string would otherwise emit one row per character.
        if isinstance(types, list):
            xs.extend([m['url'], t] for t in types)
        else:
            xs.append([m['url'], types])
    save2csv(outfile,['meme','type'],xs)

### For all Media Frames

In [32]:
# Type taxonomy for every kept media frame.
extract_types(
    infile=raw_media_frames,
    outfile='data/kym.types.media.frames.csv',
)

### For Seeds only

In [31]:
# Type taxonomy restricted to the seed media frames saved to disk.
with open('data/kym.seed.media.frames.json') as inF:
    data2filter = json.load(inF)
    extract_types(
        infile=data2filter,
        outfile='data/kym.seed.types.media.frames.csv',
    )