In [1]:
import json
import os
import re
from collections import Counter

import pandas as pd
from IPython.core.display import HTML
# Styling for the rendered notebook: loads the Inter and Noto Sans JP web fonts and
# applies them (plus size, line-height and side padding) to the markdown cells.
notebook_style = """ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@600&family=Noto+Sans+JP&display=swap" rel="stylesheet"> 
<style>
    div.text_cell_render h1 {
        font-family: 'Inter';
        font-size: 1.7em;
        line-height:1.4em;
        text-align:center;
        }

    div.text_cell_render { 
        font-family: 'Noto Sans JP';
        font-size:1.05em;
        line-height:1.5em;
        padding-left:3em;
        padding-right:3em;
        }
</style>"""

# Must stay the last expression of the cell so that the notebook renders it.
HTML(notebook_style)

# Analyzing the Data

The webpages associated with the input image(s) are now downloaded and enriched with metadata. Now we can start analyzing the data. In this notebook we analyze several features of the data: 
- the diachronic frequency (when was the image published)
- the top level domains (on what websites was the image found)
- the distribution of languages

# Reformatting the Data
Before we do any (text) analysis, it is useful to assemble the data and put it all in one file. Below we pull together the information found in the .json files with dates, texts and entities. We reformat the information into one .csv (comma-separated values) file. CSV files are plain text, so they can be opened and edited in, for example, Excel or Notepad.

In [None]:
# Root folder that contains one sub-folder per photo.
# NOTE: this is an absolute, machine-specific path; change it before running elsewhere.
base_path = "/media/ruben/Data Drive/react-data/protest/carlo-batch-selection"

# Import the dates from the dates.txt file in every photo folder. Each line of dates.txt
# contains a URL and the associated date, separated by '||'. We build a dictionary of
# URL -> date pairs and leave out the URLs that have no usable date: those marked "na"
# and those whose date field contains "ERROR".
dates_ref = dict()

for photo in [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]:
    photo_folder = os.path.join(base_path, photo)
    with open(os.path.join(photo_folder, "dates.txt"), 'r') as f:
        for line in f:
            # Split on the LAST '||' so that a URL that itself contains a '|' stays intact.
            url, separator, date = line.rpartition('||')
            if not separator:
                # No '||' on this line (e.g. a blank line): there is no URL-date pair to store.
                continue
            date = date.rstrip('\n')
            if date == "na" or "ERROR" in date:
                continue
            dates_ref[url] = date

# Import the languages from the languages-<photo>.json file in every photo folder.
# The JSON is read as a two-level mapping: the inner mapping goes from an id to a sequence
# whose first element is the detected language. All ids end up in one flat
# id -> language dictionary.
language_ref = dict()

for photo in [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]:
    photo_folder = os.path.join(base_path, photo)

    with open(os.path.join(photo_folder, 'languages-' + photo + ".json"), 'r') as f:
        lang = json.load(f)

    # The keys of the outer mapping are not needed, only the inner id -> result mappings.
    for items1 in lang.values():
        for id_, lan_items in items1.items():
            # The language identifier also outputs the probability that the guess is right.
            # We don't need that, so only the first element (the language) is kept.
            language_ref[id_] = lan_items[0]
                
# Import the texts from the parsed_text.json file in the iteration folders. N.B.: for every
# iteration one .json with texts is constructed, hence the extra loop. This loop also combines
# all the information (photo, URL, date, language, domain, category, sentences) in one object.
# In this process we also extract the so-called Top Level Domain (TLD), for example
# www.facebook.com in the case of the URL www.facebook.com/user/something.
#
# NOTE(review): `URL` and `category_ref` are used below but are neither imported nor defined
# anywhere in this notebook. Presumably `URL` comes from a URL-parsing package and
# `category_ref` maps a domain to a category; define both before running this cell.
text_ref = dict()

for photo in [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]:
    photo_folder = os.path.join(base_path, photo)

    # Every sub-folder whose name contains neither "source" nor "context" counts as one iteration.
    num_iterations = len([
        fol for fol in os.listdir(photo_folder)
        if os.path.isdir(os.path.join(photo_folder, fol)) and "source" not in fol and "context" not in fol
    ])

    # Iteration folders are named <photo>_1, <photo>_2, ... inside the photo folder.
    folder_base = os.path.join(base_path, photo, photo)

    for iteration in range(1, num_iterations + 1):
        fn = os.path.join(folder_base + "_" + str(iteration), "txt", "parsed_text.json")
        with open(fn) as fp:
            pages = json.load(fp)

        for identifier, sentences in pages.items():
            # Normalise every sentence: drop newlines, lowercase, collapse runs of spaces.
            sentences = [re.sub(' +', ' ', s.replace("\n", "").lower()) for s in sentences]

            # The identifier embeds both the local html file name and the original URL.
            url = identifier.split('html_')[-1]
            id_ = identifier.split('/html/')[1].split('.html_')[0]

            # Parse the URL once and reuse the domain for both the domain and the category column.
            domain = URL.from_string(url).domain()

            text_ref[identifier] = {
                "photo": photo,
                "url": url,
                "identifier": id_,
                "date": dates_ref.get(url, "na"),  # URLs without a known date are marked "na"
                "language": language_ref[url],
                "topleveldomain": domain,
                "category": category_ref[domain],
                "sentences": "||".join(sentences),
            }

In [None]:
# Transform the constructed dictionary into a DataFrame (one row per entry of text_ref,
# with the dictionary key as first column) and export it to a .csv file.
column_names = ["path","photo","url","identifier","date","language","topleveldomain","category","sentences"]

df = pd.DataFrame.from_dict(text_ref, orient='index')
df = df.reset_index()
df.columns = column_names

df.to_csv('path/to/datafolder/data-full.csv', index=False)

# 1. Diachronic Frequency

One of the first things to inspect is the distribution of the webpages over the years. When were most pages published? Can we identify peaks, or gaps? With the data stored in a .csv we can relatively easily generate visualizations that provide an insight into the data. Below we first aggregate the number of webpages per year and plot it.

In [None]:
data = pd.read_csv('path/to/datafolder/data-full.csv')

# Get the photo and date columns. The explicit copy makes `subset` independent of `data`,
# so adding columns below does not write into a slice of another frame.
subset = data[['photo', 'date']].copy()
subset['count'] = 1  # add a count column, because every row is one observation

# Extract the year (the first four characters of the date) for every row that has a date.
# A date counts as missing when the cell is empty or when its text contains "na".
dates_as_text = subset['date'].astype(str)
missing_date = subset['date'].isna() | dates_as_text.str.contains("na", regex=False, na=False)
subset['year'] = dates_as_text.str[:4].where(~missing_date, "na")

# Remove all observations with "na".
# (`~subset['year'] == "na"` would apply `~` to the strings first and fail.)
subset = subset[subset['year'] != "na"]

# Group by photo and year and add up the counts. Only the `count` column is aggregated;
# summing the textual `date` column would merely concatenate the date strings.
subset = subset.groupby(['photo', 'year'])['count'].sum().reset_index()


# 2. Top Level Domains

# 3. Language Distribution

In [None]:
# For every photo folder, count how often each language occurs in its
# languages-<photo>.json file. The result maps photo -> {language: number of occurrences}.
d_ = dict()

for photo in os.listdir(base_path):
    photo_folder = os.path.join(base_path, photo)
    if not os.path.isdir(photo_folder):
        # Only folders hold photo data; skip loose files.
        continue

    with open(os.path.join(photo_folder, 'languages-' + photo + ".json"), 'r') as f:
        lang = json.load(f)

    # The first element of every inner value is the language; tally those.
    languages = Counter()
    for items1 in lang.values():
        languages.update(v[0] for v in items1.values())

    languages = dict(languages)
    d_[photo] = languages