In [1]:
from IPython.core.display import HTML
HTML(""" <link href="https://fonts.googleapis.com/css2?family=Inter:wght@600&family=Noto+Sans+JP&display=swap" rel="stylesheet"> 
<style>
    div.text_cell_render h1 {
        font-family: 'Inter';
        font-size: 1.7em;
        line-height:1.4em;
        text-align:center;
        }

    div.text_cell_render { 
        font-family: 'Noto Sans JP';
        font-size:1.05em;
        line-height:1.5em;
        padding-left:3em;
        padding-right:3em;
        }
</style>""")

# Working with Web Data

In the previous notebook we identified webpages that host a specific image. The URLs that refer to these webpages are found in .json files. In this notebook, we use the URLs to download the actual pages, extract their textual content and prepare the textual data for analysis. Also, we extract general features of the webpages (the metadata) for analysis. To get an idea of the .json files, we start by opening them and inspecting the list of URLs. 

First, we define some basic variables. Then we gather the list of .json files and open the first one.

In [2]:
import os,json

base_path = "C:/Users/Ruben/Documents/GitHub/ReACT_GCV/notebooks/photo_folder/" 
photo = "example_photo_1_folder"
photo_folder = os.path.join(base_path, photo)

# Check how many iterations we have by using the os.listdir function. We don't want the "source" folder because it doesn't contain jsons
num_iterations = len([fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol])
start_iter = 1
range_iter = [str(i) for i in list(range(1,num_iterations+1))]

list_jsons = []

# We now "loop" through the folders associated with the iterations and gather the .jsons in these folders
for iteration in range_iter:
    iteration_folder = os.path.join(photo_folder, photo + "_" + str(iteration))
    list_json_in_iteration_folder = [os.path.join(iteration_folder,js) for js in os.listdir(iteration_folder) if ".json" in js]
    print("Foun {} jsons in {}".format(len(list_json_in_iteration_folder),iteration_folder))
    list_jsons += list_json_in_iteration_folder

Foun 1 jsons in C:/Users/Ruben/Documents/GitHub/ReACT_GCV/notebooks/photo_folder/example_photo_1_folder\example_photo_1_folder_1
Foun 61 jsons in C:/Users/Ruben/Documents/GitHub/ReACT_GCV/notebooks/photo_folder/example_photo_1_folder\example_photo_1_folder_2


Now we can open the .json files by loading them with the json module that comes automatically with your Python installation. You can inspect or "walk" the data by selecting keys with names (```json_data['responses']```) or elements in lists (```json_data['responses'][0:10]```). To find the URLs, navigate to the ```pagesWithMatchingImages``` list.

In [3]:
with open(list_jsons[0],'r') as fr:
    json_data = json.load(fr)

# Show the first elements in the list:
json_data['responses'][0]['webDetection']['pagesWithMatchingImages'][0:2]

[{'pageTitle': 'The march of January 11th, 2015 by <b>Martin Argyroglo</b> - The Eye of ...',
  'partialMatchingImages': [{'url': 'https://loeildelaphotographie.com/wp-content/uploads/2015/01/martin-argyroglo.com_.jpg'}],
  'url': 'https://loeildelaphotographie.com/en/the-march-of-january-11th-2015-by-martin-argyroglo/'},
 {'pageTitle': '<b>Martin Argyroglo</b> on (avec images) | La liberté guidant le peuple ...',
  'partialMatchingImages': [{'url': 'https://i.pinimg.com/originals/23/c8/d6/23c8d6155449bfbaf22616b2a7f2bed3.jpg'}],
  'url': 'https://www.pinterest.com/pin/148829962662178632/'}]

As you can see, the json data consists of key:value pairs. In the 'pagesWithMatchingImages' list you can find:
1. The title of the page where the image is found
2. The link to the image file (www.example.com/media/examplephoto.png). This can be either a 'partialMatchingImage' or a 'fullMatchingImage'. The difference between the two is hard to explain, but in most cases a "fullMatch" concerns a copy of the input image, only different in scale or quality. A 'partialMatch' usually consists of for example images where the input image is only part of (for example an image of a t-shirt that includes a print of the input image.
3. The link to the page itself

# The Pipeline

To work with the metadata and text data associated with the webpages we need to extract, clean and harmonize this data. We do so by:
- downloading the webpages in .html format
- extract the text from the .html pages
- identify languages
- identify dates
- identify Named Entities

In [4]:
from functions import *

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [5]:

photo_folder = os.path.join(base_path, photo)
num_iterations = len([fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol])
start_iter = 1
range_iter = [str(i) for i in list(range(1,num_iterations+1))]
folder_base = os.path.join(photo_folder,photo)

for iteration in range_iter:

    jsFiles = Organize.gatherJson(folder_base,iteration)

    # Import previously scraped URLs if iteration number is larger than 1:
    scraped_urls = []
    if int(iteration) > 1:
        for i in range(1,int(iteration)):
            try:
                with open(os.path.join(folder_base + "_" + str(i), "html","results.txt"), 'r', encoding='utf-8') as f:
                    print("INFO: importing from {}".format(os.path.join(folder_base + "_" + str(i), "html","results.txt")))
                    lu = f.readlines()
                    lu = [l.split('|') for l in lu]
                    lu = [l for l in lu if len(l) == 2]
                    lu = [l[1].replace('\n','') for l in lu]
                    scraped_urls = scraped_urls + lu
            except FileNotFoundError:
                print(os.path.join(folder_base + "_" + str(i), "html","results.txt"), "not found")

    #Scrape All Page URLs to 'image[...]/html' folder
    destination_path = os.path.join(folder_base + "_" + str(iteration), "html")
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
    
    list_urls = list(set([j['url'] for j in jsFiles]))
    list_urls = [u for u in list_urls if u not in scraped_urls]
    print('INFO: Scraping {} URLs in iteration {}'.format(len(list_urls),iteration))

    #HTML.PoolScrape(list_urls, destination_path)

INFO: Scraping 108 URLs in iteration 1
0ef84b4a-9f3b-11ea-b1a7-b0359fc72c2e_20200526-1026.json has an error
11a80536-9f3b-11ea-a670-b0359fc72c2e_20200526-1026.json has an error
INFO: importing from C:/Users/Ruben/Documents/GitHub/ReACT_GCV/notebooks/photo_folder/example_photo_1_folder\example_photo_1_folder_1\html\results.txt
INFO: Scraping 654 URLs in iteration 2


You will now find a ```html``` folders inside the ```example_photo1_[iteration number]``` folders. The folder structure now looks like this:
```
+-- photo_folder
    +-- example_photo_1_folder
        +-- example_photo_1_1
            +-- html (location of downloaded webpages)
            +-- img (location of images used for gathering webpages)
            +-- photo_name_identifier1.json (json files)
            +-- photo_name_identifier2.json
        +-- example_photo_1_2
        +-- example_photo_1_3
        +-- example_photo_1_folder_source
```

Next, we download the text to a ```txt``` folder that is going to be located at the same level as ```html``` and ```img```. In this folder, we gather all the webpage texts in one single .json file (for easy loading during the analysis). 

Extracting text from webpages is everything but a straightforward process, because it is not clear beforehand what is relevant. For example, text from ads or menubars are not relevant to the research. Luckily, there is software available that "parses" relevant texts. In this pipeline, we use a Python implementation of [boilerpipe](https://github.com/misja/python-boilerpipe) a piece of software that removes clutter and irrelevant bits from webpage texts. Boilerpipe offers various options. Inside the ```function.py``` you will find the setup of our use of the parser. To make things easier we handle the parsing in a separate function, to be called over the different iteration folders. 


In [6]:
    ParseText.Parse(os.path.join(folder_base+ "_" + str(iteration), "html"))

Looking for results.txt in C:/Users/Ruben/Documents/GitHub/ReACT_GCV/notebooks/photo_folder/example_photo_1_folder\example_photo_1_folder_2\html
INFO: parsing tekst from 1262 files


100%|██████████████████████████████████████████████████████████████████████████████| 1262/1262 [00:58<00:00,  4.51it/s]


Based on the URL text and the webpage text, we can now identifiy the language of the webpage. Here, we make use of ```langid``` a language identification library for Python. Below we identify the languages in a similar way as the we extracted the texts: by iterating over the folders. We write the languages to a .json file that is located in the main photo folder. The language identifier returns a probability score and the best guess language. 

In [7]:
photo_folder = os.path.join(base_path, photo)
num_iterations = len([fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol])
start_iter = 1
range_iter = [str(i) for i in list(range(1,num_iterations+1))]
folder_base = os.path.join(base_path,photo,photo)

language_dict = dict()
for iteration in tqdm(range_iter):
    language_dict.update({str(iteration):dict()})
    list_json= [js for js in os.listdir(os.path.join(base_path,photo,photo + "_" + str(iteration),"txt")) if ".json" in js]

    df = pd.DataFrame()
    if len(list_json) > 0:
        for js in list_json:
            with open(os.path.join(base_path,photo,photo + "_" + str(iteration),"txt", js)) as f:
                d_ = json.load(f)
            val = [" ".join(i) for i in d_.values()]
            ids = [i for i in d_.keys()]
            val = pd.DataFrame([ids,val]).T
            val.columns = ['id','text']
            df = df.append(val)
    else:
        print('no .json files found')
    df['url'] = [i.split('.html_')[1] for i in df['id']]


    for c,url in enumerate(df['url']):
        language_score = Language.ParseUrl(url)
        if language_score is None or language_score[1] < 0.7:
            try:
                language_score = Language.ParseText(str(df['text'][c])[1:-1])
                language_score.append('text')
            except Exception as e:
                continue
        else:
            language_score.append('url')
        language_dict[str(iteration)].update({url:[language_score[0],language_score[1],language_score[2]]})

# Write Detected Languages to language.json
with open(os.path.join(base_path,photo,'languages-{}.json'.format(photo)), 'w') as fp:
    json.dump(language_dict, fp)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.77s/it]


With the languages written to a .json, we now repeat a similarly looking procedure for the dates. The ```htmldate``` module identifies the date of publication for a URL. Because the module depends on the avaiability of information embedded in the html, it does not cover all the URLs, but enough to get an idea of the temporal distribution of our data. We write the extracted dates to ```dates.json``` in the main photo folder.

In [None]:
scraped_urls = dict()

for iteration in range_iter:
    try:
        with open(os.path.join(base_path, photo, photo + "_" + str(iteration), "html", "results.txt"), 'r', encoding='utf-8') as f:
            lu = f.readlines()
        lu = [l.split('|') for l in lu]
        lu = [l for l in lu if len(l) == 2]
        lu = [l[1].replace('\n','') for l in lu]
        print("---- {} dates found in iteration {}".format(len(lu),iteration))
        scraped_urls.update({str(iteration):lu})
    except Exception as e:
        print("Error: ",e)

dates_dict = dict()
for it,list_ in scraped_urls.items():
    dates_dict.update({str(it):dict()})
    print('---- Scraping Dates Iteration {}, {} URLs'.format(it,len(list_)))
    if sampling == True:
        print('----- Sampling with Size {}'.format(sample_size))
        if len(list_) < sample_size:
            list_ =list_
        else:
            list_ = random.sample(list_,sample_size)

    for u in tqdm(list_):
        try:
            date = WebPage.gatherSingleDate(u)
            dates_dict[str(it)].update({u:date})
        except Exception as e:
            continue

with open(os.path.join(base_path,photo,'dates-{}.json'.format(photo)), 'w') as fp:
    json.dump(dates_dict, fp)

Lastly, we identify Named Entities in our text data. This method extracts entities (locations, persons, dates etc.) from the text, based on pretrained models. We use the popular Spacy models (available for english, dutch, italian, french, spanish, portugese) to do this. Named Entities can be used to study the context of certain keywords, and the phenomena associated with the photo.

In [None]:
# F. Named Entitiy Recognition using Spacy
print("INFO: Language Detection & Named Entitiy Recognition using Spacy")

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
selected_languages = "en de fr es it nl pt".split(' ')
selected_languages = {i:i+"_core_news_sm" for i in selected_languages}
selected_languages.update({"en":"en_core_web_sm"})

def PreProc(text):
    text = text[1:-1].replace('\xa0', ' ')
    text = " ".join(text.split('\r\n'))
    return text

d_ = dict()
for iteration in range_iter:

    # Language Detection
    list_csv = [csv for csv in os.listdir(os.path.join(base_path, photo, photo + "_" + str(iteration),"txt")) if ".csv" in csv]

    df= pd.DataFrame()
    for csv in list_csv:
        tmp = pd.read_csv(os.path.join(base_path, photo, photo + "_" + str(iteration),"txt",csv))
        df = df.append(tmp)

    df['text'] = [PreProc(str(i)) for i in df['text']]
    df['lang'] = [identifier.classify(i)[0] for i in df['text']]
    df.to_csv(os.path.join(base_path, photo, "text-language-{}.csv".format(photo)),index=False)

    # NER
    for lang in [i for i in list(set(df['lang'])) if i in selected_languages.keys()]:
        if lang not in d_.keys():
            d_.update({lang:dict()})
        nlp = spacy.load(selected_languages[lang])
        tmp = df[df['lang'] == lang]

        for count,text in enumerate(df['text']):
            identif = str(df['id'][count])
            d_[lang].update({identif:dict()})
            d_[lang][identif].update({"text":text})
            doc = nlp(text)
            doc = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            d_[lang][identif].update({"entities":doc})

with open(os.path.join(base_path, photo,"entities-{}.csv".format(photo)), 'w') as fp:
    json.dump(language_dict, fp)