In [None]:
from IPython.core.display import HTML

# Notebook styling: pulls the "Inter" and "Noto Sans JP" web fonts from Google
# Fonts and applies them (plus size, line-height and padding tweaks) to the
# rendered markdown cells (div.text_cell_render) and their h1 headings.
notebook_style_html = """ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@600&family=Noto+Sans+JP&display=swap" rel="stylesheet"> 
<style>
    div.text_cell_render h1 {
        font-family: 'Inter';
        font-size: 1.7em;
        line-height:1.4em;
        text-align:center;
        }

    div.text_cell_render { 
        font-family: 'Noto Sans JP';
        font-size:1.05em;
        line-height:1.5em;
        padding-left:3em;
        padding-right:3em;
        }
</style>"""

# The HTML object has to be the last expression of the cell so that the
# notebook renders it (and thereby injects the stylesheet into the page).
HTML(notebook_style_html)

# Working with Web Data

In the previous notebook we identified webpages that host a specific image. The URLs that refer to these webpages are found in .json files. In this notebook, we use the URLs to download the actual pages, extract their textual content and prepare the textual data for analysis. Also, we extract general features of the webpages (the metadata) for analysis. To get an idea of the .json files, we start by opening them and inspecting the list of URLs. 

First, we define some basic variables. Then we gather the list of .json files and open the first one.

In [None]:
import os,json

# Location of the data and the name of the photo we are working on.
# Adjust these two values to match your own folder layout.
base_path = "D:/test/" 
photo = "9"
photo_folder = os.path.join(base_path, photo)

# Check how many iterations we have by using the os.listdir function.
# Every sub-directory of the photo folder counts as one iteration, except the
# "source" folder, which doesn't contain jsons.
num_iterations = len([fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol])
start_iter = 1
range_iter = [str(i) for i in range(start_iter, num_iterations + 1)]

list_jsons = []

# We now "loop" through the folders associated with the iterations and gather the .jsons in these folders
for iteration in range_iter:
    iteration_folder = os.path.join(photo_folder, photo + "_" + iteration)

    # The count above includes every non-"source" directory, so a folder with
    # the expected "<photo>_<n>" name is not guaranteed to exist. Skip it with
    # a message instead of aborting the whole cell.
    if not os.path.isdir(iteration_folder):
        print("WARNING: iteration folder {} not found, skipping".format(iteration_folder))
        continue

    # endswith() only accepts real .json files (".json" in name would also match
    # e.g. "x.json.bak"); sorting makes the result, and therefore list_jsons[0],
    # independent of the arbitrary order returned by os.listdir.
    list_json_in_iteration_folder = [
        os.path.join(iteration_folder, js)
        for js in sorted(os.listdir(iteration_folder))
        if js.endswith(".json")
    ]
    print("Found {} .json files in {}".format(len(list_json_in_iteration_folder),iteration_folder))
    list_jsons += list_json_in_iteration_folder

Now we can open the .json files by loading them with the json module that comes automatically with your Python installation. You can inspect or "walk" the data by selecting keys with names (```json_data['responses']```) or elements in lists (```json_data['responses'][0:10]```). To find the URLs, navigate to the ```pagesWithMatchingImages``` list.

In [None]:
# Load the first .json file found by the previous cell.
# JSON is exchanged as UTF-8, so the encoding is stated explicitly: without it
# open() falls back to the platform's locale encoding, and page titles with
# non-ASCII characters could be mis-decoded or fail to load on some systems.
first_json_path = list_jsons[0]
with open(first_json_path, 'r', encoding='utf-8') as fr:
    json_data = json.load(fr)

# Show the first elements in the list:
json_data['responses'][0]['webDetection']['pagesWithMatchingImages'][0:2]

As you can see, the json data consists of key:value pairs. In the 'pagesWithMatchingImages' list you can find:
1. The title of the page where the image is found
2. The link to the image file (www.example.com/media/examplephoto.png). This can be either a 'partialMatchingImage' or a 'fullMatchingImage'. The difference between the two is hard to pin down, but in most cases a 'fullMatchingImage' is a copy of the input image that differs only in scale or quality. A 'partialMatchingImage' is usually an image of which the input image is only a part (for example, a photo of a t-shirt that carries a print of the input image).
3. The link to the page itself

# The Pipeline

To work with the metadata and text data associated with the webpages we need to extract, clean and harmonize this data. We do so by:
- downloading the webpages in .html format
- extracting the text from the .html pages
- identifying languages
- identifying dates
- identifying Named Entities

Because these individual steps require a lot of code, we decided to import some classes and functions that do most of the work. If you want to see what happens when we parse the texts, languages, dates and entities, just open ```functions.py``` in the notebook folder!

In [None]:
from functions import *

In [None]:
# Discover the iteration folders: every sub-directory of the photo folder
# except the "source" folder counts as one iteration.
photo_folder = os.path.join(base_path, photo)
num_iterations = len([fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol])
start_iter = 1
range_iter = [str(i) for i in range(start_iter, num_iterations + 1)]
# Iteration folders are named "<photo>_<n>" and live inside the photo folder.
folder_base = os.path.join(photo_folder,photo)

for iteration in range_iter:
    
    # Import page URLs from .json files (using Json.extract_pages())
    iteration_path = folder_base + "_" + iteration
    # endswith() instead of a substring test, so "x.json.bak" is not picked up
    jsFiles = [js for js in os.listdir(iteration_path) if js.endswith(".json")]

    list_urls = []
    for js in jsFiles:
        json_file_path = os.path.join(iteration_path, js)
        list_urls += Json.extract_pages(json_file_path)
            
    # Import previously scraped URLs from all earlier iterations.
    # For the first iteration range(1, 1) is empty, so nothing is imported.
    scraped_urls = []
    for i in range(1, int(iteration)):
        results_path = os.path.join(folder_base + "_" + str(i), "html", "results.txt")
        try:
            with open(results_path, 'r', encoding='utf-8') as f:
                print("INFO: importing from {}".format(results_path))
                for line in f:
                    # Only lines made of exactly two '|'-separated fields are
                    # used; the second field is taken to be the URL.
                    # NOTE(review): presumably the format is "<id>|<url>" -
                    # confirm against pagescraper in functions.py.
                    fields = line.split('|')
                    if len(fields) == 2:
                        scraped_urls.append(fields[1].replace('\n',''))
        except FileNotFoundError:
            print(results_path, "not found")

    #Scrape All Page URLs to 'image[...]/html' folder
    destination_path = os.path.join(iteration_path, "html")
    os.makedirs(destination_path, exist_ok=True)
    
    # Set lookup keeps the filtering linear in the number of URLs instead of
    # scanning the whole list of scraped URLs for every candidate.
    already_scraped = set(scraped_urls)
    list_urls = [u for u in list_urls if u not in already_scraped]
    print('INFO: Scraping {} URLs in iteration {}'.format(len(list_urls),iteration))

    pagescraper.PoolScrape(list_urls, destination_path)

You will now find an ```html``` folder inside each of the ```example_photo_1_[iteration number]``` folders. The folder structure now looks like this:
```
+-- photo_folder
    +-- example_photo_1_folder
        +-- example_photo_1_1
            +-- html (location of downloaded webpages)
            +-- img (location of images used for gathering webpages)
            +-- photo_name_identifier1.json (json files)
            +-- photo_name_identifier2.json
        +-- example_photo_1_2
        +-- example_photo_1_3
        +-- example_photo_1_folder_source
```

Next, we extract the text from the downloaded pages and store it in a ```txt``` folder that is located at the same level as ```html``` and ```img```. In this folder, we gather all the webpage texts in one single .json file (for easy loading during the analysis). 

Extracting text from webpages is anything but a straightforward process, because it is not clear beforehand what is relevant. For example, text from ads or menu bars is not relevant to the research. Luckily, there is software available that "parses" the relevant text. In this pipeline, we use a Python implementation of [boilerpipe](https://github.com/misja/python-boilerpipe), a piece of software that removes clutter and irrelevant bits from webpage texts. Boilerpipe offers various options; inside ```functions.py``` you will find how we set up the parser. To make things easier, we handle the parsing in a separate function: simply pass it the base path and the photo. Check the ```functions.py``` file to see how it works.

In [None]:
    # Extract the text of the downloaded pages for this photo. Parse comes from
    # the star import of functions.py; see that file for what exactly is written
    # (a later cell reads "<iteration folder>/txt/parsed_text.json").
    # NOTE(review): this call is indented although it is a top-level statement;
    # confirm the notebook front-end tolerates it - a plain .py export of this
    # cell would fail with an IndentationError.
    Parse.Texts(base_path,photo)

Based on the URL text and the webpage text, we can now identify the language of the webpage. Here, we make use of ```langid```, a language identification library for Python. Below we identify the languages in a similar way as we extracted the texts: by iterating over the folders. We write the languages to a .json file that is located in the main photo folder. The language identifier returns the best-guess language and a probability score. First, a quick demonstration of the langid module:

In [None]:
# Build the language identifier once; the same `identifier` object is reused
# further down in the notebook.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Quick demonstration: classify a single example sentence and print the result.
sample_sentence = "this is a test sentence that could be in english"
sample_result = identifier.classify(sample_sentence)
print(sample_result)

To scrape the languages, we need to open the .json file that contains the texts. Every iteration folder has such a .json. We thus need to iterate over the folders. Because we already know how to do this, we can use the predefined Parse.Languages() function here:

In [None]:
# Identify the language of every page for this photo. The work is done by the
# predefined Parse.Languages helper (star-imported from functions.py); see that
# file for the exact output location and format.
Parse.Languages(base_path,photo)

With the languages written to a .json, we now repeat a similar procedure for the dates. The ```htmldate``` module identifies the date of publication for a URL. Because the module depends on the availability of information embedded in the html, it does not cover all the URLs, but it covers enough to get an idea of the temporal distribution of our data. We write the extracted dates to ```dates.json``` in the main photo folder. An example of the library:

In [None]:
import htmldate

# Demonstration of htmldate on a single example page: look up the date for the
# URL and print whatever the library returns.
example_url = "http://www.eleggua.com/Objects/WTO/Death.html"
example_date = htmldate.find_date(example_url)
print(example_date)

Again, running this function over all the URLs stored in the different folders is handled by a predefined function:

In [None]:
# Extract the dates for all pages of this photo. The work is done by the
# predefined Parse.Dates helper (star-imported from functions.py); see that
# file for the exact output location and format.
Parse.Dates(base_path, photo)

Lastly, we identify Named Entities in our text data. This method extracts entities (locations, persons, dates etc.) from the text, based on pretrained models. We use the popular spaCy models (here for English, German, French, Spanish, Italian, Dutch and Portuguese) to do this. Named Entities can be used to study the context of certain keywords, and the phenomena associated with the photo.

In [None]:
#########################
# Discover the iteration folders: every sub-directory of the photo folder
# except the "source" folder counts as one iteration.
photo_folder = os.path.join(base_path, photo)
num_iterations = len([fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol])
start_iter = 1
range_iter = [str(i) for i in range(start_iter, num_iterations + 1)]
folder_base = os.path.join(base_path,photo,photo)
#########################

print("INFO: Named Entitiy Recognition using Spacy")

# Map each language code to the spaCy model used for it: "<code>_core_news_sm"
# for every language, except English, which uses "en_core_web_sm".
selected_languages = "en de fr es it nl pt".split(' ')
selected_languages = {i:i+"_core_news_sm" for i in selected_languages}
selected_languages.update({"en":"en_core_web_sm"})

def PreProc(text):
    """Lightly clean a text string.

    Drops the first and last character, replaces non-breaking spaces
    ('\\xa0') with regular spaces and joins lines separated by '\\r\\n'
    with a single space.

    NOTE(review): not called anywhere in this cell; presumably the dropped
    first/last characters are the brackets of a stringified list - confirm
    before relying on it.
    """
    text = text[1:-1].replace('\xa0', ' ')
    text = " ".join(text.split('\r\n'))
    return text

# spaCy models are loaded once per language and reused across iterations;
# loading a model is expensive and does not depend on the iteration.
loaded_models = {}

# Result structure: {language: {page id: {"text": ..., "entities": [...]}}}
d_ = {}
for iteration in range_iter:
    fn = os.path.join(folder_base + "_" + iteration, "txt", "parsed_text.json")
    # Check for the file itself (not just the "txt" folder), so that an existing
    # but empty folder is reported and skipped instead of crashing in open().
    if not os.path.isfile(fn):
        print('INFO: no parsed text found in iteration {}'.format(iteration))
        continue

    with open(fn) as fp:
        pages = json.load(fp)

    rows = []
    for page_key, sentences in pages.items():

        # Normalise the sentences: drop newlines, lowercase, collapse runs of spaces.
        # NOTE(review): lowercasing happens before NER; confirm this is intended.
        sentences = [s.replace("\n","").lower() for s in sentences]
        sentences = [re.sub(' +', ' ', s) for s in sentences]

        # The key is expected to look like ".../html/<id>.html_<url>":
        # the part after the last 'html_' is the URL, the part between
        # '/html/' and '.html_' is the page id.
        url = page_key.split('html_')[-1]
        page_id = page_key.split('/html/')[1].split('.html_')[0]
        text = str(sentences)
        language = identifier.classify(text)[0]
        rows.append([url, page_id, language, text])

    df = pd.DataFrame(rows, columns=['url','id','lang','text'])

    # NER, one language at a time, restricted to languages we have a model for.
    # sorted() only makes the processing/log order deterministic.
    for lang in sorted(set(df['lang']) & set(selected_languages)):
        print('INFO: working on language: {}'.format(lang))
        if lang not in loaded_models:
            loaded_models[lang] = spacy.load(selected_languages[lang])
        nlp = loaded_models[lang]
        lang_entities = d_.setdefault(lang, dict())

        # Only the pages detected as `lang` are processed with this model.
        # (Iterating over the full frame would run every page through every
        # language model and file it under every language.)
        tmp = df[df['lang'] == lang]

        for identif, text in zip(tmp['id'], tmp['text']):
            identif = str(identif)
            doc = nlp(text)
            entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            lang_entities[identif] = {"text": text, "entities": entities}

with open(os.path.join(base_path, photo,"entities-{}.json".format(photo)), 'w') as fp:
    json.dump(d_, fp)