# Technical Annex 2: Data scraping for comprehensive case analysis
<div class="alert alert-block alert-danger"><b>Info:</b> This page is being developed.</div>
This notebook describes the process of identifying, downloading, extracting and cleaning data for case files published on http://act.sot.kg. This is a technical step for a comprehensive case analysis of judicial decisions in relation to discrimination in Kyrgyzstan undertaken by the UN Human Rights Office for Central Asia.

The notebook contains all necessary Python (3.8) code to execute the entire process independently. For further information, please contact Peter Naderer (peter.naderer@outlook.com).

## What do we want to achieve?

## The toolbox

## Step 1: Scraping the website and identifying data

```
def _count_result_pages(soup):
    '''
    Return how many result pages the parsed search page links to.

    The number is taken from the trailing digits of the href of the link
    inside the ``<li class="last">`` pagination item. When that item, its
    link or the trailing digits are missing, 1 is returned.
    '''
    last_item = soup.find("li", class_="last")
    anchor = last_item.find('a') if last_item is not None else None
    href = anchor.get('href', '') if anchor is not None else ''
    # Require at least one digit so an href without a page number cannot
    # produce int('').
    match = re.search(r'\d+$', href)
    # NOTE(review): assumes a result set without a "last" link fits on one page - confirm.
    return int(match.group(0)) if match else 1


def _row_to_record(row):
    '''
    Flatten one ``<tr>`` into a list: the stripped text of every ``<td>``
    first, followed by the href of the first ``<a>`` of each cell that has one.
    '''
    cells = row.find_all('td')
    texts = [cell.text.strip() for cell in cells]
    hrefs = []
    for cell in cells:
        anchor = cell.find('a')
        if anchor is not None:
            hrefs.append(anchor.get('href'))
    # The links are collected per row, so a row without links contributes
    # none instead of inheriting the links of an earlier row.
    return texts + hrefs


def souping(court, act_type):

    '''
    Collect the rows of the search-result tables on act.sot.kg.

    The function first requests the search URL to read the number of result
    pages from the pagination, then requests every page in turn and converts
    each table row into a flat list.

    Uses the names ``requests``, ``BeautifulSoup`` and ``re``, which are not
    imported inside this function and must be in scope where it is defined.

    :court Type of court; inserted into the ``court`` query parameter
    :act_type Type of the document to be downloaded; inserted into the
        ``actType`` query parameter
    :return: list with one entry per table row; each entry is a list holding
        the stripped cell texts followed by the hrefs found in that row
    '''

    url = (
        'http://act.sot.kg/ru/search?caseno=&name=&articles=&court={}&judge=all'
        '&caseOpenedFrom=&caseType=all&actType={}&caseOpenedTo=&from=&to='
        '&side1=&side2=&submit-act=%D0%90%D0%BA%D1%82%D1%8B&quantity=5000&page='
    ).format(court, act_type)
    # SET QUANTITY TO APPROPRIATE LEVEL

    print('Finding out how many pages I need to look through.')

    first_page = BeautifulSoup(requests.get(url).text, "html.parser")
    page_count = _count_result_pages(first_page)

    print('I am going to download information from {} pages'.format(page_count))

    data = []
    for page_no in range(1, page_count + 1):
        print('I am working on page {} of {}'.format(page_no, page_count))
        page = requests.get(url + str(page_no))
        soup = BeautifulSoup(page.content, "html.parser")

        table = soup.find('table')
        table_body = table.find('tbody') if table is not None else None
        if table_body is None:
            # A page without a result table has no rows to contribute.
            print('No result table found on page {}, skipping it.'.format(page_no))
            continue

        for row in table_body.find_all('tr'):
            data.append(_row_to_record(row))

    return data

if __name__ == '__main__':

    # Imported here so this script section does not depend on imports made
    # elsewhere for the two standard-library names it uses.
    import pickle
    from datetime import date

    court = ""  # an empty string leaves the `court` query parameter blank
    act_type = '2'  # 4 = resolution ("Postanovlenie") // 2 = verdict ("Prigovor")

    result = souping(court, act_type)

    # Forward slashes work on Windows as well as on POSIX systems; a
    # backslash path would be treated as a single file name outside Windows.
    out_path = '../data/sotkg_data_acttype_{}_{}.pkl'.format(act_type, date.today())
    with open(out_path, 'wb') as fh:
        pickle.dump(result, fh)
```

## Step 2: Downloading identified files

```
import os
import re
import urllib
from pathlib import Path
from urllib.request import urlretrieve
import pandas as pd
import pickle
import time
import concurrent.futures
from functools import wraps
from tqdm import tqdm

def timeit(method):
    '''
    Decorator that prints how long each call of *method* took.

    After the wrapped call returns, one line of the form
    ``<function name> => <elapsed> ms`` is printed and the call's return
    value is passed through unchanged. Nothing is printed when the call
    raises, because the exception propagates before the print.

    :method Callable to be timed
    :return: wrapper with the same name and docstring as *method*
    '''
    @wraps(method)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by adjustments of the system clock.
        started = time.perf_counter()
        result = method(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
        print(f"{method.__name__} => {elapsed_ms} ms")

        return result

    return wrapper

@timeit
def downloading_one(url, directory):
    '''
    Download the file behind *url* into *directory* unless it is already there.

    The local file name is the last path component of the URL. An HTTP error
    raised by the download is not propagated; its reason is printed instead.

    :url Address of the file to download
    :directory Folder in which the file is stored
    :return: None
    '''
    fname = Path(url).name

    # Combine the name and the downloads directory to get the local filename
    download = os.path.join(directory, fname)

    # Do not download the file if it already exists in directory. The check
    # runs before any network access, so existing files cost no request.
    if os.path.isfile(download):
        return

    try:
        urlretrieve(url, download)
    except urllib.error.HTTPError as e:
        print(e.reason)

@timeit
def downloading_bulk_singleprocess(url_list):
    '''
    Download every URL of *url_list* one after another via downloading_one().

    The target folder is not a parameter: it is read from the name
    ``directory``, which has to exist at module level when this runs.

    :url_list Addresses of the files to download
    :return: list with the return value of downloading_one() for each URL
    '''
    print('I will download {} files.'.format(len(url_list)))

    outcomes = []
    for address in url_list:
        outcomes.append(downloading_one(address, directory))
    return outcomes

@timeit
def downloading_bulk(url_list):
    '''
    This function uses downloading_one() to download a list of files with
    multi-threading (a pool of 8 worker threads) and shows a progress bar.

    The target folder is not a parameter: it is read from the name
    ``directory``, which has to exist at module level when this runs.
    An exception raised by a download is re-raised here when its result
    is collected.

    :url_list Addresses of the files to download
    :return: dict mapping each URL to the return value of downloading_one()
    '''
    results = {}
    with tqdm(total=len(url_list)) as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            # Remember which URL each future belongs to so results can be keyed by URL.
            futures = {executor.submit(downloading_one, url, directory): url for url in url_list}
            for future in concurrent.futures.as_completed(futures):
                results[futures[future]] = future.result()
                pbar.update(1)

    return results

if __name__ == '__main__':

    # Read the pickled table and take the download addresses from its
    # 'link_file' column.
    df = pd.read_pickle(r'../data/DF_acttype_4_2020-11-01.pkl')
    url_list = df['link_file'].tolist()

    # Sample inputs for a quick manual run:
    # url = 'http://act.sot.kg/act/download/190580.pdf'
    # url_list = ['http://act.sot.kg/act/download/68.pdf','http://act.sot.kg/act/download/70.pdf', 'http://act.sot.kg/act/download/83.pdf','http://act.sot.kg/act/download/94.pdf']

    # Target folder; the bulk download functions read this module-level name.
    directory = r'C:\Users\peter\Python\research_files\files'

    downloading_bulk(url_list)
```

## Step 3: Extracting text from case files

## Step 4: Data cleaning