In [163]:
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import sleep
import os
import json

In [2]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has exactly ``n`` items except possibly the last one, which
    holds whatever is left over. ``lst`` must support ``len()`` and slicing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [182]:
def convert_to_arabic(string):
    """Replace Arabic-Indic digits (٠-٩) in ``string`` with ASCII digits 0-9.

    "Arabic" in the name refers to the Western/"Arabic numeral" digits 0-9
    that the result is written in. Any character that is not one of the ten
    Arabic-Indic digits (ASCII digits, letters, punctuation, ...) is kept
    unchanged instead of raising ``KeyError``.

    :param string: text that may contain Arabic-Indic digits
    :return: a new string with those digits converted
    """
    numbers = {'٠':'0', '١':'1', '٢':'2', '٣':'3', '٤':'4', '٥':'5', '٦':'6', '٧':'7', '٨':'8', '٩':'9'}
    # str.translate leaves characters that are missing from the table as-is,
    # so mixed or already-ASCII input is handled without an exception.
    return string.translate(str.maketrans(numbers))


In [183]:
def prep_cases_links(i,n):
    """
    Build the case-page URLs https://sjp.moj.gov.sa/Filter/AhkamDetails/[number]
    for every [number] in range(i, n).

    :param i: first page number to include
    :param n: page number to stop at; like range(), n itself is NOT included
    :return: a list of URL strings, one per page number, in ascending order
    """
    prefix = r"https://sjp.moj.gov.sa/Filter/AhkamDetails/"
    return [prefix + str(page_no) for page_no in range(i, n)]

In [187]:
def fetch_pages(links):
    '''Fetch every case page in ``links``, parse it and save it as JSON.

    For each URL the page is downloaded, its five main sections and the first
    three plain ``panel-body`` sections are parsed with the ``get_*`` helpers,
    the resulting dicts are merged, and the merged dict is written to
    ``data/<case_no>_<page_no>.json``. A link that cannot be downloaded or
    parsed is reported with ``print`` and skipped; nothing is written for it.

    :param links: iterable of case-page URLs (e.g. from ``prep_cases_links``)
    :return: None; the results are the files written under ``data/``
    '''

    # NOTE(review): the Access-Control-* entries are CORS *response* headers
    # and most likely have no effect when sent on a request - confirm and trim.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
    for link in links:
        try:
            # headers must be passed by keyword: the second positional
            # argument of requests.get is ``params`` (the query string).
            # NOTE(review): verify=False disables TLS certificate checks; it is
            # kept from the original - confirm that it is really required.
            # The timeout stops one unresponsive page from hanging the run.
            req = requests.get(link, headers=headers, verify=False, timeout=30)
            # Turn HTTP error statuses into a clear error message instead of
            # an opaque "list index out of range" from the parsing below.
            req.raise_for_status()
            soup = BeautifulSoup(req.content, 'html.parser')
            # All main sections have the same class
            main_sections = soup.find_all(class_="panel-body main2-color")
            # Detail sections: divs whose class list is exactly ['panel-body']
            details = soup.find_all(lambda tag: tag.name == 'div' and
                                       tag.get('class') == ['panel-body'])
            metadata = get_metadata(main_sections[0]) #Get metadata dict
            legal_basis = get_legal_basis(main_sections[1])
            summary = get_summary(main_sections[2])
            transcript = get_trans(main_sections[3])
            appeal_decision = get_appeal_decision(main_sections[4])
            # Additional details
            categories = get_categories(details[0])
            topics = get_topics(details[1])
            title = get_title(details[2])
            # Merge all dicts
            data = {
                **metadata, **legal_basis,**summary,
                **transcript,**appeal_decision,**categories
                , **topics, **title
            }
            # The output file is named after the first run of digits in the
            # case number plus the page id taken from the end of the URL.
            case_no = re.findall(r'\d+', metadata['case_no'])[0]
            page_no = os.path.split(link)[-1]
            filename = convert_to_arabic(case_no) +"_"+page_no
        except Exception as e:
            print('Error in reading the source ..',str(e))
            # Skip the write step: ``data``/``filename`` are either undefined
            # or still hold the previous link's values at this point.
            continue
        # Serialize data into file:
        try:
            os.makedirs('data', exist_ok=True)
            # ``with`` guarantees the file is flushed and closed.
            with open('data/' + filename + ".json", 'w',
                      encoding='utf-8-sig') as out_file:
                json.dump(data, out_file, ensure_ascii=False)
        except Exception as e:
            print('Error in writing the file..',str(e))


In [188]:
# Page ids to scrape. prep_cases_links uses range(), so END_PAGE_ID itself is
# not fetched. The get_* helpers called by fetch_pages are defined in later
# cells of this notebook; run those cells before this one.
FIRST_PAGE_ID = 24340
END_PAGE_ID = 24539
links = prep_cases_links(FIRST_PAGE_ID, END_PAGE_ID)
fetch_pages(links)



Error in reading the source .. list index out of range




Error in reading the source .. list index out of range




Error in reading the source .. list index out of range




All five main sections share the same class name, `panel-body main2-color`. They appear in the following order:

1: case metadata (e.g. date and court location)

2: legal basis (السند الشرعي أو النظامي)

3: case summary 

4: transcript 

5: appeal decision (قرار محكمة الاستئناف) 

In [92]:
def get_metadata(metadata):
    """Collect the case metadata fields from the metadata section.

    The text of the first four ``h5`` elements is stored under ``court``,
    ``city``, ``appeal_court`` and ``appeal_city``; the text of the first four
    ``span`` elements under ``case_no``, ``case_date``, ``decision_no`` and
    ``decision_date``. ``IndexError`` is raised if fewer than four of either
    element are present.

    :param metadata: parsed section exposing ``find_all``
    :return: dict with the eight keys above, in that order
    """
    headings = metadata.find_all('h5')
    values = metadata.find_all('span')
    heading_keys = ('court', 'city', 'appeal_court', 'appeal_city')
    value_keys = ('case_no', 'case_date', 'decision_no', 'decision_date')
    record = {}
    for position, key in enumerate(heading_keys):
        record[key] = headings[position].text
    for position, key in enumerate(value_keys):
        record[key] = values[position].text
    return record

def get_legal_basis(basis):
    """Return the legal-basis text of a case.

    :param basis: parsed legal-basis section; its first ``<p>`` holds the text
    :return: ``{'legal_basis': <text of that paragraph>}``
    """
    # Read from the ``basis`` argument. The original body referenced an
    # undefined name ``legal_basis``, which only worked when a stale global
    # of that name happened to exist in the kernel.
    return {'legal_basis': basis.p.text}

def get_summary(summary):
    """Return ``{'summary': text}`` using the first ``<p>`` of the section."""
    paragraph = summary.p
    return dict(summary=paragraph.text)

def get_trans(trans):
    """Return ``{'trans': text}`` using the first ``<p>`` of the section."""
    paragraph = trans.p
    return dict(trans=paragraph.text)

def get_appeal_decision(appeal_decision):
    """Return ``{'appeal_decision': text}`` using the first ``<p>`` of the section."""
    paragraph = appeal_decision.p
    return dict(appeal_decision=paragraph.text)

Case details, read from the plain `panel-body` sections in this order:
1. Categories
2. Topics
3. Title

In [91]:
def get_categories(categories):
    """Return ``{'categories': [...]}`` with the text of every ``span`` in the section."""
    labels = []
    for span in categories.find_all('span'):
        labels.append(span.text)
    return dict(categories=labels)

def get_title(title):
    """Return ``{'title': text}`` using the first ``span`` of the section."""
    first_span = title.span
    return dict(title=first_span.text)

def get_topics(topics):
    """Return ``{'topics': [...]}`` from the text of every ``a`` tag in the section.

    Hyphens in each link text are replaced by spaces.
    """
    names = []
    for anchor in topics.find_all('a'):
        names.append(anchor.text.replace('-',' '))
    return dict(topics=names)