## Wavelenght.law assignment (1)

**Relevant information**

How to extract data from a text: https://www.computerhope.com/issues/ch001721.htm


**Exercice – Data Extraction**

[File: Data Extraction.xlsx]

<u>Actions</u>

1) Create an application/script to extract the first 30 results from a publicly available resource

a. File/[Template]/[C1]

2) Extract information from the collected data as specified in the template and save to an Excel file

In [1]:
# Import relevant libraries
import urllib3
from bs4 import BeautifulSoup
import re as re
import pandas as pd

In [2]:
# Scrape 1st page
url = 'http://www.bailii.org/cgi-bin/lucy_search_1.cgi?query=%22Planning+court%22+AND+%28%22The+Royal+Courts+of+Justice%22+OR+%22Supreme+Court%22+OR+%22Manchester+Civil+Justice+Centre%22%29&datelow=&datehigh=&sort=date&highlight=1'
http = urllib3.PoolManager()
response = http.request('GET', url)
soup = BeautifulSoup(response.data)

In [3]:
# Find links
links = []
for data in soup.find_all('li'):
    for a in data.find_all('a'):
        links.append('http://www.bailii.org'+a.get('href')) #for getting link
# Disregard non relevant links (one on two)
links = links[1::2]

In [4]:
# Extract cases content, clean html tag and store cases in list
list_cases = []
counter = 0
for i in links:
    case_request = http.request('GET',i)
    counter += 1
    list_cases.append(BeautifulSoup(case_request.data).text)
    
# Clean text and split text in list of lines
clean_cases = []
for i in list_cases:
    text = str(i).split('\n')    
    text = list(filter(None,text))
    clean_cases.append(text)

## 1. Case name, url, dates, week, cite as, case number

In [5]:
# Extract case name, url, n*
case_name = []
case_url = []
for i in clean_cases:
    case_name.append(i[23:24])
    case_url.append(i[24:25])

In [6]:
# Create pandas dataframe with Case name
df = pd.DataFrame(case_name, columns=['case_name'])
# Add url
df['url'] = [i.replace('URL: ','') for i in list(map(''.join,case_url))]
# Add dates
case_date = [re.findall('(\d\d\s[a-z]+\s\d{4})',str(i),re.IGNORECASE) for i in case_name]
df['date'] = list(map(''.join,case_date))
df['date'] = pd.to_datetime(df['date'])
# Add week
df['week'] = df['date'].dt.strftime('%YWk%w')
# Cite as
df['cite_as'] = [i[25:28] for i in clean_cases]

In [7]:
df.head(3)

Unnamed: 0,case_name,url,date,week,cite_as
0,Binning Property Corporation Ltd v Secre...,http://www.bailii.org/ew/cases/EWCA/Civ/20...,2019-02-28,2019Wk4,"[Cite as: , [2019] EWCA Civ 250, ]"
1,Visao Ltd v The Secretary of State for H...,http://www.bailii.org/ew/cases/EWHC/Admin/...,2019-02-14,2019Wk4,"[Cite as: , [2019] EWHC 276 (Admin), ]"
2,South Gloucestershire Council v Secretar...,http://www.bailii.org/ew/cases/EWHC/Admin/...,2019-02-07,2019Wk4,"[Cite as: , [2019] EWHC 181 (Admin), ]"


In [8]:
# Function to replace values

# Function to extact elements between tag
def extractor(tag):
    '''
    Function used to extract elements within relevant html tag
    tag (string): html tag to search
    Return a dictionary with the values extracted. Key is esquivalent to the decision index in pd dataframe.
    ''' 
    dictionary = {}
    counter = -1 
    for i in links:
        counter += 1
        url = i
        response = http.request('GET', i)
        soup = BeautifulSoup(response.data)
        var = soup.select(tag)
        dictionary[counter] = re.sub("<.*?>", " ", str(var[0]))

    return dictionary   

## 2. Case num

In [9]:
case_num = extractor('casenum')
df['casenum'] = df.index.map(case_num)

## 3. Judges

In [10]:
judges_all = extractor('panel')
df['judges'] = df.index.map(judges_all)

## 4. Court names

In [11]:
court_all = extractor('court')
df['court'] = df.index.map(court_all)

## 5. Authorithies

In [12]:
authorities = extractor('parties')
authority_all = []
for key, value in authorities.items():
    case = []
    test = [s.strip() for s in re.split("and|\n", value)]
    for i in test:
        if any(re.findall('COUNCIL|BOROUGH|CITY|AUTHORITY', i,re.IGNORECASE)):
                    case.append(i)
    authority_all.append(case)
df['authorities'] = authority_all

In [13]:
df.head(2)

Unnamed: 0,case_name,url,date,week,cite_as,casenum,judges,court,authorities
0,Binning Property Corporation Ltd v Secre...,http://www.bailii.org/ew/cases/EWCA/Civ/20...,2019-02-28,2019Wk4,"[Cite as: , [2019] EWCA Civ 250, ]",Case No: C1/2018/1297,Lady Justice Sharp and Lord Justice Lindblom,IN THE COURT OF APPEAL (CIVIL DIVISION) ON AP...,[- London Borough of Hav...
1,Visao Ltd v The Secretary of State for H...,http://www.bailii.org/ew/cases/EWHC/Admin/...,2019-02-14,2019Wk4,"[Cite as: , [2019] EWHC 276 (Admin), ]",Case No: C0/3981/2018,NEIL CAMERON QC (Sitting as a Deputy High Co...,IN THE HIGH COURT OF JUSTICE QUEEN'S BENCH DI...,[CHILTERN DISTRICT COUNCIL]


## 6. Relevant act

In [14]:
# Extracting body data, appending acts to list & creating dictionary 
cases_body = []
list_acts = []
dict_acts = {}
counter = 0

for case in links:
    response = http.request('GET', case)
    soup_case = BeautifulSoup(response.data)
    body = soup_case.body
    cases_body.append(body.find_all(['li','p'])) 
    for i in re.findall('(section.+?\s\d{4})',str(cases_body[0]),re.IGNORECASE):
        if len(i) <= 100:
            list_acts.append(i)       
    dict_acts[counter] = list_acts
    list_acts = []
    cases_body = []
    counter += 1    

In [15]:
# Formatting dictionnary to pandas dataframe and joining to current df
act_index = []
act_list = []

for keys, values in dict_acts.items():
    for values in dict_acts[keys]:
        # itterating trough values and adding to seperate list + filling empty values
        act_index.append(keys)
        act_list.append(values)


d = {'act_index':act_index,'act_list':act_list}

# Create pandas data frame 
act_dataframe = pd.DataFrame(d)

In [16]:
# Counting number of cases cited several times and removing duplicate values
act_dataframe['nr_act'] = act_dataframe['act_list'].apply(act_dataframe['act_list'].tolist().count)
act_dataframe = act_dataframe.drop_duplicates(subset=['act_index','act_list'])
act_dataframe.sort_values(by=['act_index'], inplace=True, ascending=True)

In [17]:
# String formatting 
act_dataframe['nr_act'] = act_dataframe['nr_act'].apply(lambda x: '{'+str(x)+'}')

# Cleaning of data frame 
act_dataframe['act_list'] = act_dataframe['act_list'] + ' // ' + act_dataframe['nr_act']
act_dataframe = act_dataframe.drop(columns=['nr_act'])


In [18]:
# creation of dictionary of list based on case_index
act_dict = {}
temp_list = []
counter = 0

for i in range(0,10):
    temp_list = []
    for case in act_dataframe.iterrows():
        if i == case[1][0]:
            temp_list.append(case[1][1])
        else:
            act_dict[i] = temp_list
                   
df['cited_act'] = df.index.map(act_dict)

## 7. Key cases

In [19]:
# Extracting body raw data
cases_body = []
for i in links:
    response = http.request('GET', i)
    soup_case = BeautifulSoup(response.data)
    body = soup_case.body
    cases_body.append(body.find_all(['li','p']))  

In [20]:
# Creating a dicitonary with cases & relevant cases per decision
key_cases = {}
counter = -1
for case in range(0,len(links)):
    cases = []
    
    # Extract key cases based on html tags
    for i in range(0,len(cases_body[case])):
        for val in cases_body[case][i].findAll(['i','a','u'],attrs= {"name":False}):
            try:
                if (val.name == 'i' or 'u') and 'v.' in val.string:
                    cases.append(val.string)
                elif (val.name == 'i' or 'u') and (' v ') in val.string:
                    cases.append(val.string)
                else:
                    pass
                if (val.get('href') == None) or ('cgi-bin' in val.get('href')):
                    pass
                else:
                    url = 'http://www.bailii.org'+val.get('href')
                    cases.append(url)
            except:
                cases.append('')
    counter += 1
    
    # Cleaning and creating dic
    cases.remove('http://www.bailii.org/form/search_cases.html')
    cases.remove('http://www.bailii.org/bailii/help/')
    key_cases[counter] = cases[1:]

In [21]:
# Formatting dictionnary to pandas dataframe and joining to current df
case_index = []
key_cases_list = []
link = []

for keys, values in key_cases.items():
    for values in key_cases[keys]:
        # itterating trough values and adding to seperate list + filling empty values
        case_index.append(keys)
        if 'http' in values:
            
            link.append(values)
            key_cases_list.append(saver)
            
        else:
            key_cases_list.append(values)
            link.append('No link available')
        
        saver = values

d = {'case_index':case_index,'key_cases_list':key_cases_list, 'link':link}

# Create pandas data frame + counting duplicates + cleaning
key_cases_dataframe = pd.DataFrame(d)

In [22]:
# Counting number of cases cited several times and removing duplicate values
key_cases_dataframe['nr_citation'] = key_cases_dataframe['key_cases_list'].apply(key_cases_dataframe['key_cases_list'].tolist().count)
key_cases_dataframe.sort_values(by=['link'], inplace=True, ascending=False)
key_cases_dataframe = key_cases_dataframe.drop_duplicates(subset=['case_index','key_cases_list','link'], keep='first')
key_cases_dataframe.sort_values(by=['case_index'], inplace=True, ascending=True)


In [23]:
# String formatting 
key_cases_dataframe['nr_citation'] = key_cases_dataframe['nr_citation'].apply(lambda x: '{'+str(x)+'}')

# Cleaning of data frame 
key_cases_dataframe['case_cited'] = key_cases_dataframe['key_cases_list'] + ' // ' + key_cases_dataframe['nr_citation'] + ' ' + key_cases_dataframe['link']
key_cases_dataframe = key_cases_dataframe.drop(columns=['key_cases_list','link','nr_citation'])

# creation of dictionary of list based on case_index
case_cited_dict = {}
temp_list = []
counter = 0

for i in range(0,10):
    temp_list = []
    for case in key_cases_dataframe.iterrows():
        if i == case[1][0]:
            temp_list.append(case[1][1])
        else:
            case_cited_dict[i] = temp_list
                   
df['cited_cases'] = df.index.map(case_cited_dict)

In [37]:
df.to_csv('10 most recent case data extract.csv')