In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Base URL
base_url = "https://www.sprm.gov.my/index.php?r=site%2Findex&page_id=96&language=en&page=1&per-page=8"

all_dataframes = []

# Function to get the next page URL
def get_next_page_url(soup):
    next_page = soup.find('a', string='»')
    if next_page and 'href' in next_page.attrs:
        return "https://www.sprm.gov.my" + next_page['href']
    return None

# Function to extract the images
def extract_image_urls(soup):
    images = soup.find_all('img', src=lambda x: x and 'uploads/pesalah' in x)
    return ["https://www.sprm.gov.my/" + img['src'] for img in images]

#scrapping all the pages from the website
current_url = base_url
page_num = 1

while current_url:
    print(f"Scraping page {page_num}")
    response = requests.get(current_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    data = soup.select('table')
    
    if data:
        data_frames = pd.read_html(str(data))
        page_image_links = extract_image_urls(soup)
        for df in data_frames:
            # Duplicate the list of image URLs to match the number of rows
            if len(page_image_links) == len(df):
                df['Image_URL'] = page_image_links
            elif len(page_image_links) > 0:
                df['Image_URL'] = page_image_links[0]  # If not matched, assign first URL to all rows
            all_dataframes.append(df)
    
    current_url = get_next_page_url(soup)
    page_num += 1

# Final Dataframe
df_merged = pd.concat(all_dataframes, axis=0, ignore_index=True)

# Saving the DataFrame to a CSV file
df_merged.to_csv('2024-07-01_sprm_data.csv', index=False)

# Printing the final DataFrame
print(df_merged)

# Download Images
image_links = df_merged['Image_URL'].unique()
if not os.path.exists('sprm_pesalah'):
    os.makedirs('sprm_pesalah')

for idx, link in enumerate(image_links):
    img_data = requests.get(link).content
    with open(f'sprm_pesalah/{link.split("/")[-1]}', 'wb') as handler:
        handler.write(img_data)

print("Images have been downloaded and saved in the 'sprm_pesalah' folder.")

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
                        0                              1  \
0                 Accused          Abd Rasid bin Mohamad   
1      Identification No.                   70040712XXXX   
2                  Gender                           Male   
3   

In [7]:
def reorganize_dataframe(df):
    organized_data = []
    current_accused = {}

    for index, row in df.iterrows():
        if row[0] == 'Accused':
            if current_accused:
                organized_data.append(current_accused)
            current_accused = {'Cases': [], 'Image_URL': row['Image_URL']}
        if pd.notna(row[0]):
            if row[0] == '#':
                current_accused['Cases'].append({
                    'No Kes': row[1],
                    'Ringkasan Pertuduhan': row[2],
                    'Kesalahan': row[3],
                    'Hukuman': row[4]
                })
            else:
                current_accused[row[0]] = row[1]

    if current_accused:
        organized_data.append(current_accused)

    final_data = []
    for data in organized_data:
        temp_dict = {key: value for key, value in data.items() if key != 'Cases'}
        cases = data.get('Cases', [])
        for i, case in enumerate(cases, start=1):
            for k, v in case.items():
                if pd.notna(v):
                    temp_dict[f'Case {i} {k}'] = v
        final_data.append(temp_dict)

    final_df = pd.DataFrame(final_data)
    return final_df

# Reorganizing Data
final_df = reorganize_dataframe(df_merged)

# Save the DataFrame to a CSV file
final_df.to_csv('2024-07-01_sprm_data_reorganized.csv', index=False)

# Print the final DataFrame
final_df

Unnamed: 0,Image_URL,Accused,Identification No.,Gender,Nationality,State,Type,Employer,Position,Court,Judge,Deputy Public Prosecutor/Prosecuting Officer,Defense Counsel,Previous Conviction,Date of Sentence,Appeal
0,https://www.sprm.gov.my/admin/uploads/pesalah/...,Abd Rasid bin Mohamad,70040712XXXX,Male,Malaysia,Sabah,Penjawat Awam,Kementerian Pelajaran Malaysia,Guru Besar SK Kolapis,,,1. PO Mohd Faliq bin Basirudin 2. PO Dzulkarna...,1. Marzuki Spawi,,2024-06-27,
1,https://www.sprm.gov.my/admin/uploads/pesalah/...,Shaharuddin bin Ahmad,79082508XXXX,Male,Malaysia,W.P Kuala Lumpur,Orang Awam,Swasta,Pengarah Syarikat,,,1. TPR Irna Julieza binti Maaras 2. PO Afiqah ...,,,2024-06-26,
2,https://www.sprm.gov.my/admin/uploads/pesalah/...,Ahmad Jefri Azizi bin Mohamad Sukri,80021703XXXX,Male,Malaysia,Kelantan,Penjawat Awam,Polis Diraja Malaysia,Koperal,,,1. TPR Tengku Nurul Haziqah binti Tuan Yacob,,,2024-06-23,
3,https://www.sprm.gov.my/admin/uploads/pesalah/...,Ameyrudin bin Ahmad Zuki,81081914XXXX,Male,Malaysia,Kedah,Penjawat Awam,Polis Diraja Malaysia,ASP,,,1. TPR Allan Suman Pillai 2. TPR Maziah binti ...,1. Yoegeswaran,,2024-06-20,
4,https://www.sprm.gov.my/admin/uploads/pesalah/...,Roney Saimey bin Sakah,80081112XXXX,Male,Malaysia,Sabah,Penjawat Awam,Polis Diraja Malaysia,Inspektor,,,1. TPR Mohd Faliq bin Basirudin 2. TPR Michael...,1. Salina Fadzil & CO 2. Chang & Kamarudin 3. ...,,2024-06-20,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,https://www.sprm.gov.my/admin/uploads/pesalah/...,Noor Azma binti Azmin,820623-03-XXXX,Female,Malaysia,W.P Kuala Lumpur,Penjawat Awam,Jabatan Imigresen Malaysia,Pegawai Imigresen,,,1. TPR Mohd 'Afif bin Ali,1. Norliana Ali Othman,,2021-08-11,
350,https://www.sprm.gov.my/admin/uploads/pesalah/...,Teoh Peng Sin,661215-07-XXXX,Male,Malaysia,Selangor,Orang Awam,Bekerja Sendiri,Bekerja Sendiri,,,1. TPR Fadhli bin Ab Wahab 2. TPR Maziah binti...,1. Dato’ Hj Hanif Hassan (Hanif Hassan & Co.),,2021-08-05,
351,https://www.sprm.gov.my/admin/uploads/pesalah/...,Shahrul Izuan bin Mohd Zin,801212-05-XXXX,Male,Malaysia,Selangor,Penjawat Awam,Suruhanjaya Pencegahan Rasuah Malaysia,Penguasa,,,1. TPR Fadhli bin Ab Wahab 2. TPR Maziah binti...,1. Dato’ Hj Hanif Hassan (Hanif Hassan & Co.),,2021-08-05,
352,https://www.sprm.gov.my/admin/uploads/pesalah/...,Tengku Kamarul Ariffin Bin Raja Jalil,790812-06-XXXX,Male,Malaysia,W.P Kuala Lumpur,Penjawat Awam,Majlis Perbandaran Selayang,Juruukur Bahan,,,1. TPR Mohd Aliff bin Shahruzaman 2. PP Tuan A...,1. Encik Chris Kooi Wei Kit (Tetuan Kit & Asso...,,2021-08-04,


In [8]:
# Conversion to a Json File
json_data = final_df.to_json(orient='records', indent=4)

# Saving Json File
with open('2024-07-01_sprm_data.json', 'w') as json_file:
    json_file.write(json_data)



[
    {
        "Image_URL":"https:\/\/www.sprm.gov.my\/admin\/uploads\/pesalah\/abd-rasid-bin-mohamad-09072024.jpg",
        "Accused":"Abd Rasid bin Mohamad",
        "Identification No.":"70040712XXXX",
        "Gender":"Male",
        "Nationality":"Malaysia",
        "State":"Sabah",
        "Type":"Penjawat Awam",
        "Employer":"Kementerian Pelajaran Malaysia",
        "Position":"Guru Besar SK Kolapis",
        "Court":null,
        "Judge":null,
        "Deputy Public Prosecutor\/Prosecuting Officer":"1. PO Mohd Faliq bin Basirudin 2. PO Dzulkarnain Rousan bin Hasbi Hasbi",
        "Defense Counsel":"1. Marzuki Spawi",
        "Previous Conviction":null,
        "Date of Sentence":"2024-06-27",
        "Appeal":null
    },
    {
        "Image_URL":"https:\/\/www.sprm.gov.my\/admin\/uploads\/pesalah\/abd-rasid-bin-mohamad-09072024.jpg",
        "Accused":"Shaharuddin bin Ahmad",
        "Identification No.":"79082508XXXX",
        "Gender":"Male",
        "Nationality":"Ma