In [31]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl
from openpyxl import load_workbook

In [35]:

# Set URL parameters
universitas = 2058
page = 11
columns = ['SINTA_ID', 'NAMA']
df_master = pd.DataFrame(columns=columns)

## Scrape Authors

In [36]:


# Loop through pages to scrape data
for p in range(1, page + 1):
    url = f"https://sinta.kemdikbud.go.id/affiliations/authors/{universitas}?page={p}"
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract author names and SINTA IDs
        names = [element.get_text(strip=True) for element in soup.select(".profile-name")]
        ids = [element.get_text(strip=True).replace("ID : ", "") for element in soup.select("div.profile-id")]
        
        # Store the results in a DataFrame
        data = list(zip(ids, names))
        df_page = pd.DataFrame(data, columns=columns)
        df_master = pd.concat([df_master, df_page], ignore_index=True)
        
        print(f"Scraping page {p}")
    else:
        print(f"Failed to retrieve page {p}")

# Display first and last few rows of the DataFrame
print("\nFirst 5 rows:\n", df_master.head())
print("\nLast 5 rows:\n", df_master.tail())




Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11

First 5 rows:
   SINTA_ID                   NAMA
0  5986966             AJI SURAJI
1   161222                ISTIADI
2  6172418     AVIV YUNIAR RAHMAN
3  5978881  ANA SOPANAH SUPRIYADI
4   161208           FITRI MARISA

Last 5 rows:
     SINTA_ID                    NAMA
99   6647611              SURIANSYAH
100  5999217              SOLEHODDIN
101  6705239               ZAENUDDIN
102  6681992          ADILOKA SUJONO
103  6662504  DJOKO IMBAWANI ATMADJA


In [37]:
# Export the results to an Excel file
# df_master.to_excel("Master_Authors_394.xlsx", index=False)

# Load master authors data from Excel
# df_master = pd.read_excel("Master_Authors_394.xlsx")

# Initialize an empty DataFrame to store author details
df_authors = pd.DataFrame(columns=[
    'SINTA_ID', 'Nama', 'Universitas', 'Department', 'Subjects',
    'SINTA_Score_Overall', 'SINTA_Score_3Yr',
    'Affil_Score', 'Affil_Score_3Yr', 'Scopus', 'GScholar', 'WOS'
])


In [38]:
df_authors

Unnamed: 0,SINTA_ID,Nama,Universitas,Department,Subjects,SINTA_Score_Overall,SINTA_Score_3Yr,Affil_Score,Affil_Score_3Yr,Scopus,GScholar,WOS


In [39]:
import requests
from io import StringIO

In [40]:

# Loop through each author to scrape detailed data
for sinta_id in df_master['SINTA_ID']:
    url = f"https://sinta.kemdikbud.go.id/authors/profile/{sinta_id}"
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract author name, affiliation details, subjects, and scores
        nama_authors = soup.select_one("h3").get_text(strip=True)
        meta_profile = soup.select("div.meta-profile a")
        nama_univ = meta_profile[0].get_text(strip=True) if len(meta_profile) > 0 else None
        nama_dept = meta_profile[1].get_text(strip=True) if len(meta_profile) > 1 else None
        
        subject_list = soup.select("div.profile-subject.mt-3 ul.subject-list li")
        subjects = ", ".join([s.get_text(strip=True) for s in subject_list])
        
        stat_profile = soup.select("div.pr-num")
        SINTA_Score_Overall = stat_profile[0].get_text(strip=True) if len(stat_profile) > 0 else None
        SINTA_Score_3Yr = stat_profile[1].get_text(strip=True) if len(stat_profile) > 1 else None
        Affil_Score = stat_profile[2].get_text(strip=True) if len(stat_profile) > 2 else None
        Affil_Score_3Yr = stat_profile[3].get_text(strip=True) if len(stat_profile) > 3 else None
        
        # Extract data tables (Scopus, GScholar, WOS)
        table = soup.select_one("table")
        if table:
            # df_table = pd.read_html(str(table))[0]
            # Membungkus 'table' dalam StringIO
            df_table = pd.read_html(StringIO(str(table)))[0]
            data_Scopus = df_table.get('Scopus', None)
            data_GScholar = df_table.get('GScholar', None)
            data_WOS = df_table.get('WOS', None)
        else:
            data_Scopus, data_GScholar, data_WOS = None, None, None
        
        # Create a data row and append it to the DataFrame
        df_authors = pd.concat([df_authors, pd.DataFrame([[
            sinta_id, nama_authors, nama_univ, nama_dept, subjects,
            SINTA_Score_Overall, SINTA_Score_3Yr,
            Affil_Score, Affil_Score_3Yr, data_Scopus, data_GScholar, data_WOS
        ]], columns=df_authors.columns)], ignore_index=True)
        
        print(f"Scraping author profile {sinta_id}")
    else:
        print(f"Failed to retrieve author profile {sinta_id}")


Scraping author profile 5986966
Scraping author profile 161222
Scraping author profile 6172418
Scraping author profile 5978881
Scraping author profile 161208
Scraping author profile 5977179
Scraping author profile 5977530
Scraping author profile 5972684
Scraping author profile 5973160
Scraping author profile 5998828
Scraping author profile 5986025
Scraping author profile 6663726
Scraping author profile 5998680
Scraping author profile 6684776
Scraping author profile 6104467
Scraping author profile 6660008
Scraping author profile 6666868
Scraping author profile 5977277
Scraping author profile 5978136
Scraping author profile 5972680
Scraping author profile 161268
Scraping author profile 6727265
Scraping author profile 5987236
Scraping author profile 5972668
Scraping author profile 6738705
Scraping author profile 5989963
Scraping author profile 5990602
Scraping author profile 5985900
Scraping author profile 5982671
Scraping author profile 5972723
Scraping author profile 6648492
Scraping au

In [41]:
df_authors

Unnamed: 0,SINTA_ID,Nama,Universitas,Department,Subjects,SINTA_Score_Overall,SINTA_Score_3Yr,Affil_Score,Affil_Score_3Yr,Scopus,GScholar,WOS
0,5986966,AJI SURAJI,Universitas Widya Gama,S1 - Teknik Sipil,"Highway Engineering, Traffic Engineering and M...",2.326,1.289,0,0,0 12 1 36 2 8 3 4 4 0 5 ...,0 79 1 283 2 44 3 10 4 11 5...,0 4 1 15 2 4 3 2 4 0 5 ...
1,161222,ISTIADI,Universitas Widya Gama,S1 - Teknik Informatika,"Electronic and Information Technology, Expert ...",1.873,950,0,0,0 27 1 64 2 19 3 5 4 1 5 ...,0 86 1 386 2 63 3 10 4 10 5...,0 7 1 4 2 4 3 1 4 0 5 2 Name...
2,6172418,AVIV YUNIAR RAHMAN,Universitas Widya Gama,S1 - Teknik Informatika,"Image Processing, Computer Vision, Deep Learni...",1.438,810,0,0,0 32 1 128 2 21 3 8 4 6 5...,0 92 1 450 2 54 3 12 4 15 5...,0 6 1 8 2 6 3 2 4 0 5 2 Name...
3,5978881,ANA SOPANAH SUPRIYADI,Universitas Widya Gama,S1 - Akuntansi,Public Sector of Accounting,1.665,801,0,0,0 5 1 21 2 4 3 3 4 1 5 ...,0 89 1 573 2 57 3 13 4 20 5...,0 1 1 12 2 1 3 1 4 1 5 ...
4,161208,FITRI MARISA,Universitas Widya Gama,S1 - Teknik Informatika,"Computer Support learning, data mining",2.173,787,0,0,0 17 1 84 2 13 3 5 4 3 5 ...,0 175 1 1526 2 112 3 22 4 ...,0 9 1 26 2 6 3 3 4 1 5 ...
...,...,...,...,...,...,...,...,...,...,...,...,...
99,6647611,SURIANSYAH,Universitas Widya Gama,Unknown,Teknik Mesin,135,5,0,0,0 2 1 2 2 2 3 1 4 0 5 2 Name...,0 27 1 15 2 9 3 2 4 0 5 ...,0 0.0 1 0.0 2 0.0 3 NaN 4 NaN 5...
100,5999217,SOLEHODDIN,Universitas Widya Gama,S2 - Ilmu Hukum,Ilmu Hukum,61,4,0,0,0 0 1 0 2 0 3 0 4 0 5 0 Name...,0 10 1 37 2 5 3 3 4 1 5 ...,0 0.0 1 0.0 2 0.0 3 NaN 4 NaN 5...
101,6705239,ZAENUDDIN,Universitas Widya Gama,S1 - Akuntansi,Akuntansi,15,2,0,0,0 0 1 0 2 0 3 0 4 0 5 0 Name...,0 5 1 1 2 1 3 1 4 0 5 1 Name...,0 0.0 1 0.0 2 0.0 3 NaN 4 NaN 5...
102,6681992,ADILOKA SUJONO,Universitas Widya Gama,S1 - Ilmu Hukum,Bahasa Inggris,34,0,0,0,0 1 1 0 2 0 3 0 4 0 5 1 Name...,0 17 1 6 2 3 3 2 4 0 5 ...,0 0.0 1 0.0 2 0.0 3 NaN 4 NaN 5...


In [43]:

# Save the detailed author data to Excel
df_authors.to_excel("Detail_Authors.xlsx", index=False)
df_authors.to_csv("Detail_Authors.csv", index=False)
print("Data saved to Detail_Authors.xlsx")

Data saved to Detail_Authors.xlsx


In [44]:


# Membaca file Excel
# file_path = "Master_Authors_394.xlsx"
# df_master = pd.read_excel(file_path)

# Mengambil data SINTA_ID dari df_master
SINTA_ID = df_master['SINTA_ID'].tolist()


In [45]:
df_master

Unnamed: 0,SINTA_ID,NAMA
0,5986966,AJI SURAJI
1,161222,ISTIADI
2,6172418,AVIV YUNIAR RAHMAN
3,5978881,ANA SOPANAH SUPRIYADI
4,161208,FITRI MARISA
...,...,...
99,6647611,SURIANSYAH
100,5999217,SOLEHODDIN
101,6705239,ZAENUDDIN
102,6681992,ADILOKA SUJONO


In [46]:

# Membuat DataFrame kosong
columns = ["SINTA_ID", "Author_Name", "University", "Department", "Subjects", "SINTA_Score_Overall", 
           "SINTA_Score_3Yr", "Affil_Score", "Affil_Score_3Yr", "Scopus", "GScholar", "WOS"]
df_authors = pd.DataFrame(columns=columns)


In [47]:
df_authors

Unnamed: 0,SINTA_ID,Author_Name,University,Department,Subjects,SINTA_Score_Overall,SINTA_Score_3Yr,Affil_Score,Affil_Score_3Yr,Scopus,GScholar,WOS


In [48]:

# Loop untuk scraping data dari masing-masing SINTA_ID
for i in SINTA_ID:
    url = f"https://sinta.kemdikbud.go.id/authors/profile/{i}"
    response = requests.get(url)
    sinta = BeautifulSoup(response.text, 'html.parser')

    # Scrape Author Name
    nama_authors = sinta.find("h3").get_text(strip=True) if sinta.find("h3") else ""

    # Scrape Meta Profile
    meta_profile = sinta.find_all("div", class_="meta-profile")
    meta_links = meta_profile[0].find_all("a") if meta_profile else []
    nama_univ = meta_links[0].get_text(strip=True) if len(meta_links) > 0 else ""
    nama_dept = meta_links[1].get_text(strip=True) if len(meta_links) > 1 else ""
    sinta_id = meta_links[2].get_text(strip=True).replace("SINTA ID : ", "") if len(meta_links) > 2 else ""

    # Scrape Subject List
    subject_list = sinta.find("div", class_="profile-subject mt-3")
    subject_items = subject_list.find_all("li") if subject_list else []
    subject_list_text = ", ".join([item.get_text(strip=True) for item in subject_items])

    # Scrape Stat Profile
    stat_profile = sinta.find_all("div", class_="pr-num")
    SINTA_Score_Overall = stat_profile[0].get_text(strip=True) if len(stat_profile) > 0 else ""
    SINTA_Score_3Yr = stat_profile[1].get_text(strip=True) if len(stat_profile) > 1 else ""
    Affil_Score = stat_profile[2].get_text(strip=True) if len(stat_profile) > 2 else ""
    Affil_Score_3Yr = stat_profile[3].get_text(strip=True) if len(stat_profile) > 3 else ""

    # Scrape Table Data (Scopus, GScholar, WOS)
    table = sinta.find("table")
    data_Scopus, data_GScholar, data_WOS = "", "", ""
    if table:
        table_data = pd.read_html(StringIO(str(table)))[0]
        # df_table = pd.read_html(StringIO(str(table)))[0]

        data_Scopus = table_data['Scopus'].iloc[0] if 'Scopus' in table_data.columns else ""
        data_GScholar = table_data['GScholar'].iloc[0] if 'GScholar' in table_data.columns else ""
        data_WOS = table_data['WOS'].iloc[0] if 'WOS' in table_data.columns else ""

    # Menyimpan data ke DataFrame
    row_data = [sinta_id, nama_authors, nama_univ, nama_dept, subject_list_text, SINTA_Score_Overall, SINTA_Score_3Yr,
                Affil_Score, Affil_Score_3Yr, data_Scopus, data_GScholar, data_WOS]
    df_authors.loc[len(df_authors)] = row_data

    # Print status scraping
    print(f"scraping page {i}")


scraping page 5986966
scraping page 161222
scraping page 6172418
scraping page 5978881
scraping page 161208
scraping page 5977179
scraping page 5977530
scraping page 5972684
scraping page 5973160
scraping page 5998828
scraping page 5986025
scraping page 6663726
scraping page 5998680
scraping page 6684776
scraping page 6104467
scraping page 6660008
scraping page 6666868
scraping page 5977277
scraping page 5978136
scraping page 5972680
scraping page 161268
scraping page 6727265
scraping page 5987236
scraping page 5972668
scraping page 6738705
scraping page 5989963
scraping page 5990602
scraping page 5985900
scraping page 5982671
scraping page 5972723
scraping page 6648492
scraping page 5981061
scraping page 5978109
scraping page 6695241
scraping page 5974912
scraping page 5990635
scraping page 6808233
scraping page 259792
scraping page 5983146
scraping page 6007790
scraping page 5983346
scraping page 5998823
scraping page 6833552
scraping page 5979034
scraping page 6064308
scraping page 

In [50]:

# Menyimpan hasil scraping ke file CSV atau Excel
df_authors.to_csv("authors_data2.csv", index=False)
df_authors.to_excel("authors_data2.xlsx", index=False)


## Scrape Journals

In [51]:
# Membuat DataFrame kosong untuk menyimpan data jurnal
columns = ["Judul Jurnal", "Lokasi Penerbit Jurnal", "Profile ID"]
df_jurnal = pd.DataFrame(columns=columns)

# Loop untuk scraping data dari setiap halaman
for p in range(1, page + 1):
    url = f"https://sinta.kemdikbud.go.id/journals/index/{universitas}?page={p}"
    response = requests.get(url)
    sinta = BeautifulSoup(response.text, 'html.parser')

    # Scrape data jurnal
    judul_jurnal = [judul.get_text(strip=True) for judul in sinta.select(".affil-name")]
    lokasi_penerbit_jurnal = [lokasi.get_text(strip=True) for lokasi in sinta.select(".affil-loc")]
    profile_id = [profile.get_text(strip=True) for profile in sinta.select(".profile-id")]

    # Menggabungkan data yang di-scrape menjadi DataFrame sementara dan menambahkannya ke df_jurnal
    output = pd.DataFrame({
        "Judul Jurnal": judul_jurnal,
        "Lokasi Penerbit Jurnal": lokasi_penerbit_jurnal,
        "Profile ID": profile_id
    })
    df_jurnal = pd.concat([df_jurnal, output], ignore_index=True)

    print(f"scraping page {p}")

# Menampilkan data yang telah di-scrape
print(df_jurnal.head())

# Menyimpan data ke dalam file Excel
df_jurnal.to_excel("Detail_Journals.xlsx", index=False)


scraping page 1
scraping page 2
scraping page 3
scraping page 4
scraping page 5
scraping page 6
scraping page 7
scraping page 8
scraping page 9
scraping page 10
scraping page 11
                                        Judul Jurnal  \
0          Journal of Socioeconomics and Development   
1  Journal of Information Technology and Computer...   
2                       Widya Yuridika: Jurnal Hukum   
3                AGRIKA : Jurnal Ilmu-Ilmu Pertanian   
4         JOURNAL OF SCIENCE AND APPLIED ENGINEERING   

                          Lokasi Penerbit Jurnal  \
0    Badan Penerbit Universitas Widyagama Malang   
1                          Universitas Widyagama   
2  Fakultas Hukum, Universitas Widya Gama Malang   
3              Badan Penerbitan Widyagama Malang   
4                                         BP UWG   

                                          Profile ID  
0  P-ISSN : 26156946 |\n                         ...  
1  P-ISSN : 25413619 |\n                         ...  
2  P-IS