In [1]:
import torch
import pandas as pd
import numpy as np
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


from transformers import BertTokenizer, AutoModel

from Preprocessing import preprocess_text

# os.chdir('C:/Users/LENOVO/GitHub/Jurnal-Clustering')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class BertClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-multilingual-cased')
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 300),
            nn.ReLU(),
            nn.Linear(300, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs['last_hidden_state'][:, 0, :]
        x = self.classifier(x)
        return x

In [3]:
from torch.utils.data import Dataset, DataLoader

# Buat dataloader
class ArticleDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx]
        }

    def __len__(self):
        return len(self.input_ids)


In [4]:
def tokenize_data(texts, tokenizer, max_length=256):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            truncation=True,
                            max_length=max_length,
                            pad_to_max_length=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [5]:
df = pd.read_csv('data/extracted_publication_journal_s1.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

df


Unnamed: 0,journal,title,abstract
0,25024760,Localization schemes in Underwater Sensor Netw...,<jats:p>&lt;p&gt;Underwater Wireless Sensor Ne...
1,25024760,A Review on Voltage Balancing Solutions in Mul...,<jats:p>&lt;p&gt;Multilevel inverters are used...
2,25024760,Collision Detection and Trajectory Planning fo...,<jats:p>This paper proposes an algorithm for C...
3,25024760,A New Method for Optimal Coordination of Overc...,<jats:p>&lt;p&gt;The most of the new protectiv...
4,25024760,Fairness Evaluation and Comparison of Current ...,<jats:p>Transmission Control Protocol (TCP) is...
...,...,...,...
79988,25409581,Hesperitin Synergistically Promotes the Senesc...,"<jats:p>Pentagamavunone-1 (PGV-1), a curcumin ..."
79989,25409581,First Report on Wild Occurrences of Phoenix Mu...,<jats:p>The genus Pleurotus is known as a comm...
79990,25409581,Nannoplankton Biostratigraphy from Banggai-Sul...,<jats:p>The nannoplankton research was conduct...
79991,25409581,Spatial Modelling Habitat Suitability of Javan...,<jats:p>Javan Langur (T. auratus) is well-know...


In [6]:
cwd = os.getcwd()

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertClassifier(12)
model.load_state_dict(torch.load('model/finetuned_model_sinta_translated.pt'))

# Membuat dataloader
batch_size = 32


In [9]:
for jour in df.journal.unique():

    journal_type = 'sinta_new_data_s1_new'

    # Assuming 'jurnal_id' is a variable containing the directory name
    file_path = os.path.join('src', journal_type, jour)

    if not os.path.exists(file_path):
        os.mkdir(file_path)

        data = df[df['journal'] == jour]
        data['data_cleaned'] = data['title'] + data['abstract'] 
        data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))
        X = list(data['data_cleaned'])

        input_ids, attention_masks = tokenize_data(X, tokenizer)
        dataset = ArticleDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        device = 'cpu'
        if torch.cuda.is_available() :
            device = 'cuda'

        model.to(device)

        # Set model ke mode evaluasi (non-training)
        model.eval()

        # Embedding
        embeddings = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                embeddings.append(outputs.cpu().numpy())

        embeddings = np.concatenate(embeddings, axis=0)

        # Mengubah array embeddings menjadi matriks dua dimensi
        X = embeddings.reshape(embeddings.shape[0], -1)

        pca = PCA(n_components=2, random_state=0)
        X = pca.fit_transform(X)

        # Perform KMeans clustering
        num_clusters = 1
        kmeans = KMeans(n_clusters=num_clusters, random_state=0, max_iter=1000)
        kmeans.fit(X)

        # Assign each journal to its cluster
        cluster_labels = kmeans.labels_

        # Mendapatkan koordinat pusat cluster
        centroid = kmeans.cluster_centers_

        # Menghitung jarak antara setiap titik data dengan centroid
        jarak_ke_centroid = np.sqrt(np.sum((X - centroid)**2, axis=1))

        # Menentukan batas jarak yang dianggap sebagai "outscoop"
        outscoop_threshold = np.mean(jarak_ke_centroid) + 2 * np.std(jarak_ke_centroid)

        # Memisahkan data yang masih masuk dalam "scoop" dan "outscoop"
        scoop_data = X[jarak_ke_centroid <= outscoop_threshold]
        outscoop_data = X[jarak_ke_centroid > outscoop_threshold]

        scoop_labels = np.ones(len(X))
        scoop_labels[jarak_ke_centroid > outscoop_threshold] = -1

        filename_kmeans = f"{file_path}/{jour}_kmeans.pkl"
        joblib.dump(kmeans, filename_kmeans)

        np.save(f"{file_path}/{jour}_threshold.npy", outscoop_threshold)
        np.save(f"{file_path}/{jour}_pca_data.npy", X)
        np.save(f"{file_path}/{jour}_bert_data.npy", embeddings.reshape(embeddings.shape[0], -1))

        df_res = pd.DataFrame({'Data': data['data_cleaned'],
                   'Label': scoop_labels})

        inScoop_df = df_res[df_res['Label'] == 1]
        outScoop_df = df_res[df_res['Label'] == -1]

        df_res.to_csv(f'{file_path}/{jour}_data_jurnal.csv')
        inScoop_df.to_csv(f'{file_path}/{jour}_inscoop_data_jurnal.csv')
        outScoop_df.to_csv(f'{file_path}/{jour}_outscoop_data_jurnal.csv')

        print("Data sebaran PCA {} telah disimpan.".format(jour))
    else:
        pass

    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providi

Data sebaran PCA 25024760 telah disimpan.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))


Data sebaran PCA 23387238 telah disimpan.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))


Data sebaran PCA 2442-8620 telah disimpan.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))


Data sebaran PCA 25278045 telah disimpan.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))


Data sebaran PCA 27224708 telah disimpan.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))


KeyboardInterrupt: 

In [71]:
src = '_s4'
df = pd.read_csv('data/data_sinta_raw_s4_full.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.jid = df.jid.apply(lambda x : str(x))

df

Unnamed: 0,jid,desc,title,date
0,10011,Penelitian ini bertujuan untuk menentukan meto...,Analisis Komponen Pertumbuhan Terhadap Pola Me...,2016-12-06
1,10011,Tujuan dari penelitian ini adalah untuk menget...,Analisis Laju Sedimentasi di Saluran Intake Ir...,2016-12-07
2,10011,Tujuan dari penelitian ini adalah untuk menget...,Optimalisasi Proses Adsorbsi Biji Kelor Untuk ...,2016-12-07
3,10011,Penelitian ini dilakukan untuk mengetahui peng...,Tinjauan Nafsu Makan dan Sintasan Ikan Gurami ...,2016-06-07
4,10011,Penelitian ini bertujuan untuk 1) menganalisis...,Analisis Biaya dan Kelayakan Usaha Penggilinga...,2016-12-07
...,...,...,...,...
365019,9998,Cutibacterium speciesis a member of the skin m...,Cutibacterium species: An Underestimated Patho...,2024-06-30
365020,9998,The excessive use of synthetic fungicides has ...,Compatibility study of Trichoderma sp. with Ch...,2024-06-30
365021,9998,Glucanases are important industrial enzymes th...,Optimization and Characterization of Exo-β-Glu...,2024-07-18
365022,9998,"Based on the literature study, the demand for ...",Utilization of Tasikmadu Starfruit Waste Compo...,2024-06-30


In [72]:
df.jid.sample().values

array(['4695'], dtype=object)

In [73]:
df[df.jid == '10011']

Unnamed: 0,jid,desc,title,date
0,10011,Penelitian ini bertujuan untuk menentukan meto...,Analisis Komponen Pertumbuhan Terhadap Pola Me...,2016-12-06
1,10011,Tujuan dari penelitian ini adalah untuk menget...,Analisis Laju Sedimentasi di Saluran Intake Ir...,2016-12-07
2,10011,Tujuan dari penelitian ini adalah untuk menget...,Optimalisasi Proses Adsorbsi Biji Kelor Untuk ...,2016-12-07
3,10011,Penelitian ini dilakukan untuk mengetahui peng...,Tinjauan Nafsu Makan dan Sintasan Ikan Gurami ...,2016-06-07
4,10011,Penelitian ini bertujuan untuk 1) menganalisis...,Analisis Biaya dan Kelayakan Usaha Penggilinga...,2016-12-07
...,...,...,...,...
209,10011,Penelitian ini bertujuan untuk mengetahui peng...,Pengaruh Pemberian Kompos Tandan Kosong Kelapa...,2024-06-23
210,10011,Pengumpanan bahan merupakan salah satu permasa...,Uji Kinerja Pengumpan Tipe Screw Conveyor pada...,2024-06-04
211,10011,Penelitian ini bertujuan untuk mengetahui pote...,Kandungan Nutrisi Limbah Buah Lai (Durio kutej...,2024-06-23
212,10011,"Dalam budidaya tanaman kehutanan, tanah merupa...",Morfologi Tanah Tegakan Jati di Kecamatan Sang...,2024-06-22


In [66]:
journal_type = 'sinta_scoop_sample'

jour = '6171'

# Assuming 'jurnal_id' is a variable containing the directory name
file_path = os.path.join('src', journal_type, jour)

if not os.path.exists(file_path):
    os.mkdir(file_path)

    data = df[df['jid'] == str(jour)]
    data['data_cleaned'] = data['title'] + data['desc'] 
    data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))
    X = list(data['data_cleaned'])

    # print(X, data)

    input_ids, attention_masks = tokenize_data(X, tokenizer)
    dataset = ArticleDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    device = 'cpu'
    if torch.cuda.is_available() :
        device = 'cuda'

    model.to(device)

    # Set model ke mode evaluasi (non-training)
    model.eval()

    # Embedding
    embeddings = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            embeddings.append(outputs.cpu().numpy())

    embeddings = np.concatenate(embeddings, axis=0)

    # Mengubah array embeddings menjadi matriks dua dimensi
    X = embeddings.reshape(embeddings.shape[0], -1)

    pca = PCA(n_components=2, random_state=0)
    X = pca.fit_transform(X)

    # Perform KMeans clustering
    num_clusters = 1
    kmeans = KMeans(n_clusters=num_clusters, random_state=0, max_iter=1000)
    kmeans.fit(X)

    # Assign each journal to its cluster
    cluster_labels = kmeans.labels_

    # Mendapatkan koordinat pusat cluster
    centroid = kmeans.cluster_centers_

    # Menghitung jarak antara setiap titik data dengan centroid
    jarak_ke_centroid = np.sqrt(np.sum((X - centroid)**2, axis=1))

    # Menentukan batas jarak yang dianggap sebagai "outscoop"
    outscoop_threshold = np.mean(jarak_ke_centroid) + 2 * np.std(jarak_ke_centroid)

    # Memisahkan data yang masih masuk dalam "scoop" dan "outscoop"
    scoop_data = X[jarak_ke_centroid <= outscoop_threshold]
    outscoop_data = X[jarak_ke_centroid > outscoop_threshold]

    scoop_labels = np.ones(len(X))
    scoop_labels[jarak_ke_centroid > outscoop_threshold] = -1

    filename_kmeans = f"{file_path}/{jour}_kmeans.pkl"
    joblib.dump(kmeans, filename_kmeans)

    np.save(f"{file_path}/{jour}_threshold.npy", outscoop_threshold)
    np.save(f"{file_path}/{jour}_pca_data.npy", X)
    np.save(f"{file_path}/{jour}_bert_data.npy", embeddings.reshape(embeddings.shape[0], -1))

    df_res = pd.DataFrame({'Data': data['data_cleaned'],
                'Label': scoop_labels})

    inScoop_df = df_res[df_res['Label'] == 1]
    outScoop_df = df_res[df_res['Label'] == -1]

    df_res.to_csv(f'{file_path}/{jour}_{src}_data_jurnal.csv')
    inScoop_df.to_csv(f'{file_path}/{jour}_inscoop_data_jurnal.csv')
    outScoop_df.to_csv(f'{file_path}/{jour}_outscoop_data_jurnal.csv')

    print("Data sebaran PCA {} telah disimpan.".format(jour))
else:
    pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['title'] + data['desc']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_cleaned'] = data['data_cleaned'].apply(lambda x : preprocess_text(x))


Data sebaran PCA 6171 telah disimpan.
