In [None]:
import torch
import pandas as pd
import numpy as np
import joblib   
import os   
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


from transformers import BertTokenizer, AutoModel

from Preprocessing import preprocess_text

# os.chdir('C:/Users/LENOVO/GitHub/Jurnal-Clustering')

In [None]:
class BertClassifier(nn.Module):
    """Pretrained multilingual BERT encoder followed by an MLP head.

    The head maps the encoder's hidden size through 300, 100 and 50 units
    (ReLU after each) to ``num_labels`` outputs.

    Args:
        num_labels: size of the final linear layer's output.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-multilingual-cased')

        # Widths of the head, from the encoder output down to the last hidden
        # layer. Modules are appended in the order Linear, ReLU, ..., Linear so
        # that their positions inside the Sequential (0..6) stay the same.
        widths = [self.bert.config.hidden_size, 300, 100, 50]
        head = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            head.append(nn.Linear(n_in, n_out))
            head.append(nn.ReLU())
        head.append(nn.Linear(widths[-1], num_labels))
        self.classifier = nn.Sequential(*head)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and classify the vector at sequence position 0.

        Position 0 is presumably the [CLS] token when the inputs come from a
        BERT tokenizer with special tokens enabled.

        Returns:
            The output of the classification head for each item in the batch.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded['last_hidden_state'][:, 0, :]
        return self.classifier(first_token_state)

In [None]:
from torch.utils.data import Dataset, DataLoader

# Dataset used to build the DataLoader
class ArticleDataset(Dataset):
    """Map-style dataset pairing each token-id sequence with its attention mask.

    Args:
        input_ids: indexable collection of token-id sequences.
        attention_masks: indexable collection of attention masks, indexed with
            the same positions as ``input_ids``.
    """

    def __init__(self, input_ids, attention_masks):
        # Both collections are stored as given; no copy or validation is done.
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        """Return the number of samples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with keys 'input_ids' and 'attention_mask'."""
        ids = self.input_ids[idx]
        mask = self.attention_masks[idx]
        return {'input_ids': ids, 'attention_mask': mask}


In [None]:
def tokenize_data(texts, tokenizer, max_length=256):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Each text is encoded on its own with special tokens added, truncated to
    ``max_length`` and padded up to ``max_length``, then the per-text rows are
    stacked along dimension 0.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_length: length every sequence is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``. With a tokenizer that returns
        one ``(1, max_length)`` row per text (as Hugging Face tokenizers do
        with ``return_tensors='pt'``), both have shape
        ``(len(texts), max_length)``. For empty ``texts`` both are empty
        ``(0, max_length)`` long tensors.
    """
    id_rows = []
    mask_rows = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            # Explicit replacements for the deprecated `pad_to_max_length=True`.
            # Truncation used to be enabled only through the tokenizer's
            # backward-compatibility fallback; without it, rows of different
            # widths would make torch.cat below fail.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        id_rows.append(encoded['input_ids'])
        mask_rows.append(encoded['attention_mask'])

    # torch.cat rejects an empty list, so return well-formed empty tensors.
    if not id_rows:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    input_ids = torch.cat(id_rows, dim=0)
    attention_masks = torch.cat(mask_rows, dim=0)

    return input_ids, attention_masks

In [None]:
# Load the publication table, then drop rows with any missing value and
# rows that are exact duplicates of an earlier row.
df = (
    pd.read_csv('data/extracted_publication_journal_s2.csv')
    .dropna()
    .drop_duplicates()
)

# Last expression of the cell: displays the cleaned frame.
df


In [None]:
cwd = os.getcwd()

# Tokenizer for the same pretrained checkpoint name that BertClassifier uses
# for its encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Classifier with 12 output labels, restored from the fine-tuned weights.
model = BertClassifier(12)
# map_location='cpu' lets a checkpoint that was saved from a CUDA device be
# restored on a CPU-only machine; the model is moved to the selected device
# later, right before inference.
model.load_state_dict(
    torch.load('model/finetuned_model_sinta_translated.pt', map_location='cpu')
)

# Batch size used when building the DataLoader
batch_size = 32


In [None]:
# For every journal: embed its articles with the fine-tuned model, project the
# vectors to 2-D with PCA, fit a single-centroid KMeans, flag articles whose
# distance to the centroid exceeds mean + 2 * std as "outscoop", and save all
# artifacts under src/<journal_type>/<journal>/.
journal_type = 'sinta_new_data_s2'

# Device selection and evaluation mode do not depend on the journal, so they
# are set once instead of on every iteration.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()  # evaluation (non-training) mode

for jour in df.journal.unique():

    # str() keeps os.path.join working if pandas parsed the journal
    # identifiers as numbers; for string identifiers it changes nothing.
    file_path = os.path.join('src', journal_type, str(jour))

    # The file written last doubles as the "journal finished" marker. Checking
    # it instead of the directory means a journal whose processing crashed
    # half-way is retried on the next run rather than skipped forever.
    done_marker = f'{file_path}/{jour}_outscoop_data_jurnal.csv'
    if os.path.exists(done_marker):
        continue

    # .copy() so the new column is added to an independent frame rather than
    # to a slice of df (avoids pandas' chained-assignment warning).
    data = df[df['journal'] == jour].copy()

    # PCA(n_components=2) cannot be fitted on fewer than two samples.
    if len(data) < 2:
        print("Skipping journal {}: need at least 2 articles for PCA, found {}.".format(jour, len(data)))
        continue

    # NOTE(review): title and abstract are concatenated without a separator,
    # so the last title word fuses with the first abstract word. Left as is
    # because the fine-tuned model may have been trained on text built the
    # same way -- confirm before changing.
    data['data_cleaned'] = (data['title'] + data['abstract']).apply(preprocess_text)
    texts = list(data['data_cleaned'])

    token_ids, attention_masks = tokenize_data(texts, tokenizer)
    dataset = ArticleDataset(token_ids, attention_masks)
    # shuffle=False keeps the output rows aligned with the rows of `data`.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Embedding: the model's output vectors are used as article embeddings.
    embeddings = []

    with torch.no_grad():
        for batch in dataloader:
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)

            outputs = model(batch_ids, attention_mask=batch_mask)
            embeddings.append(outputs.cpu().numpy())

    embeddings = np.concatenate(embeddings, axis=0)

    # Flatten the embeddings into a two-dimensional (n_articles, n_features) matrix
    bert_features = embeddings.reshape(embeddings.shape[0], -1)

    pca = PCA(n_components=2, random_state=0)
    X = pca.fit_transform(bert_features)

    # Perform KMeans clustering (one cluster: the centroid of the journal)
    num_clusters = 1
    kmeans = KMeans(n_clusters=num_clusters, random_state=0, max_iter=1000)
    kmeans.fit(X)

    # Cluster assignment of each article
    cluster_labels = kmeans.labels_

    # Coordinates of the cluster centre
    centroid = kmeans.cluster_centers_

    # Euclidean distance between every data point and the centroid
    jarak_ke_centroid = np.sqrt(np.sum((X - centroid)**2, axis=1))

    # Distance limit beyond which a point is considered "outscoop"
    outscoop_threshold = np.mean(jarak_ke_centroid) + 2 * np.std(jarak_ke_centroid)

    # Split the points into those inside the "scoop" and those outside it.
    # These two arrays are not saved below; they are kept for inspection.
    scoop_data = X[jarak_ke_centroid <= outscoop_threshold]
    outscoop_data = X[jarak_ke_centroid > outscoop_threshold]

    # Label 1 = inside the scoop, -1 = outside
    scoop_labels = np.ones(len(X))
    scoop_labels[jarak_ke_centroid > outscoop_threshold] = -1

    # Create the output directory (and any missing parents) only once there
    # are results to write.
    os.makedirs(file_path, exist_ok=True)

    filename_kmeans = f"{file_path}/{jour}_kmeans.pkl"
    joblib.dump(kmeans, filename_kmeans)

    np.save(f"{file_path}/{jour}_threshold.npy", outscoop_threshold)
    np.save(f"{file_path}/{jour}_pca_data.npy", X)
    np.save(f"{file_path}/{jour}_bert_data.npy", bert_features)

    df_res = pd.DataFrame({'Data': data['data_cleaned'],
                           'Label': scoop_labels})

    inScoop_df = df_res[df_res['Label'] == 1]
    outScoop_df = df_res[df_res['Label'] == -1]

    df_res.to_csv(f'{file_path}/{jour}_data_jurnal.csv')
    inScoop_df.to_csv(f'{file_path}/{jour}_inscoop_data_jurnal.csv')
    # Written last on purpose: its presence marks the journal as finished.
    outScoop_df.to_csv(done_marker)

    print("Data sebaran PCA {} telah disimpan.".format(jour))

    


In [None]:
# data = df[df['journal'] == '25024760']
# data['data_cleaned'] = data['title'] + data['abstract'] 
# data['data_cleaned'].apply(lambda x : preprocess_text(x))
# X = list(data['data_cleaned'])

# input_ids, attention_masks = tokenize_data(X, tokenizer)
# dataset = ArticleDataset(input_ids, attention_masks)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
# device = 'cpu'
# if torch.cuda.is_available() :
#     device = 'cuda'

# model.to(device)

# # Set model ke mode evaluasi (non-training)
# model.eval()

# # Embedding
# embeddings = []

# with torch.no_grad():
#     for batch in dataloader:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask)
#         embeddings.append(outputs.cpu().numpy())

# embeddings = np.concatenate(embeddings, axis=0)

In [None]:




# # Mengubah array embeddings menjadi matriks dua dimensi
# X = embeddings.reshape(embeddings.shape[0], -1)

# pca = PCA(n_components=2, random_state=0)
# X = pca.fit_transform(X)

# # Perform KMeans clustering
# num_clusters = 1
# kmeans = KMeans(n_clusters=num_clusters, random_state=0, max_iter=1000)
# kmeans.fit(X)

# # Assign each journal to its cluster
# cluster_labels = kmeans.labels_

# # Mendapatkan koordinat pusat cluster
# centroid = kmeans.cluster_centers_

# # Menghitung jarak antara setiap titik data dengan centroid
# jarak_ke_centroid = np.sqrt(np.sum((X - centroid)**2, axis=1))

# # Menentukan batas jarak yang dianggap sebagai "outscoop"
# outscoop_threshold = np.mean(jarak_ke_centroid) + 2 * np.std(jarak_ke_centroid)

# # Memisahkan data yang masih masuk dalam "scoop" dan "outscoop"
# scoop_data = X[jarak_ke_centroid <= outscoop_threshold]
# outscoop_data = X[jarak_ke_centroid > outscoop_threshold]

# scoop_labels = np.ones(len(X))
# scoop_labels[jarak_ke_centroid > outscoop_threshold] = -1

# filename_kmeans = f"{file_path}/{jour}_kmeans.pkl"
# joblib.dump(kmeans, filename_kmeans)

# np.save(f"{file_path}/{jour}_threshold.npy", outscoop_threshold)
# np.save(f"{file_path}/{jour}_pca_data.npy", X)
# np.save(f"{file_path}/{jour}_bert_data.npy", embeddings.reshape(embeddings.shape[0], -1))

# df_res = pd.DataFrame({'Data': data['data_cleaned'],
#             'Label': scoop_labels})

# inScoop_df = df_res[df_res['Label'] == 1]
# outScoop_df = df_res[df_res['Label'] == -1]

# df_res.to_csv(f'{file_path}/{jour}_data_jurnal.csv')
# inScoop_df.to_csv(f'{file_path}/{jour}_inscoop_data_jurnal.csv')
# outScoop_df.to_csv(f'{file_path}/{jour}_outscoop_data_jurnal.csv')
