# DATA PREPROCESSING

In [None]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
import time
from transformers import BertTokenizer, BertModel
from tqdm.auto import tqdm

In [None]:
# Set device: prefer GPU index 1 (the original choice) when it exists, otherwise
# fall back to GPU 0, and to the CPU when CUDA is not available at all.
if torch.cuda.is_available():
    # Selecting a GPU index that does not exist raises, so only ask for
    # index 1 on machines that actually expose a second GPU.
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
    device = "cuda:%s" % torch.cuda.current_device()
else:
    device = "cpu"
device

LOAD DATASET

In [None]:
# Read the tab-separated training file, decoding it as ISO-8859-1.
kaggle_dataset = pd.read_csv(
    './training_set_rel3.tsv',
    sep='\t',
    encoding="ISO-8859-1",
)
kaggle_dataset

DATA CLEANSING

In [None]:
# Data Cleansing Function
def clean_dataset(input_dataset):
  """Keep the columns used downstream and drop rows that contain nulls.

  Args:
    input_dataset: DataFrame providing the columns 'essay_id', 'essay_set',
      'essay' and 'domain1_score'.

  Returns:
    A new DataFrame with the columns 'essay_id', 'essay_set', 'essay' and
    'score' (taken from 'domain1_score'), without rows holding missing
    values. The per-column null counts before and after the drop are printed.
  """
  # Source column -> name it receives in the trimmed frame.
  column_map = {
    'essay_id': 'essay_id',
    'essay_set': 'essay_set',
    'essay': 'essay',
    'domain1_score': 'score',
  }
  trimmed = pd.DataFrame(
    {new_name: input_dataset[old_name] for old_name, new_name in column_map.items()}
  )

  # Report the null count per column before cleaning.
  print("Jumlah missing values:")
  print(trimmed.isnull().sum())

  # Drop incomplete rows, then report the null counts again.
  without_nulls = trimmed.dropna()
  print("\nJumlah missing values setelah data dibersihkan:")
  print(without_nulls.isnull().sum())

  print("\nDataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:")

  return without_nulls

In [None]:
# Run the cleaning step on the loaded frame and display the result.
dataset_cleaned = clean_dataset(kaggle_dataset)
dataset_cleaned

SCORE NORMALIZATION

In [None]:
# Essay score ranges: for each essay set (key) the (minimum, maximum) score.
# Used for min-max normalization and for restoring scores to their original scale.
min_max_ranges = {
    1: (2, 12),
    2: (1, 6),
    3: (0, 3),
    4: (0, 3),
    5: (0, 4),
    6: (0, 4),
    7: (0, 30),
    8: (0, 60)
}

In [None]:
#Score Normalization Function
def normalize_score(dataset, min_max_ranges):
    """Min-max normalize the 'score' column, essay set by essay set.

    Every score is mapped to ``(score - min) / (max - min)`` using the
    ``(min, max)`` pair registered for the row's 'essay_set'.

    The input frame is not modified: the normalization is written to a copy,
    so re-running the calling cell never normalizes already-normalized scores.

    Args:
        dataset: DataFrame with 'essay_set' and 'score' columns.
        min_max_ranges: dict mapping essay set -> (min_score, max_score).

    Returns:
        A new DataFrame equal to ``dataset`` except that 'score' holds the
        normalized value. Rows whose essay set has no entry in
        ``min_max_ranges`` get NaN, because no range is known for them.
    """
    normalized = dataset.copy()

    # Per-row bounds looked up from the essay set; unknown sets map to NaN.
    lower_bounds = {essay_set: low for essay_set, (low, high) in min_max_ranges.items()}
    upper_bounds = {essay_set: high for essay_set, (low, high) in min_max_ranges.items()}
    min_scores = normalized['essay_set'].map(lower_bounds)
    max_scores = normalized['essay_set'].map(upper_bounds)

    # Min-max formula applied to the whole column at once.
    normalized['score'] = (normalized['score'] - min_scores) / (max_scores - min_scores)

    return normalized

In [None]:
# Normalize the cleaned scores with the per-set ranges and display the result.
dataset_normalized = normalize_score(dataset_cleaned, min_max_ranges)
dataset_normalized

DATA SPLITTING

In [None]:
#Data Splitting Function
def data_splitting(dataset, train_size=0.7, random_state=42):
    """Split the dataset into train and test parts, one essay set at a time.

    Each essay set is split separately with the same proportion, and the
    per-set parts are then concatenated in order of first appearance of the
    set in ``dataset``.

    Args:
        dataset: DataFrame with 'essay_id', 'essay_set', 'essay' and 'score'.
        train_size: share of every essay set that goes to the training data.
        random_state: seed passed to ``train_test_split``.

    Returns:
        ``(train_data, test_data)``: two DataFrames with the columns
        'essay_id', 'essay_set', 'essay', 'score' and a fresh 0..n-1 index.
    """
    columns = ['essay_id', 'essay_set', 'essay', 'score']
    train_parts = []
    test_parts = []

    for essay_set in dataset['essay_set'].unique():
        # Filter on the frame that was passed in, not on a notebook global,
        # so the function works for any input and on a fresh kernel.
        subset = dataset.loc[dataset['essay_set'] == essay_set, columns]

        # Split features and score together so rows stay aligned.
        train_part, test_part = train_test_split(
            subset, train_size=train_size, random_state=random_state
        )
        train_parts.append(train_part)
        test_parts.append(test_part)

    # Merge the per-set parts into one training and one test frame.
    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    return train_data, test_data

In [None]:
# Split the normalized dataset into training and test frames.
train_data, test_data = data_splitting(dataset_normalized)

In [None]:
# Print the training data
print("Train Data:")
train_data

In [None]:
# Print the test data
print("Test Data:")
test_data

# EMBEDDING

Data Loader - Batch Size

In [None]:
# Build a dictionary that maps each id to an index
def get_id2emb(ids):
  """Map every essay id to its position within ``ids``.

  Args:
    ids: object exposing ``to_list()`` (e.g. a pandas Series of essay ids).

  Returns:
    dict ``{essay_id: position}``. When an id occurs more than once, the
    last position wins.
  """
  id2emb = {essay_id: position for position, essay_id in enumerate(ids.to_list())}

  print('Essay ids to embeddings dictionary created.')

  return id2emb

In [None]:
# Build the essay_id -> row-position lookups for the training and test frames.
id2emb_train = get_id2emb(train_data['essay_id'])
id2emb_test = get_id2emb(test_data['essay_id'])

In [None]:
def get_loader(df, id2emb, essay_embeddings, batch_size, shuffle):
    """Wrap essay embeddings and scores of ``df`` in a PyTorch DataLoader.

    Args:
        df: DataFrame with 'essay_id' and 'score' columns.
        id2emb: dict mapping essay id -> row position in ``essay_embeddings``.
        essay_embeddings: container indexable by row position.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data on every pass.

    Returns:
        DataLoader over a TensorDataset of (embedding, score) float32 tensors,
        ordered like the rows of ``df`` and served by two worker processes.
    """
    # Resolve every essay id to its embedding row, keeping the order of df.
    row_positions = [id2emb[essay_id] for essay_id in df['essay_id']]
    embedding_tensor = torch.from_numpy(
        np.array([essay_embeddings[position] for position in row_positions])
    ).float()

    # Target scores, in the same row order.
    score_tensor = torch.from_numpy(np.array(df['score'])).float()

    return DataLoader(
        TensorDataset(embedding_tensor, score_tensor),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
    )

# EMBEDDING BERT

LOAD PRETRAINED MODEL BERT

In [None]:
# Load the pretrained BERT model (moved to the selected device) and its tokenizer
bert_model = BertModel.from_pretrained("bert-base-cased").to(device)
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Show which tokenizer class was loaded.
print(type(tokenizer_bert))

In [None]:
# Report the tokenizer's vocabulary size.
vocab_size = tokenizer_bert.vocab_size
print(f"Jumlah kata dalam kamus tokenizer: {vocab_size}")

In [None]:
# Tokenize a sample sentence and print the resulting input ids.
tokens = tokenizer_bert(["waiting for my turn"])
print(tokens['input_ids'])

In [None]:
# Access the embedding layer
embedding_layer = bert_model.embeddings

# Print the embedding layer and the shape of its word-embedding weight matrix
print(embedding_layer)
print(embedding_layer.word_embeddings.weight.shape)

In [None]:
# Print the full tokenizer output for the single word 'waiting'.
print(tokenizer_bert(['waiting']))

In [None]:
# Print row 2613 of the word-embedding weight matrix.
# NOTE(review): 2613 is presumably the token id shown for 'waiting' in the
# previous cell's output — confirm before relying on it.
print(embedding_layer.word_embeddings.weight[2613])

WORD EMBEDDING

In [None]:
def bert_embedding(essay_list, tokenizer, model):
  """Encode every essay into one vector by averaging the model's first output.

  Each essay is tokenized on its own (with truncation), passed through
  ``model`` without gradient tracking, and the first element of the model
  output is averaged over axis 0 after squeezing.
  NOTE(review): that first element is presumably the per-token hidden states,
  making the result a mean-pooled token embedding — confirm.

  Args:
    essay_list: iterable of essay texts.
    tokenizer: callable producing the model inputs as PyTorch tensors.
    model: network called with the tokenizer output as keyword arguments.

  Returns:
    ``np.matrix`` with one row per essay.
  """
  print('Encoding essay embeddings:')

  essay_vectors = []
  for essay_text in tqdm(essay_list):
    model_inputs = tokenizer(essay_text, padding=True, truncation=True, return_tensors='pt').to(device)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
      first_output = model(**model_inputs)[0]

    # Average over axis 0 of the squeezed output to get a single flat vector.
    token_matrix = np.matrix(first_output.squeeze().cpu())
    essay_vectors.append(np.squeeze(np.asarray(token_matrix.mean(0))))

  return np.matrix(essay_vectors)

In [None]:
# Compute and keep the BERT embeddings of the training essays
train_embeddings_bert = bert_embedding(train_data['essay'], tokenizer_bert, bert_model)

In [None]:
# Compute and keep the BERT embeddings of the test essays
test_embeddings_bert = bert_embedding(test_data['essay'], tokenizer_bert, bert_model)

In [None]:
# Shape of the training embedding matrix.
print(train_embeddings_bert.shape)

In [None]:
# Shape of the test embedding matrix.
print(test_embeddings_bert.shape)

In [None]:
# Display the training embedding matrix.
train_embeddings_bert

In [None]:
# Display the test embedding matrix.
test_embeddings_bert 

# REGRESI FCNN

INISIALISASI FCNN

In [None]:
# Fully connected regression network (FCNN)
class FCNN(nn.Module):
    """Three-layer fully connected regressor with a sigmoid output.

    Architecture::

        Linear(input_size, hidden1) -> ReLU -> Dropout
        -> Linear(hidden1, hidden2) -> ReLU -> Dropout
        -> Linear(hidden2, 1) -> Sigmoid

    The sigmoid keeps every prediction inside (0, 1).

    Args:
        input_size: number of features of each input vector.
        hidden1: width of the first hidden layer (default 397).
        hidden2: width of the second hidden layer (default 32).
        dropout: dropout probability applied after each hidden layer
            (default 0.3).
    """

    def __init__(self, input_size, hidden1=397, hidden2=32, dropout=0.3):
        super().__init__()
        # Attribute names and creation order are kept stable so that saved
        # state_dicts and seeded initializations stay compatible.
        self.fc1 = nn.Linear(input_size, hidden1)
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(hidden2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the feed-forward pass and return the sigmoid-activated output."""
        x = self.dropout1(torch.relu(self.fc1(x)))
        x = self.dropout2(torch.relu(self.fc2(x)))
        return self.sigmoid(self.fc3(x))

TESTING FUNCTION

In [None]:
def test_step(trained_model, cost_function, test_loader):
    """Return the sample-weighted average loss of the model over a loader.

    Args:
        trained_model: network to evaluate; it is switched to eval mode.
        cost_function: callable ``(outputs, targets) -> loss`` returning a tensor.
        test_loader: iterable of ``(inputs, targets)`` batches.

    Returns:
        Sum of ``batch loss * batch size`` divided by the number of samples.
    """
    trained_model.eval()  # evaluation mode
    total_loss = 0.
    n_seen = 0.

    with torch.no_grad():
        for features, labels in test_loader:
            # Drop a size-1 second dimension (if present) and move to the device.
            features = features.squeeze(dim=1).to(device)

            # Targets become a single column so they line up with the outputs.
            labels = labels.reshape(labels.shape[0], 1).to(device)

            predictions = trained_model(features).reshape(-1, 1)
            batch_loss = cost_function(predictions, labels)

            # Weight each batch by its size so a shorter last batch counts correctly.
            n_batch = features.shape[0]
            n_seen += n_batch
            total_loss += batch_loss.item() * n_batch

    return total_loss / n_seen


TRAINING FUNCTION

In [None]:
# Training function for the model
def training_step(model, cost_function, train_loader, test_loader, save_path, num_epochs, lr):
    """Train ``model`` with Adam, evaluate after every epoch, then save the weights.

    Args:
        model: network to optimize; its parameters are updated in place.
        cost_function: callable ``(outputs, targets) -> loss`` returning a tensor.
        train_loader: iterable of ``(inputs, targets)`` batches used for training.
        test_loader: loader handed to ``test_step`` after each epoch.
        save_path: file the final ``state_dict`` is written to.
        num_epochs: number of passes over ``train_loader``.
        lr: learning rate for Adam.

    Returns:
        ``(train_losses, test_losses)``: lists with one sample-weighted
        average loss per epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)

    history_train, history_test = [], []

    for epoch_idx in range(num_epochs):
        model.train()  # training mode

        # Per-epoch accumulators for the sample-weighted mean loss.
        loss_sum = 0.
        seen = 0.

        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)

            # Targets become a single column so they line up with the outputs.
            batch_targets = batch_targets.reshape(batch_targets.shape[0], 1).to(device)

            # Forward pass and loss.
            batch_outputs = model(batch_inputs).reshape(-1, 1)
            loss = cost_function(batch_outputs, batch_targets)

            # Clear old gradients, backpropagate, then update the parameters.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            n_batch = batch_inputs.shape[0]
            loss_sum += loss.item() * n_batch
            seen += n_batch

        epoch_train_loss = loss_sum / seen

        # Loss on the held-out loader after this epoch.
        epoch_test_loss = test_step(model, cost_function, test_loader)

        history_train.append(epoch_train_loss)
        history_test.append(epoch_test_loss)

        print('Epoch: {:}/{:}\tLoss/train: {:.5f}\tLoss/test: {:.5f}'.format(epoch_idx+1, num_epochs, epoch_train_loss, epoch_test_loss))

    # Persist the weights once training has finished.
    torch.save(model.state_dict(), save_path)
    print(f"Model saved at {save_path}")

    return history_train, history_test

SCORING FUNCTION

In [None]:
# Prediction function for the test data
def scoring(trained_model, test_loader):
    """Predict a score for every sample served by ``test_loader``.

    The model is moved to the notebook's device and switched to eval mode, so
    the predictions do not depend on the mode the model was left in.

    Args:
        trained_model: trained network.
        test_loader: iterable of ``(inputs, targets)`` batches; the targets
            are ignored.

    Returns:
        Flat list with one numpy scalar prediction per sample, in the order
        the loader yields them.
    """
    trained_model.to(device)  # Move the model to the correct device
    trained_model.eval()      # inference mode for layers that behave differently in training
    predictions = []

    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            outputs = trained_model(inputs)

            # reshape(-1) always yields a 1-D tensor. squeeze() would collapse
            # a single-sample batch to a 0-d value that extend() cannot iterate.
            predictions.extend(outputs.reshape(-1).cpu().numpy())

    return predictions

# MAIN

In [None]:
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably meant to silence the tokenizers parallelism warning
# once the DataLoader worker processes start — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAINING

In [None]:
# Hyperparameters
# NOTE(review): batch_size, epochs and lr were left blank in the original cell,
# which is a SyntaxError. The values below are placeholders so the notebook
# runs end to end — TODO replace them with the settings used for the reported results.
input_size = 768  # must equal the width of the essay embeddings fed to the model
batch_size = 32
epochs = 10
lr = 1e-3

# TRAINING
# Initialize the model and the loss function
model_bert = FCNN(input_size).to(device)
cost_function = torch.nn.MSELoss()

# Dataloaders (the test loader is not shuffled so predictions keep the row order of test_data)
train_loader_bert = get_loader(train_data, id2emb_train, train_embeddings_bert, batch_size, shuffle=True)
test_loader_bert = get_loader(test_data, id2emb_test, test_embeddings_bert, batch_size, shuffle=False)

print('------------------------------------------------------------------')
print(f"\t\t\tTraining model BERT: ")
print('------------------------------------------------------------------')
# Path where the model is saved and later loaded from
save_path = 'model_bert_v7-7.9.5.pth'

start_time_bert = time.time()
train_loss_bert, test_loss_bert = training_step(model_bert, cost_function, train_loader_bert, test_loader_bert, save_path, epochs, lr)
end_time_bert = time.time()

print('Training time:', end_time_bert - start_time_bert)

SCORING

In [None]:
# Load the trained weights back into the model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on one GPU can still be loaded on another GPU or the CPU.
model_bert_trained = model_bert 
model_bert_trained.load_state_dict(torch.load(save_path, map_location=device))

# Use the model to predict on the test data
print('------------------------------------------------------------------')
print(f"\t\t\tScoring Essay: ")
print('------------------------------------------------------------------')
start_time_eval = time.time()
test_predictions_bert = scoring(model_bert_trained, test_loader_bert)
end_time_eval = time.time()
print('Evaluation time:', end_time_eval - start_time_eval)

# store train_df, test_df and predictions
train_df_bert = train_data
test_df_bert = test_data
preds_bert = test_predictions_bert

# RESULT

In [None]:
def get_results_df(test_df, model_preds):
  """Attach the predictions to the test frame and sort the rows by essay id.

  Args:
    test_df: DataFrame with 'essay_id' and 'score' columns.
    model_preds: predictions, one per row of ``test_df`` and in the same order.

  Returns:
    New DataFrame sorted by 'essay_id' with a fresh index, a 'prediction'
    column, and 'score' moved to the last position.
  """
  # Predictions arrive as column 0 and are joined on the positional index.
  preds_df = pd.DataFrame(model_preds)
  results_df = test_df.reset_index(drop=True).join(preds_df)
  results_df = results_df.rename(columns={0: 'prediction'})
  results_df = results_df.sort_values(by='essay_id').reset_index(drop=True)

  # Pop and re-insert 'score' so it ends up as the last column.
  results_df['score'] = results_df.pop('score')

  return results_df

In [None]:
# Display the test rows together with the model's (still normalized) predictions.
get_results_df(test_df_bert, preds_bert)

# DENORMALISASI

In [None]:
def inverse_normalize_score(score, min_max_range):
    """Map a min-max normalized score back to its original range.

    Args:
        score: normalized score.
        min_max_range: ``(min_score, max_score)`` of the original scale.

    Returns:
        ``score * (max_score - min_score) + min_score`` rounded with the
        built-in ``round``.
    """
    lowest, highest = min_max_range
    span = highest - lowest
    return round(score * span + lowest)

def restore_original_scores(df, preds, min_max_ranges):
    """Bring actual and predicted scores back to each essay set's original range.

    Args:
        df: DataFrame with 'essay_set' and normalized 'score' columns.
        preds: normalized predictions, one per row of ``df`` in row order.
        min_max_ranges: dict mapping essay set -> (min_score, max_score).

    Returns:
        Copy of ``df`` whose 'score' column holds the restored actual scores
        and whose 'prediction' column holds the restored predictions.

    Raises:
        AssertionError: if ``preds`` and ``df`` differ in length.
    """
    # Work on a copy so the caller's frame keeps its normalized values.
    restored_df = df.copy()

    normalized_actuals = restored_df['score'].values
    set_ids = restored_df['essay_set'].values

    # Every row needs exactly one prediction.
    assert len(preds) == len(restored_df), "Length of predictions does not match length of dataframe"

    def _to_original_scale(values):
        # Undo the normalization value by value with the range of the row's essay set.
        return [
            inverse_normalize_score(value, min_max_ranges[set_id])
            for value, set_id in zip(values, set_ids)
        ]

    restored_preds = _to_original_scale(preds)
    restored_actuals = _to_original_scale(normalized_actuals)

    restored_df['prediction'] = restored_preds
    restored_df['score'] = restored_actuals

    return restored_df

In [None]:
# Restore the predicted and actual scores to their original ranges
restored_results_df_bert = restore_original_scores(test_df_bert, preds_bert, min_max_ranges)

# Print the result
print("Restored Results:")
restored_results_df_bert

# EVALUASI QWK

In [None]:
import numpy as np

def calculate_qwk(actuals, preds):
    """Compute the Quadratic Weighted Kappa between two rating sequences.

    ``kappa = 1 - sum(W * O) / sum(W * E)`` where ``O`` is the observed
    confusion matrix, ``E`` the outer product of the two rating histograms
    divided by the number of samples, and ``W[i][j] = (i - j)^2 / (N - 1)^2``
    for ``N`` distinct rating levels between the smallest and largest rating.

    Args:
        actuals: sequence of actual ratings (cast to integers).
        preds: sequence of predicted ratings (cast to integers), same length.

    Returns:
        The kappa score. When every actual and predicted rating is the same
        single value the weights are undefined (N - 1 == 0); this complete
        agreement is reported as 1.0.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    actual_arr = np.asarray(actuals, dtype=int).ravel()
    pred_arr = np.asarray(preds, dtype=int).ravel()

    if actual_arr.size == 0 or pred_arr.size == 0:
        raise ValueError("calculate_qwk() needs at least one rating")
    if actual_arr.size != pred_arr.size:
        raise ValueError("actuals and preds must have the same length")

    # Rating scale spanned by both sequences.
    min_rating = int(min(actual_arr.min(), pred_arr.min()))
    max_rating = int(max(actual_arr.max(), pred_arr.max()))
    num_ratings = max_rating - min_rating + 1

    # Only one rating level: both raters agree on everything, and the weight
    # formula below would divide by zero.
    if num_ratings == 1:
        return 1.0

    # Weight matrix W: squared distance between ratings, scaled to [0, 1].
    levels = np.arange(num_ratings)
    weight_mat = (levels[:, None] - levels[None, :]) ** 2 / (num_ratings - 1) ** 2

    # Observed matrix O: counts of (actual, predicted) pairs.
    conf_mat = np.zeros((num_ratings, num_ratings))
    for actual_idx, pred_idx in zip(actual_arr - min_rating, pred_arr - min_rating):
        conf_mat[actual_idx, pred_idx] += 1

    # Expected matrix E from the marginal histograms of O.
    actual_hist = conf_mat.sum(axis=1)
    pred_hist = conf_mat.sum(axis=0)
    expected_mat = np.outer(actual_hist, pred_hist) / actual_arr.size

    observed_disagreement = np.sum(weight_mat * conf_mat)
    expected_disagreement = np.sum(weight_mat * expected_mat)

    return 1 - (observed_disagreement / expected_disagreement)

In [None]:
# Function that computes the QWK for every essay set
def calculate_qwk_per_set(df):
    """Compute the Quadratic Weighted Kappa separately for each essay set.

    Args:
        df: DataFrame with 'essay_set', 'score' and 'prediction' columns.

    Returns:
        dict mapping ``'Set <essay_set>'`` to that set's QWK, in order of
        first appearance of the essay set in ``df``.
    """
    def _qwk_for(essay_set):
        # Rows of this essay set, with both score columns cast to integers.
        rows = df[df['essay_set'] == essay_set]
        return calculate_qwk(rows['score'].astype(int), rows['prediction'].astype(int))

    return {
        f'Set {essay_set}': _qwk_for(essay_set)
        for essay_set in df['essay_set'].unique()
    }


In [None]:
# Compute the QWK per essay set
qwk_per_set_bert = calculate_qwk_per_set(restored_results_df_bert)

# Show the QWK of every set
for essay_set, qwk_score in qwk_per_set_bert.items():
    print(f"Quadratic Weighted Kappa Score for {essay_set}: {qwk_score}")

# Compute the unweighted mean of the per-set QWK values
average_qwk_bert = np.mean(list(qwk_per_set_bert.values()))

# Show the average QWK
print("Average Quadratic Weighted Kappa Score:", average_qwk_bert)