# DATA PREPROCESSING

In [77]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score as kappa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import time

In [78]:
# Set device
# Prefer GPU index 1 (the device used for the recorded run), but degrade gracefully:
# fall back to GPU 0 when that index does not exist and to the CPU when CUDA is
# unavailable, instead of crashing on torch.cuda.set_device(1).
PREFERRED_GPU_INDEX = 1
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if PREFERRED_GPU_INDEX < torch.cuda.device_count() else 0
    torch.cuda.set_device(gpu_index)
    device = "cuda:%s" % torch.cuda.current_device()
else:
    device = "cpu"
device

'cuda:1'

LOAD DATASET

In [79]:
# Load dataset .tsv
# The file is tab-separated and is decoded as ISO-8859-1.
kaggle_dataset  = pd.read_csv('./training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")
kaggle_dataset

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,17,18,,35,,,,...,4.0,4.0,4.0,3.0,,,,,,
12972,21628,8,I never understood the meaning laughter is th...,15,17,,32,,,,...,4.0,4.0,4.0,3.0,,,,,,
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",20,26,40.0,40,,,,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
12974,21630,8,Trippin' on fen...,20,20,,40,,,,...,4.0,4.0,4.0,4.0,,,,,,


DATA CLEANSING

In [80]:
# Data cleansing function
def clean_dataset(input_dataset):
    """Keep the four columns used downstream and drop rows with missing values.

    Args:
        input_dataset: DataFrame providing the columns 'essay_id', 'essay_set',
            'essay' and 'domain1_score'.

    Returns:
        A DataFrame with columns 'essay_id', 'essay_set', 'essay' and 'score'
        ('score' is taken from 'domain1_score'), without rows containing nulls.
        The original index labels are kept.

    Side effects:
        Prints the per-column missing-value counts before and after cleaning.
    """
    # target column name -> source column name
    column_sources = {
        'essay_id': 'essay_id',
        'essay_set': 'essay_set',
        'essay': 'essay',
        'score': 'domain1_score',
    }
    trimmed = pd.DataFrame(
        {target: input_dataset[source] for target, source in column_sources.items()}
    )

    # Report missing values before cleaning
    print("Jumlah missing values:")
    print(trimmed.isnull().sum())

    # Drop incomplete rows and report again
    without_gaps = trimmed.dropna()
    print("\nJumlah missing values setelah data dibersihkan:")
    print(without_gaps.isnull().sum())

    print("\nDataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:")

    return without_gaps

In [81]:
# Reduce the raw data to essay_id / essay_set / essay / score and drop incomplete rows
dataset_cleaned = clean_dataset(kaggle_dataset)
dataset_cleaned

Jumlah missing values:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Jumlah missing values setelah data dibersihkan:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Dataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:


Unnamed: 0,essay_id,essay_set,essay,score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35
12972,21628,8,I never understood the meaning laughter is th...,32
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40
12974,21630,8,Trippin' on fen...,40


In [82]:
# Average word count of each essay set (informational; the filter below uses a fixed target)
avg_word_counts = dataset_cleaned.groupby('essay_set')['essay'].apply(lambda x: x.str.split().str.len().mean()).round().astype(int)

# Keep only essays whose word count lies within +/- word_count_tolerance of the target
target_avg_word_count = 374
word_count_tolerance = 20
essay_word_counts = dataset_cleaned['essay'].str.split().str.len()
within_target = essay_word_counts.between(target_avg_word_count - word_count_tolerance,
                                          target_avg_word_count + word_count_tolerance)

# One vectorised mask replaces concatenating per-set pieces onto an empty frame, which
# re-copied the data on every iteration and could degrade column dtypes to object.
# The stable sort reproduces the set-by-set row order of the original loop.
filtered_dataset = dataset_cleaned[within_target].sort_values('essay_set', kind='mergesort')

# Check the word-count distribution after filtering
filtered_avg_word_counts = filtered_dataset.groupby('essay_set')['essay'].apply(lambda x: x.str.split().str.len().mean()).round().astype(int)
print("Filtered Average Word Counts per Essay Set:")
print(filtered_avg_word_counts)

Filtered Average Word Counts per Essay Set:
essay_set
1    375
2    373
3    375
4    357
6    358
7    370
8    374
Name: essay, dtype: int64


In [83]:
# Count the number of rows in each essay set
jumlah_data_per_set = filtered_dataset.groupby('essay_set').size()

# Show the number of rows per essay set
print("Jumlah Data per Set Esai:")
print(jumlah_data_per_set)

Jumlah Data per Set Esai:
essay_set
1    234
2    200
3      1
4      1
6      2
7     33
8     33
dtype: int64


In [84]:
# Essay sets 3, 4 and 6 keep only one or two essays after the word-count filter
sets_to_remove = [3, 4, 6]
# Filter out the specified sets
filtered_dataset = filtered_dataset[~filtered_dataset['essay_set'].isin(sets_to_remove)]

In [85]:
# Recount the rows in each essay set after removing the sparse sets
jumlah_data_per_set = filtered_dataset.groupby('essay_set').size()

# Show the number of rows per essay set
print("Jumlah Data per Set Esai:")
print(jumlah_data_per_set)

Jumlah Data per Set Esai:
essay_set
1    234
2    200
7     33
8     33
dtype: int64


SCORE NORMALIZATION

In [86]:
# Score range (minimum score, maximum score) of each essay set, keyed by essay_set id.
# Passed to the min-max normalisation and to its inverse further below.
min_max_ranges = {
    1: (2, 12),
    2: (1, 6),
    3: (0, 3),
    4: (0, 3),
    5: (0, 4),
    6: (0, 4),
    7: (0, 30),
    8: (0, 60)
}

In [87]:
# Score normalisation function
def normalize_score(dataset, min_max_ranges):
    """Min-max normalise the 'score' column separately for each essay set.

    Each score becomes (score - min_score) / (max_score - min_score), using the
    (min_score, max_score) pair registered for the row's essay set.

    Unlike the earlier version, the input frame is NOT modified: a normalised
    copy is returned. Mutating the caller's frame made the notebook cell
    non-idempotent (running it twice normalised the scores twice).

    Args:
        dataset: DataFrame with 'essay_set' and 'score' columns.
        min_max_ranges: dict mapping essay_set id -> (min_score, max_score).

    Returns:
        A new DataFrame with the same index and columns as `dataset`, whose
        'score' column holds float normalised scores. Rows whose essay set has
        no entry in `min_max_ranges` get NaN.
    """
    lower_bounds = {essay_set: low for essay_set, (low, high) in min_max_ranges.items()}
    spans = {essay_set: high - low for essay_set, (low, high) in min_max_ranges.items()}

    normalized = dataset.copy()

    # Look up each row's range once; the explicit float casts also cover
    # object-dtype columns (e.g. produced by concatenating onto an empty frame).
    set_lower = normalized['essay_set'].map(lower_bounds).astype(float)
    set_span = normalized['essay_set'].map(spans).astype(float)

    normalized['score'] = (normalized['score'].astype(float) - set_lower) / set_span

    return normalized

In [88]:
# Normalise the scores of the filtered essays with the per-set score ranges
dataset_normalized = normalize_score(filtered_dataset, min_max_ranges)
dataset_normalized

Unnamed: 0,essay_id,essay_set,essay,score
11,12,1,Dear @CAPS1 @CAPS2 I feel that computers do ta...,0.600000
17,18,1,"Dear Local Newspaper, I must admit that the ex...",0.600000
20,21,1,"Dear @CAPS1 of the @CAPS2 @CAPS3 daily, I am w...",0.600000
25,26,1,Do you think that computers are useless? Or do...,0.700000
27,28,1,"Dear Newspaper, Computers are high tec and hav...",0.700000
...,...,...,...,...
12887,21514,8,I think laughter should be a huge part in eve...,0.566667
12904,21537,8,In the @DATE1 of @NUM1' I spent two weeks at ...,0.550000
12948,21595,8,Have you ever experienced a time with your fr...,0.600000
12949,21596,8,I woke up just like any other day happy yet l...,0.516667


DATA SPLITTING

In [89]:
def data_splitting(dataset):
    """Split the dataset 70/30 into train and test data, one essay set at a time.

    Every essay set is split on its own with `train_test_split`
    (train_size=0.7, random_state=42); the per-set pieces are then stacked in
    the order in which the sets first appear in `dataset`.

    Args:
        dataset: DataFrame with columns 'essay_id', 'essay_set', 'essay', 'score'.

    Returns:
        (train_data, test_data): two DataFrames with those four columns and a
        fresh 0..n-1 index.
    """
    column_order = ['essay_id', 'essay_set', 'essay', 'score']
    train_parts = []
    test_parts = []

    for set_id in dataset['essay_set'].unique():
        # Rows of the current essay set, restricted to the columns we keep
        rows_in_set = dataset.loc[dataset['essay_set'] == set_id, column_order]

        # 70% train / 30% test, reproducible through the fixed random_state
        train_rows, test_rows = train_test_split(rows_in_set, train_size=0.7, random_state=42)

        train_parts.append(train_rows)
        test_parts.append(test_rows)

    # Stack the per-set pieces into one train frame and one test frame
    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    return train_data, test_data

In [90]:
# Split the normalised dataset into train and test frames (per essay set)
train_data, test_data = data_splitting(dataset_normalized)

In [91]:
# Show the training data
print("Train Data:")
train_data

Train Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,1052,1,Did you know that @PERCENT2 out of @PERCENT1 c...,0.700000
1,645,1,The computer effects peoplelife by taking them...,0.600000
2,29,1,"Dear local newspaper, @CAPS1 people throughout...",0.700000
3,476,1,I think the computer dire a postive effect on ...,0.600000
4,1012,1,"Dear local newspaper writer, I'm writting to t...",0.700000
...,...,...,...,...
344,21598,8,"Laughter is an important part of my life, eit...",0.500000
345,21016,8,Laughter is such a great joy in my life. If p...,0.516667
346,21115,8,There are many reasons why laughter is an imp...,0.533333
347,21249,8,"It was the first day of the ninth grade, that...",0.700000


In [92]:
# Show the test data
print("Test Data:")
test_data

Test Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,560,1,"Dear @CAPS1 Newspaper, @CAPS2 though computers...",0.600000
1,1614,1,Dear @CAPS1 I think that computers are benefic...,0.700000
2,1392,1,I think that computers have a good effect on p...,0.300000
3,73,1,"Dear The @ORGANIZATION1, ""@CAPS1, @CAPS1, @CAP...",0.600000
4,1055,1,"To whom it @MONTH1 concern, To many people it ...",0.600000
...,...,...,...,...
146,21090,8,"The @CAPS1 of Laughter @CAPS2 friends and I, a...",0.616667
147,21369,8,Laughter is the key I think that being happy ...,0.483333
148,21407,8,The @CAPS1 of @CAPS2 @CAPS3 I think of laughte...,0.666667
149,21204,8,It was the first day of sophomore year. I had...,0.516667


In [93]:
# Function to count test data for each essay set
def count_test_data(test_data):
    """Count the rows of `test_data` that belong to each essay set 1..8.

    Args:
        test_data: DataFrame with an 'essay_set' column.

    Returns:
        dict mapping every essay set id from 1 to 8 to its row count
        (0 for sets that do not occur).
    """
    set_column = test_data['essay_set']
    return {set_id: int((set_column == set_id).sum()) for set_id in range(1, 9)}

# Get the count of test data for each essay set
# (count_test_data always reports sets 1-8, so sets absent from the split show 0)
test_data_count = count_test_data(test_data)

# Print the count of test data for each essay set
for essay_set, count in test_data_count.items():
    print(f"Set {essay_set}: {count} test data")

Set 1: 71 test data
Set 2: 60 test data
Set 3: 0 test data
Set 4: 0 test data
Set 5: 0 test data
Set 6: 0 test data
Set 7: 10 test data
Set 8: 10 test data


Data Loader - Batch Size

In [94]:
# Build a dictionary that maps each id to a positional index
def get_id2emb(ids):
    """Map every id to its position in `ids`.

    Args:
        ids: object exposing `.to_list()` (the notebook passes the 'essay_id' column).

    Returns:
        dict id -> position. When an id occurs more than once, the last
        position wins.

    Side effects:
        Prints a confirmation message.
    """
    id2emb = {essay_id: position for position, essay_id in enumerate(ids.to_list())}

    print('Essay ids to embeddings dictionary created.')

    return id2emb

In [95]:
# Map each essay_id to its row position, separately for the train and the test frame
id2emb_train = get_id2emb(train_data['essay_id'])
id2emb_test = get_id2emb(test_data['essay_id'])

Essay ids to embeddings dictionary created.
Essay ids to embeddings dictionary created.


In [96]:
def get_loader(df, id2emb, essay_embeddings, batch_size, shuffle):
    """Wrap the embeddings and scores of `df` in a PyTorch DataLoader.

    Args:
        df: DataFrame with 'essay_id' and 'score' columns.
        id2emb: dict mapping essay_id -> row position in `essay_embeddings`.
        essay_embeddings: indexable collection of embedding vectors.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples every epoch.

    Returns:
        DataLoader (num_workers=2) over a TensorDataset of
        (float32 embeddings, float32 scores), in the row order of `df`.
    """
    # Row position of every essay in the embedding collection
    row_positions = [id2emb[essay_id] for essay_id in df['essay_id']]
    feature_matrix = np.array([essay_embeddings[position] for position in row_positions])

    # Targets in the same row order as the features
    target_vector = np.array(df['score'])

    feature_tensor = torch.from_numpy(feature_matrix).float()
    target_tensor = torch.from_numpy(target_vector).float()

    return DataLoader(
        TensorDataset(feature_tensor, target_tensor),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
    )

# EMBEDDING SBERT

LOAD PRETRAINED MODEL SBERT

In [97]:
# Load the pretrained SBERT checkpoint together with its tokenizer
sbert_checkpoint = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
tokenizer_sbert = AutoTokenizer.from_pretrained(sbert_checkpoint)
sbert_model = AutoModel.from_pretrained(sbert_checkpoint).to(device)

# Print the model architecture
print("Model information:")
print(sbert_model)



Model information:
MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          

SENTENCE EMBEDDING

In [98]:
def sbert_embedding(essay_list, tokenizer, model):
    """Encode essays into fixed-size vectors by pooling the model's token embeddings.

    Each essay is tokenised (with truncation) and encoded on its own. Its
    last-layer token embeddings are pooled three ways and concatenated in the
    order [mean pooling, max pooling, first-token (CLS) embedding], so the
    resulting vector is three times as wide as the model's hidden size.

    Args:
        essay_list: iterable of essay strings.
        tokenizer: tokenizer callable returning PyTorch tensors.
        model: encoder whose output exposes `last_hidden_state`.

    Returns:
        np.ndarray with one row per essay.

    Notes:
        Inputs are moved to the notebook-level `device`; the model is expected
        to live there already.
    """
    print('Encoding essay embeddings:')

    embeddings = []
    for essay in tqdm(essay_list):
        encoded_input = tokenizer(essay, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Select the single batch entry explicitly. A bare squeeze() drops EVERY
        # size-1 axis, so a one-token input would also lose its token axis and
        # the pooling below would reduce over the wrong dimension.
        token_embeddings = model_output.last_hidden_state[0].cpu().numpy()

        # Mean and max over the token axis
        mean_pooling = token_embeddings.mean(axis=0)
        max_pooling = token_embeddings.max(axis=0)

        # Embedding of the first (CLS) token
        cls_pooling = token_embeddings[0]

        # Concatenate mean, max and CLS pooling results
        embeddings.append(np.concatenate((mean_pooling, max_pooling, cls_pooling)))

    return np.array(embeddings)

In [99]:
# Compute and keep the SBERT embeddings of the training essays
train_embeddings_sbert = sbert_embedding(train_data['essay'], tokenizer_sbert, sbert_model)

Encoding essay embeddings:


  0%|          | 0/349 [00:00<?, ?it/s]

In [100]:
# Compute and keep the SBERT embeddings of the test essays
test_embeddings_sbert = sbert_embedding(test_data['essay'], tokenizer_sbert, sbert_model)

Encoding essay embeddings:


  0%|          | 0/151 [00:00<?, ?it/s]

In [101]:
# Shape of the training embedding matrix: (number of essays, embedding width)
print(train_embeddings_sbert.shape)

(349, 2304)


In [102]:
# Shape of the test embedding matrix: (number of essays, embedding width)
print(test_embeddings_sbert.shape)

(151, 2304)


In [103]:
# Display the training embedding matrix (np.shape(train_embeddings_sbert) gives its dimensions)
train_embeddings_sbert

array([[-0.04381351, -0.2929353 , -0.19225127, ...,  0.23661417,
        -0.10249371, -0.28071922],
       [-0.02088969, -0.41765168, -0.22702567, ..., -0.06197346,
        -0.22347306, -0.34798333],
       [-0.11438533, -0.32085142, -0.21381213, ..., -0.07138707,
        -0.14463581, -0.3485329 ],
       ...,
       [ 0.10865152, -0.41539565, -0.21752897, ...,  0.1955385 ,
         0.39938778, -0.30938926],
       [ 0.04142565, -0.26300216, -0.20707145, ...,  0.08358064,
         0.11153191, -0.28362325],
       [ 0.10163353, -0.3595566 , -0.1719999 , ...,  0.10730707,
         0.2526765 , -0.22698738]], dtype=float32)

In [104]:
# Display the test embedding matrix
test_embeddings_sbert 

array([[-0.0415417 , -0.39821178, -0.19009815, ...,  0.08180489,
        -0.16593353, -0.2926095 ],
       [-0.05087076, -0.18135877, -0.17721929, ..., -0.1233057 ,
        -0.04336806, -0.24059765],
       [-0.00315987, -0.3030437 , -0.21211623, ..., -0.05258212,
         0.03890963, -0.17849146],
       ...,
       [ 0.11772002, -0.3721691 , -0.20598808, ..., -0.03406287,
         0.13292538, -0.44794503],
       [ 0.07173629, -0.34980538, -0.19100817, ...,  0.262721  ,
         0.19996798, -0.20783165],
       [ 0.14203782, -0.3976082 , -0.18679394, ...,  0.2678221 ,
         0.32901722, -0.25610444]], dtype=float32)

# REGRESI FCNN

INISIALISASI FCNN

In [146]:
# Fully connected regression network (FCNN)
class FCNN(nn.Module):
    """Three-layer fully connected regressor with a sigmoid output.

    Architecture: input_size -> hidden1_size -> hidden2_size -> 1, with ReLU and
    dropout after each hidden layer and a sigmoid on the output, so predictions
    lie strictly between 0 and 1.

    Args:
        input_size: width of the input feature vector.
        hidden1_size: units in the first hidden layer (default 397).
        hidden2_size: units in the second hidden layer (default 32).
        dropout_p: dropout probability after each hidden layer (default 0.3).
    """

    # Layer configuration
    def __init__(self, input_size, hidden1_size=397, hidden2_size=32, dropout_p=0.3):
        super(FCNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)     # first layer: input_size -> hidden1_size
        self.dropout1 = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)   # second layer: hidden1_size -> hidden2_size
        self.dropout2 = nn.Dropout(dropout_p)
        self.fc3 = nn.Linear(hidden2_size, 1)              # output layer: hidden2_size -> 1
        self.sigmoid = nn.Sigmoid()                        # squashes the output into (0, 1)

    # Feed-forward pass
    def forward(self, x):
        """Return one prediction per row of `x`, as a tensor with a trailing dimension of 1."""
        x = self.dropout1(torch.relu(self.fc1(x)))   # hidden layer 1: linear -> ReLU -> dropout
        x = self.dropout2(torch.relu(self.fc2(x)))   # hidden layer 2: linear -> ReLU -> dropout
        return self.sigmoid(self.fc3(x))             # output layer with sigmoid activation

TESTING FUNCTION

In [147]:
def test_step(trained_model, cost_function, test_loader):
    trained_model.eval() # Mengatur model ke mode evaluasi (eval mode)
    test_loss = 0.
    samples = 0.
    
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(test_loader):
            
            # Menghapus dimensi yang tidak perlu dari inputs dan mentransfer ke device
            inputs = inputs.squeeze(dim=1).to(device)
            
            # Menyesuaikan dimensi targets dan mentransfer ke device
            targets = targets.reshape(targets.shape[0], 1).to(device)
            
            # Menghitung output model (prediksi) dari inputs
            outputs = trained_model(inputs).reshape(-1, 1)
            
            # Menghitung nilai loss dengan membandingkan outputs dengan targets
            loss = cost_function(outputs, targets)
            
            # Menghitung jumlah sampel dalam batch
            samples += inputs.shape[0]
            
            # Menambahkan nilai loss dari batch ke test_loss
            test_loss += loss.item() * inputs.shape[0]
            
    # Menghitung rata-rata loss di seluruh batch (samples)
    avg_loss = test_loss / samples
    
    # Mengembalikan nilai rata-rata loss
    return avg_loss


TRAINING FUNCTION

In [148]:
# Training loop for the model
def training_step(model, cost_function, train_loader, test_loader, save_path, num_epochs, lr):
    """Train `model` with Adam, evaluate it after every epoch and save the final weights.

    Args:
        model: the nn.Module to train.
        cost_function: loss callable taking (outputs, targets).
        train_loader: iterable of (inputs, targets) training batches.
        test_loader: loader handed to `test_step` after each epoch.
        save_path: file path where the final state dict is written.
        num_epochs: number of passes over `train_loader`.
        lr: learning rate for the Adam optimizer.

    Returns:
        (train_losses, test_losses): per-epoch sample-weighted average losses.

    Side effects:
        Prints one progress line per epoch and writes the state dict to `save_path`.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)

    train_losses, test_losses = [], []

    for epoch_idx in range(num_epochs):
        model.train()  # training mode (enables dropout)

        # Per-epoch accumulators
        loss_sum = 0.
        seen = 0.

        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)
            # Targets become a column so they line up with the model outputs
            batch_targets = batch_targets.reshape(batch_targets.shape[0], 1).to(device)
            batch_count = batch_inputs.shape[0]

            # Forward pass and loss
            batch_outputs = model(batch_inputs).reshape(-1, 1)
            batch_loss = cost_function(batch_outputs, batch_targets)

            # Clear old gradients, backpropagate, update the parameters
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size
            loss_sum += batch_loss.item() * batch_count
            seen += batch_count

        # Average training loss of this epoch
        train_loss = loss_sum / seen

        # Evaluate on the held-out loader
        test_loss = test_step(model, cost_function, test_loader)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        print('Epoch: {:}/{:}\tLoss/train: {:.5f}\tLoss/test: {:.5f}'.format(epoch_idx + 1, num_epochs, train_loss, test_loss))

    # Persist the weights once training has finished
    torch.save(model.state_dict(), save_path)
    print(f"Model saved at {save_path}")

    return train_losses, test_losses

SCORING FUNCTION

In [149]:
# Predict scores for the data in a loader
def scoring(trained_model, test_loader):
    """Run the trained model over a loader and collect its predictions.

    The model is put into evaluation mode first, so dropout can never leak into
    the predictions regardless of the mode the caller left it in. Batches are
    moved to the device the model's parameters live on.

    Args:
        trained_model: the nn.Module used for prediction.
        test_loader: iterable of (inputs, targets) batches; targets are ignored.

    Returns:
        list with one scalar prediction per sample, in loader order.
    """
    trained_model.eval()

    # Run wherever the model already is; fall back to the CPU for parameter-free models
    first_param = next(trained_model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []

    with torch.no_grad():
        for inputs, _ in test_loader:
            outputs = trained_model(inputs.to(run_device))

            # reshape(-1) always yields a 1-D tensor. squeeze() turned a batch of
            # one sample into a 0-d value, which cannot be iterated by extend().
            predictions.extend(outputs.reshape(-1).cpu().numpy())

    return predictions

# MAIN

In [150]:
import os
# Set the TOKENIZERS_PARALLELISM environment variable to "false".
# NOTE(review): presumably meant to avoid the tokenizers fork warning once DataLoader
# workers start; this cell runs after the tokenizer was already loaded and used above,
# so confirm it takes effect where intended.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAINING

In [151]:
# Hyperparameters
# The input width is read from the embeddings instead of being hard-coded (2304 for
# the recorded run), so the network always matches the pooled feature vector.
input_size = train_embeddings_sbert.shape[1]
assert test_embeddings_sbert.shape[1] == input_size, "train and test embeddings differ in width"
batch_size = 32
lr = 2e-5
epochs = 100

# TRAINING
# Initialise the model and the loss function
model_sbert = FCNN(input_size).to(device)
cost_function = torch.nn.MSELoss()

# Dataloaders
train_loader_sbert = get_loader(train_data, id2emb_train, train_embeddings_sbert, batch_size, shuffle=True)
test_loader_sbert = get_loader(test_data, id2emb_test, test_embeddings_sbert, batch_size, shuffle=False)

print('------------------------------------------------------------------')
print(f"\t\t\tTraining model SBERT: ")
print('------------------------------------------------------------------')
# Path where the trained weights are saved (and later loaded from)
save_path = 'model_sbert_v7-mean_max_cls_rata2_banyak.pth'

start_time_sbert = time.time()
train_loss_sbert, test_loss_sbert = training_step(model_sbert, cost_function, train_loader_sbert, test_loader_sbert, save_path, epochs, lr)
end_time_sbert = time.time()

print('Training time:', end_time_sbert - start_time_sbert)

------------------------------------------------------------------
			Training model SBERT: 
------------------------------------------------------------------
Epoch: 1/100	Loss/train: 0.01860	Loss/test: 0.01840
Epoch: 2/100	Loss/train: 0.01590	Loss/test: 0.01690
Epoch: 3/100	Loss/train: 0.01427	Loss/test: 0.01592
Epoch: 4/100	Loss/train: 0.01316	Loss/test: 0.01518
Epoch: 5/100	Loss/train: 0.01189	Loss/test: 0.01452
Epoch: 6/100	Loss/train: 0.01161	Loss/test: 0.01362
Epoch: 7/100	Loss/train: 0.01139	Loss/test: 0.01297
Epoch: 8/100	Loss/train: 0.01073	Loss/test: 0.01259
Epoch: 9/100	Loss/train: 0.01031	Loss/test: 0.01237
Epoch: 10/100	Loss/train: 0.00982	Loss/test: 0.01212
Epoch: 11/100	Loss/train: 0.00999	Loss/test: 0.01186
Epoch: 12/100	Loss/train: 0.00941	Loss/test: 0.01187
Epoch: 13/100	Loss/train: 0.00938	Loss/test: 0.01177
Epoch: 14/100	Loss/train: 0.00966	Loss/test: 0.01180
Epoch: 15/100	Loss/train: 0.00970	Loss/test: 0.01163
Epoch: 16/100	Loss/train: 0.00932	Loss/test: 0.01154
E

SCORING

In [152]:
# Load the trained weights back into the model
# (model_sbert_trained refers to the same object as model_sbert, it is not a copy)
model_sbert_trained = model_sbert
# map_location restores the checkpoint onto the current device even when it was
# saved from a different one (e.g. another GPU index, or a CPU-only machine)
model_sbert_trained.load_state_dict(torch.load(save_path, map_location=device))

# Use the model to predict the test data
print('------------------------------------------------------------------')
print(f"\t\t\tScoring Essay: ")
print('------------------------------------------------------------------')
start_time_eval = time.time()
test_predictions_sbert = scoring(model_sbert_trained, test_loader_sbert)
end_time_eval = time.time()
print('Evaluation time:', end_time_eval - start_time_eval)

# store train_df, test_df and predictions
train_df_sbert = train_data
test_df_sbert = test_data
preds_sbert = test_predictions_sbert

------------------------------------------------------------------
			Scoring Essay: 
------------------------------------------------------------------
Evaluation time: 0.1551046371459961


# RESULT

In [153]:
def get_results_df(test_df, model_preds):
    """Attach the model predictions to the test frame, ordered by essay_id.

    Args:
        test_df: DataFrame with at least 'essay_id' and 'score' columns.
        model_preds: predictions in the same row order as `test_df`.

    Returns:
        A new DataFrame sorted by 'essay_id' with a fresh index, a 'prediction'
        column added, and 'score' moved to the last column.
    """
    # Align positionally: both sides get a 0..n-1 index before the join
    aligned = test_df.reset_index(drop=True)
    with_preds = aligned.join(pd.DataFrame(model_preds)).rename(columns={0: 'prediction'})

    ordered = with_preds.sort_values(by='essay_id').reset_index(drop=True)

    # Move 'score' to the end so it sits next to 'prediction'
    score_last = [name for name in ordered.columns if name != 'score'] + ['score']
    return ordered[score_last]

In [154]:
# Show the test essays with their normalised prediction and score, ordered by essay_id
get_results_df(test_df_sbert, preds_sbert)

Unnamed: 0,essay_id,essay_set,essay,prediction,score
0,73,1,"Dear The @ORGANIZATION1, ""@CAPS1, @CAPS1, @CAP...",0.725677,0.600000
1,79,1,"Computers have caused many people money, frien...",0.663973,0.800000
2,120,1,Dear @CAPS1: The effects that computers have o...,0.642397,0.600000
3,121,1,Do you think computers are helpful? Well you s...,0.634302,0.600000
4,142,1,As more and more people are becoming accustome...,0.697494,0.800000
...,...,...,...,...,...
146,21354,8,i woke up at @NUM1 from a cal from my cousin ...,0.604292,0.583333
147,21369,8,Laughter is the key I think that being happy ...,0.544167,0.483333
148,21407,8,The @CAPS1 of @CAPS2 @CAPS3 I think of laughte...,0.608010,0.666667
149,21509,8,So one day I was at home babysitting my little...,0.602508,0.500000


# DENORMALISASI

In [155]:
def inverse_normalize_score(score, min_max_range):
    """Map a normalised score back to its original range and round it.

    Args:
        score: normalised score.
        min_max_range: (min_score, max_score) of the original scale.

    Returns:
        The rounded value of score * (max_score - min_score) + min_score.
    """
    lower, upper = min_max_range
    return round(lower + score * (upper - lower))


def restore_original_scores(df, preds, min_max_ranges):
    """Denormalise predictions and actual scores to each essay set's original scale.

    Args:
        df: DataFrame with 'essay_set' and (normalised) 'score' columns.
        preds: normalised predictions, in the same row order as `df`.
        min_max_ranges: dict mapping essay_set id -> (min_score, max_score).

    Returns:
        A copy of `df` whose 'score' column holds the restored actual scores
        and whose 'prediction' column holds the restored predictions. The
        input frame is left untouched.

    Raises:
        AssertionError: if `preds` and `df` differ in length.
    """
    # Predictions must line up one-to-one with the rows
    assert len(preds) == len(df), "Length of predictions does not match length of dataframe"

    set_ids = df['essay_set'].values
    normalised_actuals = df['score'].values

    # Undo the per-set min-max normalisation for predictions and actual scores
    restored_preds = [
        inverse_normalize_score(value, min_max_ranges[set_id])
        for value, set_id in zip(preds, set_ids)
    ]
    restored_actuals = [
        inverse_normalize_score(value, min_max_ranges[set_id])
        for value, set_id in zip(normalised_actuals, set_ids)
    ]

    # Work on a copy so the caller's frame is not modified
    return df.copy().assign(prediction=restored_preds, score=restored_actuals)


In [156]:
# Restore the predicted and the actual scores to their original per-set ranges
restored_results_df_sbert = restore_original_scores(test_df_sbert, preds_sbert, min_max_ranges)

# Show the result
print("Restored Results:")
restored_results_df_sbert

Restored Results:


Unnamed: 0,essay_id,essay_set,essay,score,prediction
0,560,1,"Dear @CAPS1 Newspaper, @CAPS2 though computers...",8,9
1,1614,1,Dear @CAPS1 I think that computers are benefic...,9,8
2,1392,1,I think that computers have a good effect on p...,5,9
3,73,1,"Dear The @ORGANIZATION1, ""@CAPS1, @CAPS1, @CAP...",8,9
4,1055,1,"To whom it @MONTH1 concern, To many people it ...",8,8
...,...,...,...,...,...
146,21090,8,"The @CAPS1 of Laughter @CAPS2 friends and I, a...",37,41
147,21369,8,Laughter is the key I think that being happy ...,29,33
148,21407,8,The @CAPS1 of @CAPS2 @CAPS3 I think of laughte...,40,36
149,21204,8,It was the first day of sophomore year. I had...,31,41


# EVALUASI QWK

In [157]:
import numpy as np

def calculate_qwk(actuals, preds):
    """Quadratic weighted kappa (QWK) between two sequences of integer ratings.

    QWK = 1 - sum(W * O) / sum(W * E), where O is the observed confusion matrix,
    E the matrix expected from the two rating histograms, and
    W[i][j] = (i - j)^2 / (num_ratings - 1)^2 the quadratic disagreement weights.
    The rating range is taken from the smallest and largest value occurring in
    either sequence.

    Args:
        actuals: integer ratings assigned by the reference rater.
        preds: integer ratings to compare, same length as `actuals`.

    Returns:
        The kappa score (1.0 means perfect agreement). When only one distinct
        rating occurs in both sequences the weights are undefined (0/0); the
        sequences then agree completely and 1.0 is returned.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    actual_arr = np.asarray(actuals, dtype=np.int64).ravel()
    pred_arr = np.asarray(preds, dtype=np.int64).ravel()

    # zip() used to truncate mismatched inputs silently while the expectation was
    # still scaled by len(actuals); reject such input instead.
    if actual_arr.size != pred_arr.size:
        raise ValueError("actuals and preds must have the same length")
    if actual_arr.size == 0:
        raise ValueError("actuals and preds must not be empty")

    # Rating range covered by the data
    min_rating = min(actual_arr.min(), pred_arr.min())
    max_rating = max(actual_arr.max(), pred_arr.max())
    num_ratings = int(max_rating - min_rating + 1)

    # A single distinct rating means complete agreement; the weight formula would divide by zero
    if num_ratings == 1:
        return 1.0

    # Weight matrix W
    positions = np.arange(num_ratings)
    weight_mat = (positions[:, None] - positions[None, :]) ** 2 / (num_ratings - 1) ** 2

    # Observed matrix O (np.add.at accumulates repeated index pairs correctly)
    conf_mat = np.zeros((num_ratings, num_ratings))
    np.add.at(conf_mat, (actual_arr - min_rating, pred_arr - min_rating), 1)

    # Expected matrix E from the marginal histograms
    actual_hist = conf_mat.sum(axis=1)
    pred_hist = conf_mat.sum(axis=0)
    expected_mat = np.outer(actual_hist, pred_hist) / actual_arr.size

    # Weighted observed vs. expected disagreement
    observed_disagreement = np.sum(weight_mat * conf_mat)
    expected_disagreement = np.sum(weight_mat * expected_mat)

    return 1 - (observed_disagreement / expected_disagreement)

In [158]:
# Compute the QWK separately for every essay set
def calculate_qwk_per_set(df):
    """Return the quadratic weighted kappa of each essay set in `df`.

    Args:
        df: DataFrame with 'essay_set', 'score' and 'prediction' columns.

    Returns:
        dict mapping 'Set <essay_set>' to that set's QWK, in the order in which
        the sets first appear in `df`.
    """
    def _qwk_for(set_id):
        # Rows of one essay set, with both columns cast to integer ratings
        rows = df[df['essay_set'] == set_id]
        return calculate_qwk(rows['score'].astype(int), rows['prediction'].astype(int))

    return {f'Set {set_id}': _qwk_for(set_id) for set_id in df['essay_set'].unique()}


In [159]:
# Compute the QWK of each essay set
qwk_per_set_sbert = calculate_qwk_per_set(restored_results_df_sbert)

# Show the QWK of each essay set
for essay_set, qwk_score in qwk_per_set_sbert.items():
    print(f"Quadratic Weighted Kappa Score for {essay_set}: {qwk_score}")

# Unweighted mean of the per-set QWK values (every set counts equally, whatever its size)
average_qwk_sbert = np.mean(list(qwk_per_set_sbert.values()))

# Show the average QWK
print("Average Quadratic Weighted Kappa Score:", average_qwk_sbert)

Quadratic Weighted Kappa Score for Set 1: 0.2589321557607386
Quadratic Weighted Kappa Score for Set 2: 0.1071428571428572
Quadratic Weighted Kappa Score for Set 7: 0.5467625899280576
Quadratic Weighted Kappa Score for Set 8: 0.1243561442236939
Average Quadratic Weighted Kappa Score: 0.2592984367638368
