# DATA PREPROCESSING

In [1]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
import time
from transformers import BertTokenizer, BertModel
from tqdm.auto import tqdm

In [2]:
# Set device
# Prefer GPU 3 when it exists. Fall back to the current CUDA device on machines with
# fewer GPUs, and to the CPU when CUDA is unavailable, so this cell never crashes.
PREFERRED_GPU_INDEX = 3
if torch.cuda.is_available():
    if PREFERRED_GPU_INDEX < torch.cuda.device_count():
        torch.cuda.set_device(PREFERRED_GPU_INDEX)
    device = "cuda:%s" % torch.cuda.current_device()
else:
    device = "cpu"
device

'cuda:3'

LOAD DATASET

In [3]:
# Load dataset .tsv
kaggle_dataset  = pd.read_csv('./training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")
kaggle_dataset

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,17,18,,35,,,,...,4.0,4.0,4.0,3.0,,,,,,
12972,21628,8,I never understood the meaning laughter is th...,15,17,,32,,,,...,4.0,4.0,4.0,3.0,,,,,,
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",20,26,40.0,40,,,,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
12974,21630,8,Trippin' on fen...,20,20,,40,,,,...,4.0,4.0,4.0,4.0,,,,,,


DATA CLEANSING

In [4]:
# Data Cleansing Function
def clean_dataset(input_dataset):
  """Keep only the columns used for scoring and drop rows with missing values.

  The result has the columns 'essay_id', 'essay_set', 'essay' and 'score',
  where 'score' is taken from the input's 'domain1_score' column. Missing-value
  counts are printed before and after the rows are dropped.

  Args:
    input_dataset: DataFrame containing at least the columns 'essay_id',
      'essay_set', 'essay' and 'domain1_score'.

  Returns:
    A new DataFrame without missing values; the original row index labels
    are preserved.
  """
  # Output column name -> source column name in the raw frame
  column_sources = {
    'essay_id': 'essay_id',
    'essay_set': 'essay_set',
    'essay': 'essay',
    'score': 'domain1_score',
  }
  selected = pd.DataFrame(
    {target: input_dataset[source] for target, source in column_sources.items()}
  )

  # Report missing values before cleaning
  print("Jumlah missing values:")
  print(selected.isnull().sum())

  # Drop incomplete rows and report again
  without_missing = selected.dropna()
  print("\nJumlah missing values setelah data dibersihkan:")
  print(without_missing.isnull().sum())

  print("\nDataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:")

  return without_missing

In [5]:
dataset_cleaned = clean_dataset(kaggle_dataset)
dataset_cleaned

Jumlah missing values:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Jumlah missing values setelah data dibersihkan:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Dataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:


Unnamed: 0,essay_id,essay_set,essay,score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35
12972,21628,8,I never understood the meaning laughter is th...,32
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40
12974,21630,8,Trippin' on fen...,40


SCORE NORMALIZATION

In [6]:
# Score range of each essay set: essay_set id -> (minimum score, maximum score)
min_max_ranges = {
    1: (2, 12),
    2: (1, 6),
    3: (0, 3),
    4: (0, 3),
    5: (0, 4),
    6: (0, 4),
    7: (0, 30),
    8: (0, 60)
}

In [7]:
#Score Normalization Function
def normalize_score(dataset, min_max_ranges):
    """Min-max normalize the 'score' column per essay set.

    Each score is mapped to (score - min) / (max - min) using the range of the
    row's essay set. The input frame is NOT modified: a normalized copy is
    returned, so re-running the calling cell cannot normalize the scores twice.

    Args:
        dataset: DataFrame with the columns 'essay_set' and 'score'.
        min_max_ranges: dict mapping essay_set -> (min_score, max_score).

    Returns:
        A copy of `dataset` whose 'score' column holds the normalized scores.
        Rows whose essay set has no entry in `min_max_ranges` get NaN.

    Raises:
        ValueError: if a range has max_score == min_score (division by zero).
    """
    lower_bounds = {}
    widths = {}
    for essay_set, (min_score, max_score) in min_max_ranges.items():
        if max_score == min_score:
            raise ValueError(
                f"Score range for essay set {essay_set} has zero width: "
                f"({min_score}, {max_score})"
            )
        lower_bounds[essay_set] = min_score
        widths[essay_set] = max_score - min_score

    # Work on a copy so the caller's frame keeps its original scores
    normalized = dataset.copy()
    set_ids = normalized['essay_set']

    # Vectorized min-max normalization; lookups are aligned on the row index
    normalized['score'] = (normalized['score'] - set_ids.map(lower_bounds)) / set_ids.map(widths)

    return normalized

In [8]:
dataset_normalized = normalize_score(dataset_cleaned, min_max_ranges)
dataset_normalized

Unnamed: 0,essay_id,essay_set,essay,score
0,1,1,"Dear local newspaper, I think effects computer...",0.600000
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",0.700000
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",0.500000
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",0.800000
4,5,1,"Dear @LOCATION1, I know having computers has a...",0.600000
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,0.583333
12972,21628,8,I never understood the meaning laughter is th...,0.533333
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",0.666667
12974,21630,8,Trippin' on fen...,0.666667


DATA SPLITTING

In [9]:
#Data Splitting Function
def data_splitting(dataset, train_size=0.7, random_state=42):
    """Split the dataset into train and test parts, separately for each essay set.

    Every essay set is split on its own, so both parts contain each set in the
    requested proportion. The per-set parts are then concatenated.

    Args:
        dataset: DataFrame with the columns 'essay_id', 'essay_set', 'essay'
            and 'score'.
        train_size: fraction of each essay set that goes to the training part.
        random_state: seed passed to train_test_split for reproducibility.

    Returns:
        (train_data, test_data): two DataFrames with a fresh 0..n-1 index and
        the columns 'essay_id', 'essay_set', 'essay', 'score'.
    """
    columns = ['essay_id', 'essay_set', 'essay', 'score']

    train_parts = []
    test_parts = []

    # Essay sets in order of first appearance
    for essay_set in dataset['essay_set'].unique():
        # Build the mask from the frame that was passed in, not from a notebook
        # global, so the function works for any input frame
        subset = dataset.loc[dataset['essay_set'] == essay_set, columns]

        train_part, test_part = train_test_split(
            subset, train_size=train_size, random_state=random_state
        )
        train_parts.append(train_part)
        test_parts.append(test_part)

    # Combine the per-set parts into single train and test frames
    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    return train_data, test_data

In [10]:
train_data, test_data = data_splitting(dataset_normalized)

In [11]:
# Cetak data latih
print("Train Data:")
train_data

Train Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,1514,1,"Dear Local Newspaper, In my opinion I support ...",0.600000
1,85,1,"Dear local newspaper, Computers are they a goo...",0.800000
2,1744,1,"Dear Local newspaper, @CAPS1 you have a comput...",0.900000
3,1004,1,"Dear @CAPS1, @CAPS2 people have computers in t...",0.400000
4,1580,1,"Dear @CITY1 @ORGANIZATION1, I am a @ORGANIZATI...",0.600000
...,...,...,...,...
9077,20806,8,There are a couple things that can lead stran...,0.583333
9078,20849,8,In a relationship you should be able to trust...,0.733333
9079,21055,8,Laughter is a huge part oh building friendshi...,0.583333
9080,21258,8,I think that laughter is a key element to any...,0.666667


In [12]:
# Cetak data uji
print("Test Data:")
test_data

Test Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,827,1,I think computers have a postitive affect on p...,0.400000
1,1477,1,I blive that computers have a lot of effects o...,0.400000
2,234,1,Many people think that computers are not a goo...,0.600000
3,801,1,"Dear Newspaper people, @CAPS1 you might heard ...",0.700000
4,780,1,More and more people are using computers on a ...,0.700000
...,...,...,...,...
3889,20920,8,In my storie I am going to tell you about a f...,0.533333
3890,21107,8,Laughter is an essential component to any rel...,0.666667
3891,21514,8,I think laughter should be a huge part in eve...,0.566667
3892,21309,8,Some people say that laughter is the best med...,0.633333


# EMBEDDING

Data Loader - Batch Size

In [13]:
# Fungsi untuk membuat kamus yang memetakan id ke suatu indeks
def get_id2emb(ids):
  """Map each essay id to its position in `ids`.

  Args:
    ids: object with a `to_list()` method (e.g. a pandas Series of essay ids).

  Returns:
    dict mapping id -> zero-based position. If an id occurs more than once,
    its last position is kept.
  """
  position_by_id = {essay_id: position for position, essay_id in enumerate(ids.to_list())}

  print('Essay ids to embeddings dictionary created.')

  return position_by_id

In [14]:
id2emb_train = get_id2emb(train_data['essay_id'])
id2emb_test = get_id2emb(test_data['essay_id'])

Essay ids to embeddings dictionary created.
Essay ids to embeddings dictionary created.


In [15]:
def get_loader(df, id2emb, essay_embeddings, batch_size, shuffle):
    """Build a DataLoader of (embedding, score) pairs for the essays in `df`.

    Args:
        df: DataFrame with the columns 'essay_id' and 'score'.
        id2emb: dict mapping essay id -> row position in `essay_embeddings`.
        essay_embeddings: indexable collection of embedding rows.
            NOTE(review): when this is an np.matrix each looked-up row is 2-D
            (1 x dim), so the stacked tensor gets an extra singleton axis --
            confirm against the consumers of the loader.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data.

    Returns:
        torch DataLoader over float32 tensors, created with two worker processes.
    """
    # Look up each essay's embedding row, in the row order of df
    rows = []
    for essay_id in df['essay_id']:
        rows.append(essay_embeddings[id2emb[essay_id]])
    features = torch.from_numpy(np.array(rows)).float()

    # Targets come straight from the 'score' column
    targets = torch.from_numpy(np.array(df['score'])).float()

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
    )

# EMBEDDING BERT

LOAD PRETRAINED MODEL BERT

In [16]:
# Load the pretrained BERT model (moved to the selected device) and its tokenizer
bert_model = BertModel.from_pretrained("bert-base-cased").to(device)
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')



WORD EMBEDDING

In [17]:
def mean_encoding(essay_list, tokenizer, model):
  """Encode each essay as the mean of its token embeddings.

  Every essay is tokenized on its own (with truncation), passed through
  `model` without gradient tracking, and the first element of the model
  output is averaged over the token axis.

  Args:
    essay_list: iterable of essay strings.
    tokenizer: callable returning model inputs that support `.to(device)`.
    model: callable taking the tokenizer output as keyword arguments; its
      first output is used as the token embeddings.

  Returns:
    np.matrix with one row per essay. Uses the notebook-level `device`.
  """
  print('Encoding essay embeddings:')

  def encode_one(essay):
    # Tokenize a single essay and move the tensors to the target device
    model_inputs = tokenizer(essay, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
      token_states = model(**model_inputs)[0]
    # Drop singleton axes, then average over the token axis (axis 0)
    token_matrix = np.matrix(token_states.squeeze().cpu())
    return np.squeeze(np.asarray(token_matrix.mean(0)))

  essay_vectors = [encode_one(essay) for essay in tqdm(essay_list)]

  return np.matrix(essay_vectors)

In [18]:
# Store the embeddings produced by BERT for the training essays
train_embeddings_bert = mean_encoding(train_data['essay'], tokenizer_bert, bert_model)

Encoding essay embeddings:


  0%|          | 0/9082 [00:00<?, ?it/s]

In [19]:
test_embeddings_bert = mean_encoding(test_data['essay'], tokenizer_bert, bert_model)

Encoding essay embeddings:


  0%|          | 0/3894 [00:00<?, ?it/s]

In [20]:
print(train_embeddings_bert.shape)

(9082, 768)


In [21]:
print(test_embeddings_bert.shape)

(3894, 768)


In [22]:
# np.shape(train_embeddings_bert)
train_embeddings_bert

matrix([[ 0.35910174, -0.17483714, -0.247729  , ...,  0.07933508,
          0.05804938,  0.1356201 ],
        [ 0.3981391 , -0.00719295, -0.27228376, ..., -0.07918605,
         -0.05534224,  0.12167048],
        [ 0.3225785 , -0.07413144, -0.01365663, ..., -0.16281046,
         -0.03161112,  0.260585  ],
        ...,
        [ 0.19724223, -0.03712742, -0.32213733, ...,  0.06405073,
          0.10556314,  0.15283988],
        [ 0.17465864, -0.01310472, -0.4814403 , ...,  0.11352181,
          0.06719531,  0.10060339],
        [ 0.3069924 ,  0.02839362, -0.18555646, ...,  0.01189573,
          0.10440755,  0.14193642]], dtype=float32)

In [23]:
test_embeddings_bert 

matrix([[ 5.2361274e-01,  6.5579027e-02, -6.9257043e-02, ...,
          8.5451230e-02, -4.7196941e-03,  3.9873242e-01],
        [ 2.0980513e-01, -5.1973633e-02, -3.1645828e-01, ...,
          9.3155988e-02,  1.4282057e-02,  2.1754856e-01],
        [ 4.7523963e-01, -4.7867130e-02, -2.6258287e-01, ...,
         -5.4310139e-02,  2.5713788e-02,  2.8365791e-01],
        ...,
        [ 1.7899011e-01, -1.9890459e-03, -4.1474131e-01, ...,
         -1.9767616e-02,  1.2144737e-01,  7.9925679e-02],
        [ 1.2217074e-01,  2.1570362e-04, -4.4737700e-01, ...,
          9.5025785e-02, -2.7180389e-02,  3.0119899e-03],
        [ 2.4533097e-01, -6.4652354e-02, -2.7101031e-01, ...,
          2.4937829e-02,  2.1111002e-02,  1.7203335e-01]], dtype=float32)

# REGRESI FCNN

INISIALISASI FCNN

In [374]:
# Menginisialisasi FCNN
class FCNN(nn.Module):
    """Fully connected regression network: input -> 397 -> 32 -> 1, sigmoid output.

    Dropout (p=0.3) follows each of the two hidden ReLU layers. The final
    sigmoid keeps every prediction strictly between 0 and 1.
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: number of features of each input vector.
        """
        super().__init__()
        # First hidden layer
        self.fc1 = nn.Linear(input_size, 397)
        self.dropout1 = nn.Dropout(0.3)
        # Second hidden layer
        self.fc2 = nn.Linear(397, 32)
        self.dropout2 = nn.Dropout(0.3)
        # Output layer producing one value per sample
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the feed-forward pass; the last axis of the result has size 1."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.sigmoid(self.fc3(hidden))

TESTING FUNCTION

In [375]:
def test_step(trained_model, cost_function, test_loader):
    trained_model.eval() # Mengatur model ke mode evaluasi (eval mode)
    test_loss = 0.
    samples = 0.
    
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(test_loader):
            
            # Menghapus dimensi yang tidak perlu dari inputs dan mentransfer ke device
            inputs = inputs.squeeze(dim=1).to(device)
            
            # Menyesuaikan dimensi targets dan mentransfer ke device
            targets = targets.reshape(targets.shape[0], 1).to(device)
            
            # Menghitung output model (prediksi) dari inputs
            outputs = trained_model(inputs).reshape(-1, 1)
            
            # Menghitung nilai loss dengan membandingkan outputs dengan targets
            loss = cost_function(outputs, targets)
            
            # Menghitung jumlah sampel dalam batch
            samples += inputs.shape[0]
            
            # Menambahkan nilai loss dari batch ke test_loss
            test_loss += loss.item() * inputs.shape[0]
            
    # Menghitung rata-rata loss di seluruh batch (samples)
    avg_loss = test_loss / samples
    
    # Mengembalikan nilai rata-rata loss
    return avg_loss


TRAINING FUNCTION

In [376]:
# Contoh fungsi untuk training model
def training_step(model, cost_function, train_loader, test_loader, save_path, num_epochs, lr):
    """Train the model with Adam, evaluate after every epoch, then save the weights.

    Args:
        model: the torch module to train.
        cost_function: callable (outputs, targets) -> scalar loss tensor.
        train_loader: iterable of (inputs, targets) training batches.
        test_loader: loader passed to `test_step` after each epoch.
        save_path: file path for the model's state_dict after the last epoch.
        num_epochs: number of passes over `train_loader`.
        lr: learning rate for the Adam optimizer.

    Returns:
        (train_losses, test_losses): per-epoch average losses as lists of floats.
        Uses the notebook-level `device`.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)

    train_losses, test_losses = [], []

    for epoch_index in range(num_epochs):
        model.train()  # training mode (test_step switches to eval each epoch)

        # Per-epoch accumulators for the sample-weighted loss
        epoch_loss_sum = 0.
        epoch_samples = 0.

        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_len = batch_inputs.shape[0]

            # Targets as a column vector on the same device
            batch_targets = batch_targets.reshape(batch_targets.shape[0], 1).to(device)

            # Clear gradients left over from the previous iteration
            optimizer.zero_grad()

            # Forward pass; predictions reshaped to a column vector
            batch_outputs = model(batch_inputs).reshape(-1, 1)
            batch_loss = cost_function(batch_outputs, batch_targets)

            # Backpropagate and update the parameters
            batch_loss.backward()
            optimizer.step()

            epoch_loss_sum += batch_loss.item() * batch_len
            epoch_samples += batch_len

        # Average training loss of this epoch
        train_loss = epoch_loss_sum / epoch_samples

        # Evaluate on the test loader
        test_loss = test_step(model, cost_function, test_loader)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        print('Epoch: {:}/{:}\tLoss/train: {:.5f}\tLoss/test: {:.5f}'.format(epoch_index + 1, num_epochs, train_loss, test_loss))

    # Save the model after training
    torch.save(model.state_dict(), save_path)
    print(f"Model saved at {save_path}")

    return train_losses, test_losses

SCORING FUNCTION

In [377]:
#Fungsi untuk melakukan prediksi pada data uji
def scoring(trained_model, test_loader):
    """Predict a score for every sample in the loader.

    The model is moved to the notebook-level `device` and switched to
    evaluation mode, so dropout is disabled whatever mode it was in before.

    Args:
        trained_model: the trained torch module.
        test_loader: iterable of (inputs, targets) batches; targets are ignored.

    Returns:
        A flat list with one numpy scalar per sample, in loader order.
    """
    trained_model.to(device)  # Move the model to the correct device
    trained_model.eval()      # Disable dropout so predictions are deterministic
    predictions = []

    with torch.no_grad():
        for inputs, _ in test_loader:
            outputs = trained_model(inputs.to(device))

            # Flatten to 1-D. Unlike squeeze(), reshape(-1) still yields a 1-D
            # array for a batch of one sample, which extend() can iterate over.
            predictions.extend(outputs.reshape(-1).cpu().numpy())

    return predictions

# MAIN

In [378]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAINING

In [379]:
# Hyperparameters
input_size = 768
batch_size = 128
epochs = 30
lr = 2e-5

# TRAINING
# Initialise the model and the loss function
model_bert = FCNN(input_size).to(device)  # Replace with your own model
cost_function = torch.nn.MSELoss()

# Dataloaders: training batches are shuffled, test batches keep their order
train_loader_bert = get_loader(train_data, id2emb_train, train_embeddings_bert, batch_size, shuffle=True)
test_loader_bert = get_loader(test_data, id2emb_test, test_embeddings_bert, batch_size, shuffle=False)

print('------------------------------------------------------------------')
print(f"\t\t\tTraining model BERT: ")
print('------------------------------------------------------------------')
# Path where the model is saved and later loaded from
save_path = 'model_bert_v7-7.26.pth'

# Time the whole training run (wall-clock seconds)
start_time_bert = time.time()
train_loss_bert, test_loss_bert = training_step(model_bert, cost_function, train_loader_bert, test_loader_bert, save_path, epochs, lr)
end_time_bert = time.time()

print('Training time:', end_time_bert - start_time_bert)

------------------------------------------------------------------
			Training model BERT: 
------------------------------------------------------------------
Epoch: 1/30	Loss/train: 0.05513	Loss/test: 0.05150
Epoch: 2/30	Loss/train: 0.05117	Loss/test: 0.04722
Epoch: 3/30	Loss/train: 0.04650	Loss/test: 0.04159
Epoch: 4/30	Loss/train: 0.04127	Loss/test: 0.03678
Epoch: 5/30	Loss/train: 0.03691	Loss/test: 0.03309
Epoch: 6/30	Loss/train: 0.03382	Loss/test: 0.03027
Epoch: 7/30	Loss/train: 0.03133	Loss/test: 0.02820
Epoch: 8/30	Loss/train: 0.02978	Loss/test: 0.02666
Epoch: 9/30	Loss/train: 0.02813	Loss/test: 0.02555
Epoch: 10/30	Loss/train: 0.02748	Loss/test: 0.02473
Epoch: 11/30	Loss/train: 0.02654	Loss/test: 0.02412
Epoch: 12/30	Loss/train: 0.02564	Loss/test: 0.02362
Epoch: 13/30	Loss/train: 0.02540	Loss/test: 0.02340
Epoch: 14/30	Loss/train: 0.02480	Loss/test: 0.02293
Epoch: 15/30	Loss/train: 0.02418	Loss/test: 0.02257
Epoch: 16/30	Loss/train: 0.02415	Loss/test: 0.02234
Epoch: 17/30	Loss/

SCORING

In [380]:
# Load the trained weights back from disk.
# NOTE: model_bert_trained is another name for the same object as model_bert, not a copy.
model_bert_trained = model_bert 
model_bert_trained.load_state_dict(torch.load(save_path))

# Use the model to predict on the test data
print('------------------------------------------------------------------')
print(f"\t\t\tScoring Essay: ")
print('------------------------------------------------------------------')
start_time_eval = time.time()
test_predictions_bert = scoring(model_bert_trained, test_loader_bert)
end_time_eval = time.time()
print('Evaluation time:', end_time_eval - start_time_eval)

# store train_df, test_df and predictions
train_df_bert = train_data
test_df_bert = test_data
preds_bert = test_predictions_bert

------------------------------------------------------------------
			Scoring Essay: 
------------------------------------------------------------------
Evaluation time: 0.17862510681152344


# RESULT

In [381]:
def get_results_df(test_df, model_preds):
  """Attach predictions to the test frame and sort the result by essay id.

  Predictions are matched to rows by position: `model_preds[i]` belongs to the
  i-th row of `test_df`.

  Args:
    test_df: DataFrame with at least the columns 'essay_id' and 'score'.
    model_preds: sequence of predictions, one per row of `test_df`.

  Returns:
    New DataFrame sorted by 'essay_id' with a fresh index, a 'prediction'
    column, and 'score' moved to the last column.
  """
  # A one-column frame of predictions; its single column is labelled 0
  predictions = pd.DataFrame(model_preds)

  # Join by position, then give the prediction column its final name
  joined = test_df.reset_index(drop=True).join(predictions)
  joined = joined.rename(columns={0: 'prediction'})

  ordered = joined.sort_values(by='essay_id').reset_index(drop=True)

  # Popping and re-inserting 'score' moves it to the last column
  ordered['score'] = ordered.pop('score')

  return ordered

In [382]:
get_results_df(test_df_bert, preds_bert)

Unnamed: 0,essay_id,essay_set,essay,prediction,score
0,16,1,"Dear @ORGANIZATION1, The computer blinked to l...",0.786086,1.000000
1,24,1,"Dear local newspaper, I've heard that not many...",0.798464,0.900000
2,30,1,"Dear Newspaper People, I think that computers ...",0.520079,0.600000
3,31,1,I agree that computers deffinately are an adva...,0.761780,0.800000
4,33,1,"Dear, @ORGANIZATION1 I think the effects that ...",0.495783,0.400000
...,...,...,...,...,...
3889,21613,8,"Before my best friend moved away, we would st...",0.737640,0.666667
3890,21615,8,@ORGANIZATION1 ...,0.610280,0.533333
3891,21626,8,In most stories mothers and daughters are eit...,0.672252,0.583333
3892,21628,8,I never understood the meaning laughter is th...,0.632766,0.533333


# DENORMALISASI

In [383]:
def inverse_normalize_score(score, min_max_range):
    """Map a normalized score back to its original range, rounded to an integer.

    Args:
        score: normalized score (0 maps to the minimum, 1 to the maximum).
        min_max_range: (min_score, max_score) of the original scale.

    Returns:
        The restored score rounded with the built-in `round`.
    """
    lowest, highest = min_max_range
    span = highest - lowest

    # Inverse of min-max normalization: score * (max - min) + min
    return round(score * span + lowest)

def restore_original_scores(df, preds, min_max_ranges):
    """Return a copy of `df` with scores and predictions on their original scale.

    Predictions are matched to rows by position, and each value is restored
    with the range of that row's essay set.

    Args:
        df: DataFrame with the columns 'score' (normalized) and 'essay_set'.
        preds: sequence of normalized predictions, one per row of `df`.
        min_max_ranges: dict mapping essay_set -> (min_score, max_score).

    Returns:
        A copy of `df` whose 'score' column is restored in place and with a
        'prediction' column holding the restored predictions. `df` itself is
        left untouched.

    Raises:
        AssertionError: if `preds` and `df` differ in length.
    """
    # Work on a copy so the caller's frame is not modified
    restored = df.copy()

    normalized_actuals = restored['score'].values
    set_ids = restored['essay_set'].values

    # One prediction per row is required
    assert len(preds) == len(restored), "Length of predictions does not match length of dataframe"

    def to_original_scale(values):
        # Restore each value with the range of the essay set in the same row
        return [
            inverse_normalize_score(value, min_max_ranges[set_id])
            for value, set_id in zip(values, set_ids)
        ]

    restored['prediction'] = to_original_scale(preds)
    restored['score'] = to_original_scale(normalized_actuals)

    return restored

In [384]:
# Restore the predicted and actual scores to their original ranges
restored_results_df_bert = restore_original_scores(test_df_bert, preds_bert, min_max_ranges)

# Show the result
print("Restored Results:")
restored_results_df_bert

Restored Results:


Unnamed: 0,essay_id,essay_set,essay,score,prediction
0,827,1,I think computers have a postitive affect on p...,6,6
1,1477,1,I blive that computers have a lot of effects o...,6,7
2,234,1,Many people think that computers are not a goo...,8,9
3,801,1,"Dear Newspaper people, @CAPS1 you might heard ...",9,8
4,780,1,More and more people are using computers on a ...,9,9
...,...,...,...,...,...
3889,20920,8,In my storie I am going to tell you about a f...,32,36
3890,21107,8,Laughter is an essential component to any rel...,40,49
3891,21514,8,I think laughter should be a huge part in eve...,34,36
3892,21309,8,Some people say that laughter is the best med...,38,41


# EVALUASI QWK

In [385]:
import numpy as np

def calculate_qwk(actuals, preds):
    """Compute the Quadratic Weighted Kappa (QWK) between two integer ratings.

    QWK = 1 - sum(W * O) / sum(W * E), where O is the observed confusion
    matrix, E the matrix expected from the two marginal histograms, and
    W[i][j] = (i - j)^2 / (k - 1)^2 for k distinct possible ratings.

    Args:
        actuals: one-dimensional sequence of integer ratings (list, array, Series).
        preds: one-dimensional sequence of integer ratings of the same length.

    Returns:
        The kappa score: 1.0 for perfect agreement, about 0 for chance-level
        agreement, negative for worse than chance. When both sequences contain
        only one and the same rating value the weights are undefined; agreement
        is then complete and 1.0 is returned.

    Raises:
        ValueError: if the inputs are empty, not one-dimensional, or differ in length.
    """
    actual_arr = np.asarray(actuals)
    pred_arr = np.asarray(preds)

    if actual_arr.ndim != 1 or actual_arr.shape != pred_arr.shape:
        raise ValueError("actuals and preds must be one-dimensional and have the same length")
    if actual_arr.size == 0:
        raise ValueError("cannot compute QWK for empty inputs")

    # Rating range covered by either sequence
    min_rating = min(actual_arr.min(), pred_arr.min())
    max_rating = max(actual_arr.max(), pred_arr.max())
    num_ratings = max_rating - min_rating + 1

    # A single rating value means complete agreement; the weight formula would divide by zero
    if num_ratings == 1:
        return 1.0

    # Weight matrix W: squared distance between ratings, scaled to [0, 1]
    positions = np.arange(num_ratings)
    weight_mat = (positions[:, None] - positions[None, :]) ** 2 / (num_ratings - 1) ** 2

    # Observed matrix O; np.add.at accumulates repeated (actual, pred) pairs correctly
    conf_mat = np.zeros((num_ratings, num_ratings))
    np.add.at(conf_mat, (actual_arr - min_rating, pred_arr - min_rating), 1)

    # Expected matrix E from the marginal histograms
    actual_hist = conf_mat.sum(axis=1)
    pred_hist = conf_mat.sum(axis=0)
    expected_mat = np.outer(actual_hist, pred_hist) / actual_arr.size

    # With more than one rating value the denominator is always positive
    observed_disagreement = np.sum(weight_mat * conf_mat)
    expected_disagreement = np.sum(weight_mat * expected_mat)

    return 1 - (observed_disagreement / expected_disagreement)

In [386]:
# Fungsi untuk menghitung QWK per set
def calculate_qwk_per_set(df):
    """Compute the QWK separately for every essay set in `df`.

    Args:
        df: DataFrame with the columns 'essay_set', 'score' and 'prediction'.

    Returns:
        dict mapping 'Set <essay_set>' -> QWK of that set, in the order the
        essay sets first appear in `df`.
    """
    def qwk_for(essay_set):
        # Rows of this essay set, with both score columns cast to int
        rows = df[df['essay_set'] == essay_set]
        return calculate_qwk(rows['score'].astype(int), rows['prediction'].astype(int))

    return {
        f'Set {essay_set}': qwk_for(essay_set)
        for essay_set in df['essay_set'].unique()
    }


In [387]:
# Compute the QWK for each essay set
qwk_per_set_bert = calculate_qwk_per_set(restored_results_df_bert)

# Show the QWK of each set
for essay_set, qwk_score in qwk_per_set_bert.items():
    print(f"Quadratic Weighted Kappa Score for {essay_set}: {qwk_score}")

# Unweighted mean of the per-set QWK values
average_qwk_bert = np.mean(list(qwk_per_set_bert.values()))

# Show the average QWK
print("Average Quadratic Weighted Kappa Score:", average_qwk_bert)

Quadratic Weighted Kappa Score for Set 1: 0.7131751775901354
Quadratic Weighted Kappa Score for Set 2: 0.5228666326009198
Quadratic Weighted Kappa Score for Set 3: 0.6226772433586096
Quadratic Weighted Kappa Score for Set 4: 0.6793701322010034
Quadratic Weighted Kappa Score for Set 5: 0.7426331258411087
Quadratic Weighted Kappa Score for Set 6: 0.6714783474365356
Quadratic Weighted Kappa Score for Set 7: 0.7814468118263538
Quadratic Weighted Kappa Score for Set 8: 0.4899819181713557
Average Quadratic Weighted Kappa Score: 0.6529536736282529
