# DATA PREPROCESSING

In [2]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score as kappa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import time

In [3]:
# Set device: prefer GPU index 1 (the original setup) but degrade gracefully.
# Calling torch.cuda.set_device() without checking availability raises on
# CPU-only machines, and an out-of-range index raises on single-GPU machines.
PREFERRED_GPU_INDEX = 1
if torch.cuda.is_available():
    # Fall back to GPU 0 when the preferred index does not exist on this host
    gpu_index = PREFERRED_GPU_INDEX if PREFERRED_GPU_INDEX < torch.cuda.device_count() else 0
    torch.cuda.set_device(gpu_index)
    device = "cuda:%s" % torch.cuda.current_device()
else:
    device = "cpu"
device

'cuda:1'

LOAD DATASET

In [4]:
# Load the essay dataset from a tab-separated file, decoded as ISO-8859-1 (Latin-1)
kaggle_dataset  = pd.read_csv('./training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")
# Show the loaded frame as the cell output
kaggle_dataset

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,17,18,,35,,,,...,4.0,4.0,4.0,3.0,,,,,,
12972,21628,8,I never understood the meaning laughter is th...,15,17,,32,,,,...,4.0,4.0,4.0,3.0,,,,,,
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",20,26,40.0,40,,,,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
12974,21630,8,Trippin' on fen...,20,20,,40,,,,...,4.0,4.0,4.0,4.0,,,,,,


DATA CLEANSING

In [5]:
# Data cleansing helper
def clean_dataset(input_dataset):
  """Keep the columns needed for scoring and drop rows with missing values.

  The result has the columns essay_id, essay_set, essay and score, where
  score is taken from the input's domain1_score column. Missing-value counts
  are printed before and after the rows with nulls are dropped.

  Args:
    input_dataset: DataFrame with at least the columns essay_id, essay_set,
      essay and domain1_score.

  Returns:
    A new DataFrame restricted to the four columns above, without null rows.
  """
  # Output column name -> source column name
  column_sources = {
    'essay_id': 'essay_id',
    'essay_set': 'essay_set',
    'essay': 'essay',
    'score': 'domain1_score',
  }
  dataset = pd.DataFrame(
    {target: input_dataset[source] for target, source in column_sources.items()}
  )

  # Report how many values are missing per column
  print("Jumlah missing values:")
  print(dataset.isnull().sum())

  # Drop every row containing a missing value and report again
  dataset_cleaned = dataset.dropna()
  print("\nJumlah missing values setelah data dibersihkan:")
  print(dataset_cleaned.isnull().sum())

  print("\nDataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:")

  return dataset_cleaned

In [6]:
# Reduce the raw frame to essay_id / essay_set / essay / score and drop rows with missing values
dataset_cleaned = clean_dataset(kaggle_dataset)
dataset_cleaned

Jumlah missing values:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Jumlah missing values setelah data dibersihkan:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Dataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:


Unnamed: 0,essay_id,essay_set,essay,score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35
12972,21628,8,I never understood the meaning laughter is th...,32
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40
12974,21630,8,Trippin' on fen...,40


SCORE NORMALIZATION

In [7]:
# Score range of each essay set, keyed by essay_set id: (minimum score, maximum score)
min_max_ranges = {
    1: (2, 12),
    2: (1, 6),
    3: (0, 3),
    4: (0, 3),
    5: (0, 4),
    6: (0, 4),
    7: (0, 30),
    8: (0, 60)
}

In [8]:
# Score normalization function
def normalize_score(dataset, min_max_ranges):
    """Min-max normalize the 'score' column per essay set.

    Each score is rescaled with the (min, max) range of its essay set:
    (score - min) / (max - min).

    The input frame is NOT modified; a normalized copy is returned. Mutating
    the input in place made the notebook cell non-idempotent (running it twice
    normalized already-normalized scores).

    Args:
        dataset: DataFrame with 'essay_set' and 'score' columns.
        min_max_ranges: dict mapping essay_set id -> (min_score, max_score).

    Returns:
        A new DataFrame with the same columns, where 'score' holds the
        normalized value. Rows whose essay_set is missing from
        min_max_ranges get NaN.
    """
    normalized = dataset.copy()

    # Look up each row's range bounds; unknown essay sets map to NaN
    set_ids = normalized['essay_set']
    min_scores = set_ids.map({key: bounds[0] for key, bounds in min_max_ranges.items()})
    max_scores = set_ids.map({key: bounds[1] for key, bounds in min_max_ranges.items()})

    # Vectorized min-max normalization
    normalized['score'] = (normalized['score'] - min_scores) / (max_scores - min_scores)

    return normalized

In [9]:
# Min-max normalize every score with the range of its essay set
dataset_normalized = normalize_score(dataset_cleaned, min_max_ranges)
dataset_normalized

Unnamed: 0,essay_id,essay_set,essay,score
0,1,1,"Dear local newspaper, I think effects computer...",0.600000
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",0.700000
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",0.500000
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",0.800000
4,5,1,"Dear @LOCATION1, I know having computers has a...",0.600000
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,0.583333
12972,21628,8,I never understood the meaning laughter is th...,0.533333
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",0.666667
12974,21630,8,Trippin' on fen...,0.666667


DATA SPLITTING

In [10]:
# Data splitting function
def data_splitting(dataset):
    """Split the dataset 70/30 into train and test data, separately per essay set.

    Each essay set is split on its own (so every set contributes to both
    parts in the same proportion) and the per-set parts are concatenated.

    The rows of each set are selected from the `dataset` argument itself;
    the previous version built the mask from the notebook-level
    `dataset_cleaned`, which only worked when that global existed and shared
    the argument's index.

    Args:
        dataset: DataFrame with the columns essay_id, essay_set, essay and score.

    Returns:
        (train_data, test_data): two DataFrames with the columns essay_id,
        essay_set, essay, score and a fresh 0..n-1 index.
    """
    columns = ['essay_id', 'essay_set', 'essay', 'score']

    train_parts = []
    test_parts = []

    # Essay sets in order of first appearance
    for essay_set in dataset['essay_set'].unique():
        subset = dataset.loc[dataset['essay_set'] == essay_set, columns]

        # 70% train / 30% test; splitting the whole frame yields the same row
        # partition as splitting features and score separately with this seed
        train_part, test_part = train_test_split(subset, train_size=0.7, random_state=42)

        train_parts.append(train_part)
        test_parts.append(test_part)

    # Merge the per-set parts into single train / test frames
    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    return train_data, test_data

In [11]:
# Split every essay set into train/test parts and merge them into two frames
train_data, test_data = data_splitting(dataset_normalized)

In [12]:
# Show the training data
print("Train Data:")
train_data

Train Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,1514,1,"Dear Local Newspaper, In my opinion I support ...",0.600000
1,85,1,"Dear local newspaper, Computers are they a goo...",0.800000
2,1744,1,"Dear Local newspaper, @CAPS1 you have a comput...",0.900000
3,1004,1,"Dear @CAPS1, @CAPS2 people have computers in t...",0.400000
4,1580,1,"Dear @CITY1 @ORGANIZATION1, I am a @ORGANIZATI...",0.600000
...,...,...,...,...
9077,20806,8,There are a couple things that can lead stran...,0.583333
9078,20849,8,In a relationship you should be able to trust...,0.733333
9079,21055,8,Laughter is a huge part oh building friendshi...,0.583333
9080,21258,8,I think that laughter is a key element to any...,0.666667


In [13]:
# Show the test data
print("Test Data:")
test_data

Test Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,827,1,I think computers have a postitive affect on p...,0.400000
1,1477,1,I blive that computers have a lot of effects o...,0.400000
2,234,1,Many people think that computers are not a goo...,0.600000
3,801,1,"Dear Newspaper people, @CAPS1 you might heard ...",0.700000
4,780,1,More and more people are using computers on a ...,0.700000
...,...,...,...,...
3889,20920,8,In my storie I am going to tell you about a f...,0.533333
3890,21107,8,Laughter is an essential component to any rel...,0.666667
3891,21514,8,I think laughter should be a huge part in eve...,0.566667
3892,21309,8,Some people say that laughter is the best med...,0.633333


# EMBEDDING

Data Loader - Batch Size

In [14]:
# Build a dictionary that maps each id to its positional index
def get_id2emb(ids):
  """Map every essay id to its position in `ids`.

  Args:
    ids: pandas Series of essay ids.

  Returns:
    dict {essay_id: position}. If an id occurs more than once, the position
    of its last occurrence is kept.
  """
  id2emb = {essay_id: position for position, essay_id in enumerate(ids.to_list())}

  print('Essay ids to embeddings dictionary created.')

  return id2emb

In [15]:
# Map each essay_id to its row position in the train / test frames
id2emb_train = get_id2emb(train_data['essay_id'])
id2emb_test = get_id2emb(test_data['essay_id'])

Essay ids to embeddings dictionary created.
Essay ids to embeddings dictionary created.


In [16]:
def get_loader(df, id2emb, essay_embeddings, batch_size, shuffle):
    """Wrap essay embeddings and scores in a PyTorch DataLoader.

    Args:
        df: DataFrame with 'essay_id' and 'score' columns.
        id2emb: dict mapping essay_id -> row index into essay_embeddings.
        essay_embeddings: indexable collection of embedding vectors.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data every epoch.

    Returns:
        DataLoader yielding (embeddings, scores) float32 tensor pairs,
        loaded with 2 worker processes.
    """
    # Gather the embedding row of every essay, in DataFrame order
    rows = [essay_embeddings[id2emb[essay_id]] for essay_id in df['essay_id']]
    feature_tensor = torch.from_numpy(np.array(rows)).float()

    # Matching target scores
    target_tensor = torch.from_numpy(np.array(df['score'])).float()

    return DataLoader(
        TensorDataset(feature_tensor, target_tensor),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
    )

# EMBEDDING SBERT

LOAD PRETRAINED MODEL SBERT

In [17]:
# Load the pretrained SBERT model and its tokenizer; the model is moved to the selected device
tokenizer_sbert = AutoTokenizer.from_pretrained('sentence-transformers/multi-qa-mpnet-base-dot-v1')
sbert_model = AutoModel.from_pretrained('sentence-transformers/multi-qa-mpnet-base-dot-v1').to(device)

# Print information about the model
print("Model information:")
print(sbert_model)



Model information:
MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          

SENTENCE EMBEDDING

In [18]:
def sbert_embedding(essay_list, tokenizer, model):
    """Encode essays into fixed-size vectors with a transformer encoder.

    Every essay is tokenized and encoded on its own (batch of one). Three
    poolings of the last hidden state are concatenated per essay:
    mean over tokens, max over tokens, and the first ([CLS]) token, giving a
    vector of 3 * hidden_size values.

    Args:
        essay_list: iterable of essay strings.
        tokenizer: callable tokenizer returning a mapping of model inputs
            that supports `.to(device)`.
        model: encoder whose output exposes `last_hidden_state` with shape
            (batch, seq_len, hidden_size).

    Returns:
        numpy array with one row of 3 * hidden_size values per essay.
    """
    print('Encoding essay embeddings:')

    embeddings = []
    for essay in tqdm(essay_list):
        encoded_input = tokenizer(essay, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Token embeddings of the single sequence in the batch: (seq_len, hidden).
        # The batch dimension is indexed explicitly; a bare squeeze() would also
        # drop the sequence dimension for a one-token input and break pooling.
        token_embeddings = model_output.last_hidden_state[0].cpu().numpy()

        # Mean pooling over tokens
        mean_pooling = np.mean(token_embeddings, axis=0)

        # Max pooling over tokens
        max_pooling = np.max(token_embeddings, axis=0)

        # Embedding of the first token ([CLS])
        cls_pooling = token_embeddings[0]

        # Concatenate the mean, max and CLS pooling results
        all_pooling = np.concatenate((mean_pooling, max_pooling, cls_pooling))

        embeddings.append(all_pooling)

    return np.array(embeddings)

In [19]:
# Store the embeddings produced by SBERT for the training essays
train_embeddings_sbert = sbert_embedding(train_data['essay'], tokenizer_sbert, sbert_model)

Encoding essay embeddings:


  0%|          | 0/9082 [00:00<?, ?it/s]

In [20]:
# Store the embeddings produced by SBERT for the test essays
test_embeddings_sbert = sbert_embedding(test_data['essay'], tokenizer_sbert, sbert_model)

Encoding essay embeddings:


  0%|          | 0/3894 [00:00<?, ?it/s]

In [21]:
# Sanity check: (number of training essays, embedding width)
print(train_embeddings_sbert.shape)

(9082, 2304)


In [22]:
# Sanity check: (number of test essays, embedding width)
print(test_embeddings_sbert.shape)

(3894, 2304)


In [23]:
# Inspect the raw training embedding matrix
train_embeddings_sbert

array([[-0.09018764, -0.24690011, -0.21070379, ...,  0.14900008,
        -0.38313147, -0.12559557],
       [-0.07093333, -0.33594415, -0.21142925, ..., -0.21238948,
        -0.2107223 , -0.14934623],
       [ 0.00704844, -0.3804114 , -0.19440874, ...,  0.01559372,
        -0.44941118, -0.3568388 ],
       ...,
       [ 0.14164513, -0.27081993, -0.22254585, ...,  0.2987161 ,
         0.25670302, -0.2111376 ],
       [ 0.12121425, -0.38976884, -0.17471671, ...,  0.19508861,
         0.18679416, -0.1940057 ],
       [ 0.07585978, -0.43111196, -0.22201996, ...,  0.11846084,
         0.27284688, -0.23111457]], dtype=float32)

In [24]:
# Inspect the raw test embedding matrix
test_embeddings_sbert 

array([[-0.19062395, -0.11745113, -0.25335476, ...,  0.1922959 ,
        -0.37625661, -0.30732417],
       [-0.08491195, -0.56313676, -0.23268545, ..., -0.18872927,
        -0.21885872, -0.49129802],
       [ 0.02014007, -0.18596135, -0.16624981, ...,  0.06060598,
        -0.01330091, -0.28572068],
       ...,
       [ 0.10163353, -0.3595566 , -0.1719999 , ...,  0.10730707,
         0.2526765 , -0.22698738],
       [ 0.12851821, -0.45331287, -0.18497704, ...,  0.21253969,
         0.23311758, -0.20888892],
       [ 0.02431829, -0.2519529 , -0.24742365, ...,  0.17169283,
         0.03229843, -0.13743167]], dtype=float32)

# REGRESI FCNN

INISIALISASI FCNN

In [53]:
# Fully connected regression network (FCNN)
class FCNN(nn.Module):
    """Three-layer fully connected regressor with a sigmoid output.

    Architecture: Linear(input_size -> hidden1_size) -> ReLU -> Dropout ->
    Linear(hidden1_size -> hidden2_size) -> ReLU -> Dropout ->
    Linear(hidden2_size -> 1) -> Sigmoid, so outputs lie in (0, 1).

    Args:
        input_size: number of input features.
        hidden1_size: width of the first hidden layer (default 397).
        hidden2_size: width of the second hidden layer (default 32).
        dropout_p: dropout probability after each hidden layer (default 0.3).
    """

    # Layer configuration
    def __init__(self, input_size, hidden1_size=397, hidden2_size=32, dropout_p=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)    # First layer: input_size -> hidden1_size
        self.dropout1 = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)  # Second layer: hidden1_size -> hidden2_size
        self.dropout2 = nn.Dropout(dropout_p)
        self.fc3 = nn.Linear(hidden2_size, 1)             # Third layer: hidden2_size -> 1
        self.sigmoid = nn.Sigmoid()                       # Sigmoid output activation

    # Feed-forward pass
    def forward(self, x):
        x = torch.relu(self.fc1(x))   # ReLU activation on the first layer
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))   # ReLU activation on the second layer
        x = self.dropout2(x)
        x = self.fc3(x)               # Third (output) layer
        return self.sigmoid(x)        # Squash the output into (0, 1)

TESTING FUNCTION

In [54]:
def test_step(trained_model, cost_function, test_loader):
    trained_model.eval() # Mengatur model ke mode evaluasi (eval mode)
    test_loss = 0.
    samples = 0.
    
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(test_loader):
            
            # Menghapus dimensi yang tidak perlu dari inputs dan mentransfer ke device
            inputs = inputs.squeeze(dim=1).to(device)
            
            # Menyesuaikan dimensi targets dan mentransfer ke device
            targets = targets.reshape(targets.shape[0], 1).to(device)
            
            # Menghitung output model (prediksi) dari inputs
            outputs = trained_model(inputs).reshape(-1, 1)
            
            # Menghitung nilai loss dengan membandingkan outputs dengan targets
            loss = cost_function(outputs, targets)
            
            # Menghitung jumlah sampel dalam batch
            samples += inputs.shape[0]
            
            # Menambahkan nilai loss dari batch ke test_loss
            test_loss += loss.item() * inputs.shape[0]
            
    # Menghitung rata-rata loss di seluruh batch (samples)
    avg_loss = test_loss / samples
    
    # Mengembalikan nilai rata-rata loss
    return avg_loss


TRAINING FUNCTION

In [55]:
# Model training routine
def training_step(model, cost_function, train_loader, test_loader, save_path, num_epochs, lr):
    """Train a model with Adam and evaluate it after every epoch.

    Args:
        model: network to train (updated in place).
        cost_function: loss called as cost_function(outputs, targets).
        train_loader: iterable of (inputs, targets) training batches.
        test_loader: loader passed to test_step after each epoch.
        save_path: file path where the final state_dict is saved.
        num_epochs: number of passes over train_loader.
        lr: Adam learning rate.

    Returns:
        (train_losses, test_losses): per-epoch average losses.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)

    train_losses, test_losses = [], []

    for epoch_index in range(num_epochs):
        model.train()  # training mode (enables dropout)

        # Per-epoch accumulators: summed loss and number of samples seen
        loss_sum = 0.
        seen = 0.

        for batch_inputs, batch_targets in train_loader:
            # Move the batch to the device; targets become a (batch, 1) column
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.reshape(batch_targets.shape[0], 1).to(device)
            batch_len = batch_inputs.shape[0]

            # Clear gradients left over from the previous iteration
            optimizer.zero_grad()

            # Forward pass and loss
            predictions = model(batch_inputs).reshape(-1, 1)
            batch_loss = cost_function(predictions, batch_targets)

            # Backpropagate and update the parameters
            batch_loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by the batch size
            loss_sum += batch_loss.item() * batch_len
            seen += batch_len

        # Average training loss of this epoch
        train_loss = loss_sum / seen

        # Evaluate on the held-out loader
        test_loss = test_step(model, cost_function, test_loader)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        print('Epoch: {:}/{:}\tLoss/train: {:.5f}\tLoss/test: {:.5f}'.format(epoch_index+1, num_epochs, train_loss, test_loss))

    # Persist the trained weights
    torch.save(model.state_dict(), save_path)
    print(f"Model saved at {save_path}")

    return train_losses, test_losses

SCORING FUNCTION

In [56]:
# Predict scores for the test data
def scoring(trained_model, test_loader):
    """Run a trained model over a loader and collect its predictions.

    The model is moved to the notebook's device and switched to evaluation
    mode, so dropout is disabled regardless of the mode the caller left it in.

    Args:
        trained_model: model producing one value per sample.
        test_loader: iterable of (inputs, targets) batches; targets are ignored.

    Returns:
        Flat list with one predicted value per sample, in loader order.
    """
    trained_model.to(device)  # Move the model to the correct device
    trained_model.eval()      # Deterministic predictions: no dropout
    predictions = []

    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            # Predict with the trained model
            outputs = trained_model(inputs)

            # Flatten to 1-D explicitly: squeeze() would turn a batch of one
            # sample into a 0-d array that cannot be iterated by extend()
            predictions.extend(outputs.reshape(-1).cpu().numpy())

    return predictions

# MAIN

In [57]:
# Disable parallelism in the HuggingFace tokenizers library via its environment variable.
# NOTE(review): presumably avoids the tokenizers fork warning once DataLoader workers start — confirm.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAINING

In [58]:
# Hyperparameters
input_size = 2304  # embedding width: mean + max + CLS pooling of 768-d hidden states
batch_size = 16
lr = 3e-5
epochs = 8

# TRAINING
# Initialize the model, the loaders and the loss function
model_sbert = FCNN(input_size).to(device)  # replace with your own model
cost_function = torch.nn.MSELoss()

# Dataloaders (only the training loader is shuffled)
train_loader_sbert = get_loader(train_data, id2emb_train, train_embeddings_sbert, batch_size, shuffle=True)
test_loader_sbert = get_loader(test_data, id2emb_test, test_embeddings_sbert, batch_size, shuffle=False)

print('------------------------------------------------------------------')
print(f"\t\t\tTraining model SBERT: ")
print('------------------------------------------------------------------')
# Path where the model is saved and later loaded from
save_path = 'model_sbert_v7-mean_max_cls_4.4.pth'

# Time the whole training run
start_time_sbert = time.time()
train_loss_sbert, test_loss_sbert = training_step(model_sbert, cost_function, train_loader_sbert, test_loader_sbert, save_path, epochs, lr)
end_time_sbert = time.time()

print('Training time:', end_time_sbert - start_time_sbert)

------------------------------------------------------------------
			Training model SBERT: 
------------------------------------------------------------------
Epoch: 1/8	Loss/train: 0.04096	Loss/test: 0.02752
Epoch: 2/8	Loss/train: 0.02769	Loss/test: 0.02307
Epoch: 3/8	Loss/train: 0.02418	Loss/test: 0.02173
Epoch: 4/8	Loss/train: 0.02266	Loss/test: 0.02098
Epoch: 5/8	Loss/train: 0.02181	Loss/test: 0.02162
Epoch: 6/8	Loss/train: 0.02109	Loss/test: 0.01986
Epoch: 7/8	Loss/train: 0.02079	Loss/test: 0.02036
Epoch: 8/8	Loss/train: 0.02029	Loss/test: 0.02040
Model saved at model_sbert_v7-mean_max_cls_4.4.pth
Training time: 20.15289807319641


SCORING

In [59]:
# Load the trained weights; model_sbert_trained is the same object as model_sbert, not a copy
model_sbert_trained = model_sbert 
model_sbert_trained.load_state_dict(torch.load(save_path))

# Use the model to predict on the test data
print('------------------------------------------------------------------')
print(f"\t\t\tScoring Essay: ")
print('------------------------------------------------------------------')
start_time_eval = time.time()
test_predictions_sbert = scoring(model_sbert_trained, test_loader_sbert)
end_time_eval = time.time()
print('Evaluation time:', end_time_eval - start_time_eval)

# store train_df, test_df and predictions
train_df_sbert = train_data
test_df_sbert = test_data
preds_sbert = test_predictions_sbert

------------------------------------------------------------------
			Scoring Essay: 
------------------------------------------------------------------
Evaluation time: 0.504833459854126


# RESULT

In [60]:
def get_results_df(test_df, model_preds):
  """Attach model predictions to the test frame, ordered by essay_id.

  Predictions are matched to rows by position, so model_preds must be in
  the same order as the rows of test_df.

  Args:
    test_df: DataFrame with at least 'essay_id' and 'score' columns.
    model_preds: sequence of predictions, one per row of test_df.

  Returns:
    New DataFrame sorted by essay_id with a 'prediction' column, a fresh
    0..n-1 index and 'score' as the last column.
  """
  # Predictions as a single-column frame (the column is named 0)
  preds_df = pd.DataFrame(model_preds)

  # Align by position, then name the prediction column
  positional_df = test_df.reset_index(drop=True)
  joined_df = positional_df.join(preds_df)
  joined_df = joined_df.rename(columns={0:'prediction'})

  # Order by essay id and renumber the rows
  results_df = joined_df.sort_values(by='essay_id').reset_index(drop=True)

  # Re-insert score so it becomes the last column
  results_df['score'] = results_df.pop('score')

  return results_df

In [61]:
# Show the normalized predictions next to the normalized actual scores, ordered by essay_id
get_results_df(test_df_sbert, preds_sbert)

Unnamed: 0,essay_id,essay_set,essay,prediction,score
0,16,1,"Dear @ORGANIZATION1, The computer blinked to l...",0.811038,1.000000
1,24,1,"Dear local newspaper, I've heard that not many...",0.808737,0.900000
2,30,1,"Dear Newspaper People, I think that computers ...",0.688999,0.600000
3,31,1,I agree that computers deffinately are an adva...,0.783868,0.800000
4,33,1,"Dear, @ORGANIZATION1 I think the effects that ...",0.521431,0.400000
...,...,...,...,...,...
3889,21613,8,"Before my best friend moved away, we would st...",0.643766,0.666667
3890,21615,8,@ORGANIZATION1 ...,0.605741,0.533333
3891,21626,8,In most stories mothers and daughters are eit...,0.774920,0.583333
3892,21628,8,I never understood the meaning laughter is th...,0.670974,0.533333


# DENORMALISASI

In [62]:
def inverse_normalize_score(score, min_max_range):
    """Map a normalized score back onto its original scale.

    Args:
        score: score normalized with min-max scaling.
        min_max_range: (min_score, max_score) of the original scale.

    Returns:
        The de-normalized score rounded with the built-in round().
    """
    lower, upper = min_max_range
    span = upper - lower

    # Invert the min-max scaling, then round to the nearest integer
    return round(score * span + lower)

def restore_original_scores(df, preds, min_max_ranges):
    """Convert normalized predictions and actual scores back to their original scales.

    Args:
        df: DataFrame with 'essay_set' and normalized 'score' columns.
        preds: normalized predictions, one per row of df, in row order.
        min_max_ranges: dict mapping essay_set id -> (min_score, max_score).

    Returns:
        Copy of df in which 'score' holds the restored actual scores and a
        'prediction' column holds the restored predictions. df itself is
        left unmodified.

    Raises:
        AssertionError: if preds and df differ in length.
    """
    # Work on a copy so the caller's frame stays untouched
    restored_df = df.copy()

    normalized_actuals = restored_df['score'].values
    set_ids = restored_df['essay_set'].values

    # Predictions must line up one-to-one with the rows
    assert len(preds) == len(restored_df), "Length of predictions does not match length of dataframe"

    # De-normalize each prediction and actual score with the range of its essay set
    restored_preds = []
    restored_actuals = []
    for pred, actual, set_id in zip(preds, normalized_actuals, set_ids):
        score_range = min_max_ranges[set_id]
        restored_preds.append(inverse_normalize_score(pred, score_range))
        restored_actuals.append(inverse_normalize_score(actual, score_range))

    # Store the restored values
    restored_df['prediction'] = restored_preds
    restored_df['score'] = restored_actuals

    return restored_df


In [63]:
# Convert the predicted and actual scores back to their original ranges
restored_results_df_sbert = restore_original_scores(test_df_sbert, preds_sbert, min_max_ranges)

# Show the result
print("Restored Results:")
restored_results_df_sbert

Restored Results:


Unnamed: 0,essay_id,essay_set,essay,score,prediction
0,827,1,I think computers have a postitive affect on p...,6,7
1,1477,1,I blive that computers have a lot of effects o...,6,6
2,234,1,Many people think that computers are not a goo...,8,9
3,801,1,"Dear Newspaper people, @CAPS1 you might heard ...",9,9
4,780,1,More and more people are using computers on a ...,9,10
...,...,...,...,...,...
3889,20920,8,In my storie I am going to tell you about a f...,32,36
3890,21107,8,Laughter is an essential component to any rel...,40,45
3891,21514,8,I think laughter should be a huge part in eve...,34,37
3892,21309,8,Some people say that laughter is the best med...,38,40


# EVALUASI QWK

In [64]:
import numpy as np

def calculate_qwk(actuals, preds):
    """Compute the Quadratic Weighted Kappa (QWK) between two integer ratings.

    QWK = 1 - sum(W * O) / sum(W * E), where O is the observed confusion
    matrix, E the expected matrix under independence (outer product of the
    rating histograms, scaled to the sample count) and
    W[i][j] = (i - j)^2 / (N - 1)^2 the quadratic disagreement weights.

    Args:
        actuals: sequence of integer actual ratings.
        preds: sequence of integer predicted ratings, same length.

    Returns:
        The kappa score. Returns 1.0 when every actual and predicted rating
        is the same single value (complete agreement), a case in which the
        formula itself degenerates to 0/0.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    actual_arr = np.asarray(actuals, dtype=int)
    pred_arr = np.asarray(preds, dtype=int)

    if actual_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError("actuals and preds must have the same length")
    if actual_arr.shape[0] == 0:
        raise ValueError("actuals and preds must not be empty")

    # Rating range spanned by both sequences
    min_rating = int(min(actual_arr.min(), pred_arr.min()))
    max_rating = int(max(actual_arr.max(), pred_arr.max()))

    # Total number of possible ratings
    num_ratings = max_rating - min_rating + 1

    # Only one rating occurs anywhere: perfect agreement, the weights would be 0/0
    if num_ratings == 1:
        return 1.0

    # Weight matrix W
    rating_idx = np.arange(num_ratings)
    weight_mat = (rating_idx[:, None] - rating_idx[None, :]) ** 2 / ((num_ratings - 1) ** 2)

    # Observation matrix O (add.at accumulates repeated index pairs)
    conf_mat = np.zeros((num_ratings, num_ratings))
    np.add.at(conf_mat, (actual_arr - min_rating, pred_arr - min_rating), 1)

    # Expectation matrix E from the marginal histograms
    actual_hist = conf_mat.sum(axis=1)
    pred_hist = conf_mat.sum(axis=0)
    expected_mat = np.outer(actual_hist, pred_hist) / actual_arr.shape[0]

    # QWK value
    num_agreements = np.sum(weight_mat * conf_mat)
    num_possible_agreements = np.sum(weight_mat * expected_mat)
    kappa_score = 1 - (num_agreements / num_possible_agreements)

    return kappa_score

In [65]:
# Compute the QWK separately for every essay set
def calculate_qwk_per_set(df):
    """Compute the Quadratic Weighted Kappa of every essay set.

    Args:
        df: DataFrame with 'essay_set', 'score' and 'prediction' columns.

    Returns:
        dict mapping 'Set <essay_set>' to the QWK of that set, in order of
        first appearance of each set. Scores and predictions are cast to int
        before the QWK is computed.
    """
    def _qwk_for(set_id):
        # Rows of this essay set, with integer actual and predicted scores
        rows = df[df['essay_set'] == set_id]
        set_actuals = rows['score'].astype(int)
        set_preds = rows['prediction'].astype(int)
        return calculate_qwk(set_actuals, set_preds)

    return {f'Set {set_id}': _qwk_for(set_id) for set_id in df['essay_set'].unique()}


In [66]:
# Compute the QWK of every essay set
qwk_per_set_sbert = calculate_qwk_per_set(restored_results_df_sbert)

# Show the QWK of every essay set
for essay_set, qwk_score in qwk_per_set_sbert.items():
    print(f"Quadratic Weighted Kappa Score for {essay_set}: {qwk_score}")

# Unweighted mean of the per-set QWK values
average_qwk_sbert = np.mean(list(qwk_per_set_sbert.values()))

# Show the average QWK
print("Average Quadratic Weighted Kappa Score:", average_qwk_sbert)

Quadratic Weighted Kappa Score for Set 1: 0.6648262593519234
Quadratic Weighted Kappa Score for Set 2: 0.6142555190230155
Quadratic Weighted Kappa Score for Set 3: 0.5967211887562227
Quadratic Weighted Kappa Score for Set 4: 0.7106733680128814
Quadratic Weighted Kappa Score for Set 5: 0.7074610551799208
Quadratic Weighted Kappa Score for Set 6: 0.699178428481146
Quadratic Weighted Kappa Score for Set 7: 0.7810318627140574
Quadratic Weighted Kappa Score for Set 8: 0.46990522789369604
Average Quadratic Weighted Kappa Score: 0.6555066136766079
