# DATA PREPROCESSING

In [97]:
# Import library
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
from transformers import BertTokenizer, BertModel

In [98]:
# Set device
# Preferred GPU index. The availability check now happens BEFORE selecting the
# GPU, so the notebook also runs on CPU-only hosts and on hosts with fewer GPUs
# (those fall back to GPU 0).
PREFERRED_GPU = 3
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU if PREFERRED_GPU < torch.cuda.device_count() else 0
    torch.cuda.set_device(gpu_index)
    device = "cuda:%s" % torch.cuda.current_device()
else:
    device = "cpu"
device

'cuda:3'

LOAD DATASET

In [99]:
# Load the dataset from a tab-separated (.tsv) file, decoded as ISO-8859-1,
# and display it.
kaggle_dataset  = pd.read_csv('./training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")
kaggle_dataset

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,17,18,,35,,,,...,4.0,4.0,4.0,3.0,,,,,,
12972,21628,8,I never understood the meaning laughter is th...,15,17,,32,,,,...,4.0,4.0,4.0,3.0,,,,,,
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",20,26,40.0,40,,,,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
12974,21630,8,Trippin' on fen...,20,20,,40,,,,...,4.0,4.0,4.0,4.0,,,,,,


DATA CLEANSING

In [100]:
# Data Cleansing Function
def clean_dataset(input_dataset):
    """Keep the columns used downstream and drop rows with missing values.

    Args:
        input_dataset: DataFrame containing at least the columns ``essay_id``,
            ``essay_set``, ``essay`` and ``domain1_score``.

    Returns:
        A new DataFrame with the columns ``essay_id``, ``essay_set``, ``essay``
        and ``score`` (``domain1_score`` renamed), without rows that contain
        nulls. The null counts before and after the cleanup are printed.
    """
    # Select the needed columns; the domain-1 score is exposed as 'score'.
    wanted_columns = ['essay_id', 'essay_set', 'essay', 'domain1_score']
    selected = input_dataset[wanted_columns].rename(columns={'domain1_score': 'score'})

    # Report the null counts before any row is removed.
    print("Jumlah missing values:")
    print(selected.isnull().sum())

    # Drop incomplete rows, then report the null counts again.
    without_nulls = selected.dropna()
    print("\nJumlah missing values setelah data dibersihkan:")
    print(without_nulls.isnull().sum())

    print("\nDataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:")

    return without_nulls

In [101]:
# Run the cleaning step on the raw frame and display the result.
dataset_cleaned = clean_dataset(kaggle_dataset)
dataset_cleaned

Jumlah missing values:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Jumlah missing values setelah data dibersihkan:
essay_id     0
essay_set    0
essay        0
score        0
dtype: int64

Dataset setelah kolom yang tidak dibutuhkan dan nilai kosong dihapus:


Unnamed: 0,essay_id,essay_set,essay,score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35
12972,21628,8,I never understood the meaning laughter is th...,32
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40
12974,21630,8,Trippin' on fen...,40


In [102]:
# Number of whitespace-separated words in every essay (computed once, reused below)
essay_word_counts = dataset_cleaned['essay'].str.split().str.len()

# Average word count of each essay set (informational; the filter below does not use it)
avg_word_counts = essay_word_counts.groupby(dataset_cleaned['essay_set']).mean().round().astype(int)

# Keep only essays whose length is within +/- WORD_COUNT_TOLERANCE words of the target
target_avg_word_count = 374
WORD_COUNT_TOLERANCE = 20
within_target = essay_word_counts.between(
    target_avg_word_count - WORD_COUNT_TOLERANCE,
    target_avg_word_count + WORD_COUNT_TOLERANCE,
)

# One vectorized selection replaces the per-set concat loop. The stable sort
# groups rows by essay set while keeping their original order inside each set,
# which is the row order the loop produced. sort_values returns a new frame,
# so later edits do not touch dataset_cleaned.
filtered_dataset = dataset_cleaned[within_target].sort_values('essay_set', kind='mergesort')

# Word-count distribution after filtering
filtered_avg_word_counts = (
    essay_word_counts[within_target]
    .groupby(dataset_cleaned.loc[within_target, 'essay_set'])
    .mean()
    .round()
    .astype(int)
)
print("Filtered Average Word Counts per Essay Set:")
print(filtered_avg_word_counts)

Filtered Average Word Counts per Essay Set:
essay_set
1    375
2    373
3    375
4    357
6    358
7    370
8    374
Name: essay, dtype: int64


In [103]:
# Count how many essays each essay set contains
jumlah_data_per_set = filtered_dataset.groupby('essay_set').size()

# Show the heading followed by the per-set counts
print("Jumlah Data per Set Esai:", jumlah_data_per_set, sep="\n")

Jumlah Data per Set Esai:
essay_set
1    234
2    200
3      1
4      1
6      2
7     33
8     33
dtype: int64


In [104]:
# Essay sets that are dropped from the filtered data
sets_to_remove = [3, 4, 6]
# Filter out the specified sets. The explicit copy detaches the result from the
# previous frame, so later in-place column edits do not operate on a slice.
filtered_dataset = filtered_dataset[~filtered_dataset['essay_set'].isin(sets_to_remove)].copy()

In [105]:
# Count how many essays each remaining essay set contains
jumlah_data_per_set = filtered_dataset.groupby('essay_set').size()

# Show the heading followed by the per-set counts
print("Jumlah Data per Set Esai:", jumlah_data_per_set, sep="\n")

Jumlah Data per Set Esai:
essay_set
1    234
2    200
7     33
8     33
dtype: int64


SCORE NORMALIZATION

In [106]:
# Essay score range for each essay set, keyed by essay-set number:
# value is the tuple (minimum score, maximum score).
min_max_ranges = {
    1: (2, 12),
    2: (1, 6),
    3: (0, 3),
    4: (0, 3),
    5: (0, 4),
    6: (0, 4),
    7: (0, 30),
    8: (0, 60)
}

In [107]:
#Score Normalization Function
def normalize_score(dataset, min_max_ranges):
    """Min-max normalize the ``score`` column separately for each essay set.

    Each score is mapped to ``(score - min) / (max - min)`` using the range
    registered for the row's ``essay_set``.

    Unlike the previous version, the input frame is NOT modified: the result
    is a copy, so calling the function again on the same input gives the same
    output instead of normalizing already-normalized scores.

    Args:
        dataset: DataFrame with the columns ``essay_set`` and ``score``.
        min_max_ranges: dict mapping an essay-set number to a
            ``(min_score, max_score)`` tuple.

    Returns:
        A copy of ``dataset`` whose ``score`` column holds the normalized
        float values. Rows whose essay set has no entry in ``min_max_ranges``
        get ``NaN``.
    """
    lower_by_set = {essay_set: low for essay_set, (low, _) in min_max_ranges.items()}
    upper_by_set = {essay_set: high for essay_set, (_, high) in min_max_ranges.items()}

    normalized = dataset.copy()

    # Per-row bounds; essay sets missing from the mapping become NaN.
    lows = normalized['essay_set'].map(lower_by_set).astype(float)
    highs = normalized['essay_set'].map(upper_by_set).astype(float)

    # Cast to float so an object-typed score column still yields a numeric result.
    normalized['score'] = (normalized['score'].astype(float) - lows) / (highs - lows)

    return normalized

In [108]:
# Normalize the scores of the filtered essays per essay set and display the result.
dataset_normalized = normalize_score(filtered_dataset, min_max_ranges)
dataset_normalized

Unnamed: 0,essay_id,essay_set,essay,score
11,12,1,Dear @CAPS1 @CAPS2 I feel that computers do ta...,0.600000
17,18,1,"Dear Local Newspaper, I must admit that the ex...",0.600000
20,21,1,"Dear @CAPS1 of the @CAPS2 @CAPS3 daily, I am w...",0.600000
25,26,1,Do you think that computers are useless? Or do...,0.700000
27,28,1,"Dear Newspaper, Computers are high tec and hav...",0.700000
...,...,...,...,...
12887,21514,8,I think laughter should be a huge part in eve...,0.566667
12904,21537,8,In the @DATE1 of @NUM1' I spent two weeks at ...,0.550000
12948,21595,8,Have you ever experienced a time with your fr...,0.600000
12949,21596,8,I woke up just like any other day happy yet l...,0.516667


DATA SPLITTING

In [109]:
def data_splitting(dataset, train_size=0.7, random_state=42):
    """Split the dataset into train and test parts, separately for each essay set.

    Every essay set is split on its own, so each set contributes the same
    proportion of rows to both parts; the per-set parts are then concatenated.

    Args:
        dataset: DataFrame with the columns ``essay_id``, ``essay_set``,
            ``essay`` and ``score``.
        train_size: share of each essay set assigned to the training part,
            forwarded to ``train_test_split`` (default 0.7, i.e. 70% / 30%).
        random_state: seed forwarded to ``train_test_split`` (default 42).

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames with a fresh 0..n-1
        index and the columns ``essay_id``, ``essay_set``, ``essay``, ``score``.
    """
    columns = ['essay_id', 'essay_set', 'essay', 'score']

    train_parts = []
    test_parts = []

    # unique() keeps the order in which the essay sets first appear.
    for essay_set in dataset['essay_set'].unique():
        subset = dataset.loc[dataset['essay_set'] == essay_set, columns]

        # Splitting the whole frame selects the same rows as splitting the
        # feature and score columns side by side with the same seed.
        train_part, test_part = train_test_split(
            subset, train_size=train_size, random_state=random_state
        )
        train_parts.append(train_part)
        test_parts.append(test_part)

    # pd.concat refuses an empty list; return two empty frames instead.
    if not train_parts:
        empty = dataset.loc[:, columns].iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    train_data = pd.concat(train_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)

    return train_data, test_data

In [110]:
# Split the normalized dataset into training and test frames.
train_data, test_data = data_splitting(dataset_normalized)

In [111]:
# Print the training data
print("Train Data:")
train_data

Train Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,1052,1,Did you know that @PERCENT2 out of @PERCENT1 c...,0.700000
1,645,1,The computer effects peoplelife by taking them...,0.600000
2,29,1,"Dear local newspaper, @CAPS1 people throughout...",0.700000
3,476,1,I think the computer dire a postive effect on ...,0.600000
4,1012,1,"Dear local newspaper writer, I'm writting to t...",0.700000
...,...,...,...,...
344,21598,8,"Laughter is an important part of my life, eit...",0.500000
345,21016,8,Laughter is such a great joy in my life. If p...,0.516667
346,21115,8,There are many reasons why laughter is an imp...,0.533333
347,21249,8,"It was the first day of the ninth grade, that...",0.700000


In [112]:
# Print the test data
print("Test Data:")
test_data

Test Data:


Unnamed: 0,essay_id,essay_set,essay,score
0,560,1,"Dear @CAPS1 Newspaper, @CAPS2 though computers...",0.600000
1,1614,1,Dear @CAPS1 I think that computers are benefic...,0.700000
2,1392,1,I think that computers have a good effect on p...,0.300000
3,73,1,"Dear The @ORGANIZATION1, ""@CAPS1, @CAPS1, @CAP...",0.600000
4,1055,1,"To whom it @MONTH1 concern, To many people it ...",0.600000
...,...,...,...,...
146,21090,8,"The @CAPS1 of Laughter @CAPS2 friends and I, a...",0.616667
147,21369,8,Laughter is the key I think that being happy ...,0.483333
148,21407,8,The @CAPS1 of @CAPS2 @CAPS3 I think of laughte...,0.666667
149,21204,8,It was the first day of sophomore year. I had...,0.516667


# EMBEDDING

Data Loader - Batch Size

In [113]:
# Function that builds a dictionary mapping each id to an index
def get_id2emb(ids):
    """Map every essay id to its position in ``ids``.

    Args:
        ids: object exposing ``to_list()`` (e.g. a pandas Series) with essay ids.

    Returns:
        dict ``{essay_id: position}``; when an id occurs more than once, the
        position of its last occurrence is kept.
    """
    id2emb = {essay_id: position for position, essay_id in enumerate(ids.to_list())}

    print('Essay ids to embeddings dictionary created.')

    return id2emb

In [114]:
# Build the essay-id -> row-position lookups for the training and test frames.
id2emb_train = get_id2emb(train_data['essay_id'])
id2emb_test = get_id2emb(test_data['essay_id'])

Essay ids to embeddings dictionary created.
Essay ids to embeddings dictionary created.


In [115]:
def get_loader(df, id2emb, essay_embeddings, batch_size, shuffle, num_workers=2):
    """Build a DataLoader of (embedding, score) pairs for the essays in ``df``.

    Args:
        df: DataFrame with the columns ``essay_id`` and ``score``.
        id2emb: dict mapping an essay id to its row in ``essay_embeddings``.
        essay_embeddings: 2-D array-like of shape (n_essays, embedding_dim);
            ``np.matrix`` inputs are accepted.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples every epoch.
        num_workers: worker processes used by the DataLoader (default 2, the
            value that used to be hard-coded).

    Returns:
        A DataLoader over a TensorDataset holding float32 features of shape
        (n, embedding_dim) and float32 scores of shape (n,).
    """
    # Convert to a plain ndarray first: indexing a row of an np.matrix keeps a
    # leading dimension of size 1, which used to produce (n, 1, dim) features.
    embedding_table = np.asarray(essay_embeddings)

    # Row of the embedding table belonging to every essay, in df order
    rows = np.asarray([id2emb[essay_id] for essay_id in df['essay_id']], dtype=np.intp)
    embeddings = embedding_table[rows]

    # Extract scores from the DataFrame
    scores = np.array(df['score'])

    # Create a PyTorch TensorDataset from the embeddings and scores
    data = TensorDataset(torch.from_numpy(embeddings).float(), torch.from_numpy(scores).float())

    # Create a PyTorch DataLoader from the TensorDataset
    loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

    return loader

# EMBEDDING BERT

LOAD PRETRAINED MODEL BERT

In [116]:
# Load the pretrained "bert-base-cased" BERT model (moved to `device`) and its tokenizer
bert_model = BertModel.from_pretrained("bert-base-cased").to(device)
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')



WORD EMBEDDING

In [117]:
def mean_encoding(essay_list, tokenizer, model):
    """Encode every essay as the mean of the model's per-token output vectors.

    Args:
        essay_list: iterable of essay strings.
        tokenizer: callable turning one essay into model inputs; it is called
            with ``padding=True, truncation=True, return_tensors='pt'`` and its
            result is moved to the global ``device``.
        model: callable taking the tokenizer output as keyword arguments; the
            first element of its output is used (presumably the per-token
            hidden states -- confirm against the model in use).

    Returns:
        ``np.matrix`` with one row per essay.
    """
    print('Encoding essay embeddings:')

    pooled_vectors = []
    for text in tqdm(essay_list):
        batch = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        batch = batch.to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**batch)

        # Drop the singleton dimensions, bring the tensor to the CPU and
        # average over the first axis to get a single vector for the essay.
        token_matrix = np.matrix(outputs[0].squeeze().cpu())
        pooled = np.asarray(token_matrix.mean(0))
        pooled_vectors.append(np.squeeze(pooled))

    return np.matrix(pooled_vectors)

In [118]:
# Store the BERT embeddings computed for the training essays
train_embeddings_bert = mean_encoding(train_data['essay'], tokenizer_bert, bert_model)

Encoding essay embeddings:


  0%|          | 0/349 [00:00<?, ?it/s]

In [119]:
# Store the BERT embeddings computed for the test essays
test_embeddings_bert = mean_encoding(test_data['essay'], tokenizer_bert, bert_model)

Encoding essay embeddings:


  0%|          | 0/151 [00:00<?, ?it/s]

In [120]:
# Shape of the training embedding matrix
print(train_embeddings_bert.shape)

(349, 768)


In [121]:
# Shape of the test embedding matrix
print(test_embeddings_bert.shape)

(151, 768)


In [122]:
# Display the training embeddings
train_embeddings_bert

matrix([[ 0.27836147, -0.12188941, -0.0191257 , ..., -0.02068899,
         -0.11260348,  0.13820279],
        [ 0.14524166,  0.02097575, -0.39603904, ..., -0.17068914,
         -0.24399517,  0.16452214],
        [ 0.4269852 , -0.07613757, -0.16116065, ..., -0.07103348,
         -0.07903457,  0.24113716],
        ...,
        [ 0.25719467, -0.11892126, -0.3284292 , ...,  0.019404  ,
         -0.00222178,  0.05745728],
        [ 0.21208706, -0.141066  , -0.04750081, ..., -0.04942276,
          0.00849014,  0.03733275],
        [ 0.17899011, -0.00198905, -0.4147413 , ..., -0.01976762,
          0.12144737,  0.07992568]], dtype=float32)

In [123]:
# Display the test embeddings
test_embeddings_bert 

matrix([[ 0.35158864,  0.07717412, -0.13365369, ..., -0.10886408,
          0.02994919,  0.17972253],
        [ 0.27929842, -0.1758189 , -0.26591957, ..., -0.06354713,
         -0.21227795,  0.10118586],
        [ 0.39244404,  0.01430434, -0.17147757, ..., -0.01614914,
          0.03679727,  0.2793329 ],
        ...,
        [ 0.20353985,  0.06211736, -0.30668968, ...,  0.23248325,
          0.2977854 ,  0.25267044],
        [ 0.16579686, -0.05376469, -0.15123072, ..., -0.07589218,
         -0.07633164,  0.07437171],
        [ 0.04488572,  0.03593043, -0.42828327, ...,  0.14983015,
          0.04626296, -0.02967604]], dtype=float32)

# REGRESI FCNN

INISIALISASI FCNN

In [152]:
# Fully connected neural network (FCNN) used as the regression head
class FCNN(nn.Module):
    """Three-layer fully connected regressor with a sigmoid output.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout ->
    Linear -> Sigmoid, producing one value in (0, 1) per input vector.

    Args:
        input_size: number of input features.
        hidden1: width of the first hidden layer (default 397).
        hidden2: width of the second hidden layer (default 32).
        dropout: dropout probability after each hidden layer (default 0.3).

    The defaults reproduce the previously hard-coded network, and the layer
    attribute names are unchanged, so existing state dicts still load.
    """

    # Define the layers
    def __init__(self, input_size, hidden1=397, hidden2=32, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden1)
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(hidden2, 1)
        self.sigmoid = nn.Sigmoid()

    # Feed-forward pass
    def forward(self, x):
        """Return the sigmoid-activated prediction for ``x``."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.sigmoid(self.fc3(hidden))

TESTING FUNCTION

In [153]:
def test_step(trained_model, cost_function, test_loader):
    trained_model.eval() # Mengatur model ke mode evaluasi (eval mode)
    test_loss = 0.
    samples = 0.
    
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(test_loader):
            
            # Menghapus dimensi yang tidak perlu dari inputs dan mentransfer ke device
            inputs = inputs.squeeze(dim=1).to(device)
            
            # Menyesuaikan dimensi targets dan mentransfer ke device
            targets = targets.reshape(targets.shape[0], 1).to(device)
            
            # Menghitung output model (prediksi) dari inputs
            outputs = trained_model(inputs).reshape(-1, 1)
            
            # Menghitung nilai loss dengan membandingkan outputs dengan targets
            loss = cost_function(outputs, targets)
            
            # Menghitung jumlah sampel dalam batch
            samples += inputs.shape[0]
            
            # Menambahkan nilai loss dari batch ke test_loss
            test_loss += loss.item() * inputs.shape[0]
            
    # Menghitung rata-rata loss di seluruh batch (samples)
    avg_loss = test_loss / samples
    
    # Mengembalikan nilai rata-rata loss
    return avg_loss


TRAINING FUNCTION

In [154]:
# Example training function for the model
def training_step(model, cost_function, train_loader, test_loader, save_path, num_epochs, lr):
    """Train ``model`` with Adam, evaluate it after every epoch and save it.

    Tensors are moved to the global ``device``. After the last epoch the
    model's state dict is written to ``save_path``.

    Args:
        model: network to train.
        cost_function: callable ``(outputs, targets) -> loss tensor``.
        train_loader: iterable yielding ``(inputs, targets)`` training batches.
        test_loader: loader passed to ``test_step`` after every epoch.
        save_path: file path for the saved state dict.
        num_epochs: number of passes over ``train_loader``.
        lr: learning rate for the Adam optimizer.

    Returns:
        Tuple ``(train_losses, test_losses)`` with one average loss per epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)

    history_train, history_test = [], []

    for epoch_idx in range(num_epochs):
        model.train()  # training mode

        # Per-epoch accumulators
        epoch_loss = 0.
        epoch_samples = 0.

        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)

            # Targets as a column vector on the device
            batch_targets = batch_targets.reshape(batch_targets.shape[0], 1).to(device)

            # Forward pass; predictions as a column vector
            batch_outputs = model(batch_inputs).reshape(-1, 1)
            batch_loss = cost_function(batch_outputs, batch_targets)

            # Clear old gradients, backpropagate, update the parameters
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size for a per-sample average
            n_in_batch = batch_inputs.shape[0]
            epoch_loss += batch_loss.item() * n_in_batch
            epoch_samples += n_in_batch

        # Average training loss of this epoch
        mean_train_loss = epoch_loss / epoch_samples

        # Evaluate on the test data
        mean_test_loss = test_step(model, cost_function, test_loader)

        history_train.append(mean_train_loss)
        history_test.append(mean_test_loss)

        print('Epoch: {:}/{:}\tLoss/train: {:.5f}\tLoss/test: {:.5f}'.format(epoch_idx+1, num_epochs, mean_train_loss, mean_test_loss))

    # Save the model after training
    torch.save(model.state_dict(), save_path)
    print(f"Model saved at {save_path}")

    return history_train, history_test

SCORING FUNCTION

In [155]:
# Function that predicts scores for the test data
def scoring(trained_model, test_loader):
    """Predict a score for every sample in ``test_loader``.

    The model is moved to the global ``device`` and run in evaluation mode, so
    dropout layers are inactive; its previous train/eval mode is restored
    afterwards.

    Args:
        trained_model: trained network producing one value per sample.
        test_loader: iterable yielding ``(inputs, targets)`` batches; the
            targets are ignored.

    Returns:
        Flat list with one predicted value per sample, in loader order.
    """
    trained_model.to(device)  # Move the model to the correct device

    was_training = trained_model.training
    trained_model.eval()  # predictions must not be perturbed by dropout

    predictions = []
    try:
        with torch.no_grad():
            for inputs, _ in test_loader:
                outputs = trained_model(inputs.to(device))

                # reshape(-1) always yields a 1-D tensor; squeeze() collapsed a
                # batch of one sample to a 0-d value that cannot be iterated.
                predictions.extend(outputs.reshape(-1).cpu().numpy())
    finally:
        trained_model.train(was_training)

    return predictions

# MAIN

In [156]:
# `os` is imported in the import cell at the top of the notebook.
# Turn off tokenizers parallelism (presumably to avoid the Hugging Face
# tokenizers fork warning once DataLoader worker processes start -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TRAINING

In [157]:
# Hyperparameter
input_size = 768
batch_size = 16
epochs = 100
lr = 2e-5
SEED = 42

# Seed PyTorch before the model is created so that weight initialisation and
# batch shuffling are repeatable from one run to the next.
torch.manual_seed(SEED)

# TRAINING
# Initialise the model and the loss function
model_bert = FCNN(input_size).to(device)
cost_function = torch.nn.MSELoss()

# Dataloaders
train_loader_bert = get_loader(train_data, id2emb_train, train_embeddings_bert, batch_size, shuffle=True)
test_loader_bert = get_loader(test_data, id2emb_test, test_embeddings_bert, batch_size, shuffle=False)

print('------------------------------------------------------------------')
print(f"\t\t\tTraining model BERT: ")
print('------------------------------------------------------------------')
# Path where the model is saved and later loaded from
save_path = 'model_bert_v7-rata2_banyak.pth'

start_time_bert = time.time()
train_loss_bert, test_loss_bert = training_step(model_bert, cost_function, train_loader_bert, test_loader_bert, save_path, epochs, lr)
end_time_bert = time.time()

print('Training time:', end_time_bert - start_time_bert)

------------------------------------------------------------------
			Training model BERT: 
------------------------------------------------------------------
Epoch: 1/100	Loss/train: 0.02356	Loss/test: 0.02200
Epoch: 2/100	Loss/train: 0.01913	Loss/test: 0.01869
Epoch: 3/100	Loss/train: 0.01601	Loss/test: 0.01717
Epoch: 4/100	Loss/train: 0.01512	Loss/test: 0.01669
Epoch: 5/100	Loss/train: 0.01498	Loss/test: 0.01640
Epoch: 6/100	Loss/train: 0.01402	Loss/test: 0.01614
Epoch: 7/100	Loss/train: 0.01345	Loss/test: 0.01585
Epoch: 8/100	Loss/train: 0.01305	Loss/test: 0.01560
Epoch: 9/100	Loss/train: 0.01308	Loss/test: 0.01539
Epoch: 10/100	Loss/train: 0.01320	Loss/test: 0.01497
Epoch: 11/100	Loss/train: 0.01298	Loss/test: 0.01460
Epoch: 12/100	Loss/train: 0.01190	Loss/test: 0.01435
Epoch: 13/100	Loss/train: 0.01219	Loss/test: 0.01406
Epoch: 14/100	Loss/train: 0.01181	Loss/test: 0.01374
Epoch: 15/100	Loss/train: 0.01074	Loss/test: 0.01358
Epoch: 16/100	Loss/train: 0.01077	Loss/test: 0.01331
Ep

SCORING

In [158]:
# Load the trained weights saved during training back into the model.
# NOTE: model_bert_trained is the same object as model_bert (an alias), not a
# freshly constructed network.
model_bert_trained = model_bert
# map_location lets the checkpoint load even when it was saved from a device
# that is not available in the current session.
model_bert_trained.load_state_dict(torch.load(save_path, map_location=device))
# Evaluation mode: dropout must be inactive while scoring.
model_bert_trained.eval()

# Use the model to predict on the test data
print('------------------------------------------------------------------')
print(f"\t\t\tScoring Essay: ")
print('------------------------------------------------------------------')
start_time_eval = time.time()
test_predictions_bert = scoring(model_bert_trained, test_loader_bert)
end_time_eval = time.time()
print('Evaluation time:', end_time_eval - start_time_eval)

# store train_df, test_df and predictions
train_df_bert = train_data
test_df_bert = test_data
preds_bert = test_predictions_bert

------------------------------------------------------------------
			Scoring Essay: 
------------------------------------------------------------------
Evaluation time: 0.18819427490234375


# RESULT

In [159]:
def get_results_df(test_df, model_preds):
    """Attach the predictions to the test frame and order the rows by essay id.

    Predictions are matched to rows by position (first prediction to the first
    row of ``test_df``, and so on).

    Args:
        test_df: DataFrame with at least the columns ``essay_id`` and ``score``.
        model_preds: sequence of predicted values, in the row order of ``test_df``.

    Returns:
        New DataFrame sorted by ``essay_id`` with a fresh index, containing a
        ``prediction`` column and with ``score`` moved to the last position.
    """
    # Frame of predictions; its single column is labelled 0
    predictions = pd.DataFrame(model_preds)

    # Positional index on the test rows so the join lines up with the predictions
    positional = test_df.reset_index(drop=True)
    merged = positional.join(predictions)
    merged = merged.rename(columns={0: 'prediction'})
    ordered = merged.sort_values(by='essay_id').reset_index(drop=True)

    # Remove 'score' and append it again so it becomes the last column
    score_column = ordered.pop('score')
    ordered['score'] = score_column

    return ordered

In [160]:
# Show the test essays together with their normalized predictions and scores.
get_results_df(test_df_bert, preds_bert)

Unnamed: 0,essay_id,essay_set,essay,prediction,score
0,73,1,"Dear The @ORGANIZATION1, ""@CAPS1, @CAPS1, @CAP...",0.655481,0.600000
1,79,1,"Computers have caused many people money, frien...",0.665865,0.800000
2,120,1,Dear @CAPS1: The effects that computers have o...,0.617562,0.600000
3,121,1,Do you think computers are helpful? Well you s...,0.624270,0.600000
4,142,1,As more and more people are becoming accustome...,0.702907,0.800000
...,...,...,...,...,...
146,21354,8,i woke up at @NUM1 from a cal from my cousin ...,0.536605,0.583333
147,21369,8,Laughter is the key I think that being happy ...,0.557194,0.483333
148,21407,8,The @CAPS1 of @CAPS2 @CAPS3 I think of laughte...,0.651194,0.666667
149,21509,8,So one day I was at home babysitting my little...,0.625259,0.500000


# DENORMALISASI

In [161]:
def inverse_normalize_score(score, min_max_range):
    """Map a normalized score back to its original range and round it.

    Args:
        score: normalized score.
        min_max_range: tuple ``(min_score, max_score)`` of the original range.

    Returns:
        ``round(score * (max_score - min_score) + min_score)``.
    """
    lowest, highest = min_max_range
    span = highest - lowest

    # Undo the min-max scaling, then round with the built-in round()
    return round(lowest + score * span)

def restore_original_scores(df, preds, min_max_ranges):
    """Convert normalized predictions and actual scores back to their original ranges.

    Args:
        df: DataFrame with the columns ``essay_set`` and ``score`` (normalized).
        preds: normalized predictions, one per row of ``df`` in row order.
        min_max_ranges: dict mapping an essay-set number to
            ``(min_score, max_score)``.

    Returns:
        A copy of ``df`` whose ``score`` column holds the restored actual
        scores and whose ``prediction`` column holds the restored predictions.
        ``df`` itself is left unchanged.
    """
    # Work on a copy so the caller's frame is not modified
    restored = df.copy()

    set_ids = restored['essay_set'].values
    normalized_actuals = restored['score'].values

    # Every row needs exactly one prediction
    assert len(preds) == len(restored), "Length of predictions does not match length of dataframe"

    # Restore each prediction / actual score with the range of its essay set
    prediction_column = []
    actual_column = []
    for pred, actual, set_id in zip(preds, normalized_actuals, set_ids):
        score_range = min_max_ranges[set_id]
        prediction_column.append(inverse_normalize_score(pred, score_range))
        actual_column.append(inverse_normalize_score(actual, score_range))

    restored['prediction'] = prediction_column
    restored['score'] = actual_column

    return restored

In [162]:
# Map the predicted and the actual scores back to their original ranges
restored_results_df_bert = restore_original_scores(test_df_bert, preds_bert, min_max_ranges)

# Print the result
print("Restored Results:")
restored_results_df_bert

Restored Results:


Unnamed: 0,essay_id,essay_set,essay,score,prediction
0,560,1,"Dear @CAPS1 Newspaper, @CAPS2 though computers...",8,8
1,1614,1,Dear @CAPS1 I think that computers are benefic...,9,8
2,1392,1,I think that computers have a good effect on p...,5,8
3,73,1,"Dear The @ORGANIZATION1, ""@CAPS1, @CAPS1, @CAP...",8,9
4,1055,1,"To whom it @MONTH1 concern, To many people it ...",8,8
...,...,...,...,...,...
146,21090,8,"The @CAPS1 of Laughter @CAPS2 friends and I, a...",37,42
147,21369,8,Laughter is the key I think that being happy ...,29,33
148,21407,8,The @CAPS1 of @CAPS2 @CAPS3 I think of laughte...,40,39
149,21204,8,It was the first day of sophomore year. I had...,31,39


# EVALUASI QWK

In [163]:
import numpy as np

def calculate_qwk(actuals, preds):
    """Compute the Quadratic Weighted Kappa (QWK) between two integer ratings.

    The rating scale is taken from the data: it spans the smallest to the
    largest value found in ``actuals`` and ``preds``.

    Args:
        actuals: iterable of integer reference ratings.
        preds: iterable of integer predicted ratings, aligned with ``actuals``.

    Returns:
        ``1 - sum(W * O) / sum(W * E)`` where ``W`` holds the quadratic
        weights, ``O`` the observed rating pairs and ``E`` the pair counts
        expected from the two marginal histograms. When every rating in both
        inputs is the same value the agreement is complete and ``1.0`` is
        returned (this case used to divide by zero).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    actual_list = list(actuals)
    pred_list = list(preds)

    if len(actual_list) != len(pred_list):
        raise ValueError("actuals and preds must have the same length")
    if not actual_list:
        raise ValueError("calculate_qwk() needs at least one pair of ratings")

    # Minimum and maximum define the rating scale
    min_rating = min(min(actual_list), min(pred_list))
    max_rating = max(max(actual_list), max(pred_list))

    # Total number of possible ratings
    num_ratings = max_rating - min_rating + 1

    # Only one rating value occurs: the raters agree on everything, and the
    # weights below would require a division by zero.
    if num_ratings == 1:
        return 1.0

    # Weight matrix W: squared distance between ratings, scaled to [0, 1]
    positions = np.arange(num_ratings)
    weight_mat = (positions[:, None] - positions[None, :]) ** 2 / (num_ratings - 1) ** 2

    # Observation matrix O: counts of (actual, predicted) pairs
    conf_mat = np.zeros((num_ratings, num_ratings))
    for actual, pred in zip(actual_list, pred_list):
        conf_mat[actual - min_rating][pred - min_rating] += 1

    # Expectation matrix E: outer product of the marginal histograms
    actual_hist = conf_mat.sum(axis=1)
    pred_hist = conf_mat.sum(axis=0)
    expected_mat = np.outer(actual_hist, pred_hist) / len(actual_list)

    # QWK value
    observed_disagreement = np.sum(weight_mat * conf_mat)
    expected_disagreement = np.sum(weight_mat * expected_mat)
    kappa_score = 1 - (observed_disagreement / expected_disagreement)

    return kappa_score

In [164]:
# Function that computes the QWK of every essay set
def calculate_qwk_per_set(df):
    """Compute the Quadratic Weighted Kappa separately for each essay set.

    Args:
        df: DataFrame with the columns ``essay_set``, ``score`` and
            ``prediction``; both score columns are cast to ``int``.

    Returns:
        dict mapping ``'Set <essay_set>'`` to the value ``calculate_qwk``
        returns for that set, in order of first appearance of the sets.
    """
    scores_by_set = {}

    for set_id in df['essay_set'].unique():
        # Rows that belong to the current essay set
        in_set = df['essay_set'] == set_id

        # Actual and predicted scores of the set as integers
        actual = df.loc[in_set, 'score'].astype(int)
        predicted = df.loc[in_set, 'prediction'].astype(int)

        scores_by_set[f'Set {set_id}'] = calculate_qwk(actual, predicted)

    return scores_by_set


In [165]:
# Compute the QWK of every essay set
qwk_per_set_bert = calculate_qwk_per_set(restored_results_df_bert)

# Show the QWK value of every set
for essay_set, qwk_score in qwk_per_set_bert.items():
    print(f"Quadratic Weighted Kappa Score for {essay_set}: {qwk_score}")

# Unweighted mean of the per-set QWK values
average_qwk_bert = np.mean(list(qwk_per_set_bert.values()))

# Show the average QWK value
print("Average Quadratic Weighted Kappa Score:", average_qwk_bert)

Quadratic Weighted Kappa Score for Set 1: 0.3367123832240111
Quadratic Weighted Kappa Score for Set 2: 0.2131979695431473
Quadratic Weighted Kappa Score for Set 7: 0.22077922077922085
Quadratic Weighted Kappa Score for Set 8: 0.3366336633663366
Average Quadratic Weighted Kappa Score: 0.27683080922817893
