In [1]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_nlp
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras import mixed_precision
import string



import nltk

# Fetch the NLTK corpora needed by the text preprocessing code (English
# stop-word list and WordNet for lemmatization).
# NOTE(review): in the recorded run both downloads failed with a name-resolution
# error (no network), so the notebook presumably relies on copies already
# installed under /usr/share/nltk_data — confirm for your environment.
nltk.download('stopwords')
nltk.download('wordnet')
# IPython shell escape (notebook-only): unpack the bundled WordNet archive next
# to the zip so a corpora/wordnet/ directory exists.
# NOTE(review): unzip runs without -o/-n/-q, so re-running this cell on a
# filesystem where the directory already exists may prompt or re-extract.
! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
from nltk.corpus import stopwords, cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

2024-06-23 12:45:32.870995: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-23 12:45:32.871103: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-23 12:45:33.018246: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


  pid, fd = os.forkpty()


Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [2]:
# Enable mixed precision training
# Builds a 'mixed_float16' dtype policy and installs it as the Keras global
# policy; the classifier constructed in the cross-validation cell is created
# after this call.
# NOTE(review): the model has a single regression output — confirm that the
# reduced-precision output is acceptable for score prediction.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)


In [3]:
# Read the competition CSVs (train first, then test) into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv',
    )
)

In [4]:
# Preview the top rows of the training frame, then of the test frame
print(train_df.head(), test_df.head(), sep='\n')

  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\n\nThis is a letter to arg...      3
  essay_id                                          full_text
0  000d118  Many people have car where they live. The thin...
1  000fe60  I am a scientist at NASA that is discussing th...
2  001ab80  People always wish they had the same technolog...


In [5]:
def remove_urls(text):
    """
    Strip every URL out of a string.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Each match is replaced by the
    empty string, so whitespace around the URL is left as it was.

    Parameters:
    text (str): The string to scan for URLs.

    Returns:
    text (str): The input with all matching URLs deleted.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def data_preprocessing(df):
    """
    Preprocesses the essays and stores the result in a new column.

    Each value of the 'full_text' column goes through, in order:
    - Lowercasing
    - Removing URLs (via remove_urls)
    - Removing punctuation (every character that is neither a word
      character nor whitespace)
    - Removing English stopwords
    - Lemmatization with WordNetLemmatizer

    The cleaned text is written to 'full_text_preprocessed'; the original
    'full_text' column is left untouched. The frame is modified in place
    and also returned.

    Parameters:
    df (DataFrame): DataFrame containing the essays in a 'full_text' column.

    Returns:
    df (DataFrame): The same DataFrame with 'full_text_preprocessed' added.
    """

    # Both objects are identical for every essay, so build them once per
    # call instead of once per row.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text_value):
        """Apply the full cleaning pipeline to a single essay string."""
        text_value = remove_urls(text_value.lower())
        text_value = re.sub(r'[^\w\s]', '', text_value)
        # Stop-word filtering and lemmatization share one pass over the
        # whitespace-separated tokens.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in text_value.split()
            if word not in stop_words
        )

    # Column-wise assignment replaces the row-by-row iterrows()/df.at loop and
    # creates the column even when the frame has no rows.
    df['full_text_preprocessed'] = df['full_text'].apply(_clean)

    return df

In [6]:
# Adds a 'full_text_preprocessed' column to train_df; 'full_text' is kept as is.
# NOTE(review): the feature arrays built in the next cell read 'full_text',
# not 'full_text_preprocessed' — confirm the cleaned text is meant to be unused.
train_df = data_preprocessing(train_df)

In [7]:
# Pull the raw essay text and the target scores out of the frames as
# independent NumPy arrays
train_features = train_df['full_text'].to_numpy(copy=True)
train_labels = train_df['score'].to_numpy(copy=True)
test_features = test_df['full_text'].to_numpy(copy=True)

In [8]:
# Define the number of folds for cross-validation
n_splits = 2  # Reduced number of folds for efficiency
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to hold the kappa scores for each fold
kappa_scores = []

# Valid score range, taken from the training labels. Regression outputs are
# clamped to it so that rounded predictions are always legal scores.
score_min, score_max = int(train_labels.min()), int(train_labels.max())


def _to_discrete_scores(raw_predictions):
    """Round raw regression outputs and return integer scores in [score_min, score_max]."""
    rounded = np.round(np.asarray(raw_predictions, dtype=np.float32)).flatten()
    return np.clip(rounded, score_min, score_max).astype(int)


# Perform cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(train_features), 1):
    X_train, X_val = train_features[train_index], train_features[val_index]
    y_train, y_val = train_labels[train_index], train_labels[val_index]

    # Pretrained classifier
    classifier = keras_nlp.models.DebertaV3Classifier.from_preset(
        "deberta_v3_base_en",
        num_classes=1,  # Single regression output for score prediction
    )

    # Compile the classifier with appropriate loss and optimizer
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=2e-5,
        decay_steps=10000,
        decay_rate=0.9
    )
    optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule)

    classifier.compile(
        loss=keras.losses.MeanSquaredError(),  # Use MSE for regression
        optimizer=optimizer,
        metrics=[keras.metrics.MeanAbsoluteError()],
        jit_compile=True,
    )

    # Include callbacks to save the best model and early stopping
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            filepath=f'/kaggle/working/best_model_fold_{fold}.keras',
            save_best_only=True,
            monitor='val_loss',
            mode='min',
        ),
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]

    # Fine-tune the model with a suitable batch size
    classifier.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_val, y_val),
        batch_size=10,
        epochs=5,
        callbacks=callbacks,
        verbose=1
    )

    # Load the best (lowest val_loss) checkpoint written for this fold
    classifier.load_weights(f'/kaggle/working/best_model_fold_{fold}.keras')

    # Make predictions on the validation set
    val_predictions = classifier.predict(x=X_val, batch_size=8)

    # Round predictions to valid integer scores for evaluation
    val_predictions_rounded = _to_discrete_scores(val_predictions)

    # Calculate the quadratic weighted kappa for THIS fold. This must stay
    # inside the loop: otherwise only the last fold is scored and the
    # "average" below is computed from a single value.
    kappa_score = cohen_kappa_score(y_val, val_predictions_rounded, weights='quadratic')
    kappa_scores.append(kappa_score)

    print(f"Fold {fold}: Quadratic Weighted Kappa: {kappa_score}")

# Calculate the average kappa score across all folds
average_kappa_score = np.mean(kappa_scores)
print(f"Average Quadratic Weighted Kappa: {average_kappa_score}")

# Make predictions on the test set using the final trained model
# NOTE(review): `classifier` here is the model from the last fold only;
# the earlier folds' models do not contribute to the submission.
test_predictions = classifier.predict(x=test_features, batch_size=8)

# Round predictions to valid integer scores for submission
test_predicted_scores = _to_discrete_scores(test_predictions)

# Prepare the submission file with correct formatting
submission_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],  # Ensure this column name matches your test dataset
    'score': test_predicted_scores
})


Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'task.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metada

Epoch 1/5


I0000 00:00:1719147011.796968      74 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608ms/step - loss: 0.9812 - mean_absolute_error: 0.7225

W0000 00:00:1719147670.427464      75 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m837s[0m 778ms/step - loss: 0.9807 - mean_absolute_error: 0.7224 - val_loss: 0.4383 - val_mean_absolute_error: 0.5171
Epoch 2/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m541s[0m 624ms/step - loss: 0.3950 - mean_absolute_error: 0.4883 - val_loss: 0.3824 - val_mean_absolute_error: 0.4821
Epoch 3/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m529s[0m 611ms/step - loss: 0.3342 - mean_absolute_error: 0.4515 - val_loss: 0.3989 - val_mean_absolute_error: 0.4927
Epoch 4/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m527s[0m 609ms/step - loss: 0.2786 - mean_absolute_error: 0.4137 - val_loss: 0.3904 - val_mean_absolute_error: 0.4842
Epoch 5/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m535s[0m 618ms/step - loss: 0.2429 - mean_absolute_error: 0.3859 - val_loss: 0.3751 - val_mean_absolute_error: 0.4769
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 122ms

Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'task.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/deberta_v3/keras/deberta_v3_base_en/2' to your Kaggle notebook...
Attaching 'metada

Epoch 1/5
[1m865/866[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 470ms/step - loss: 1.3402 - mean_absolute_error: 0.8058

W0000 00:00:1719151132.354175      74 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m852s[0m 786ms/step - loss: 1.3387 - mean_absolute_error: 0.8054 - val_loss: 0.4541 - val_mean_absolute_error: 0.5143
Epoch 2/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 614ms/step - loss: 0.4260 - mean_absolute_error: 0.5058 - val_loss: 0.4177 - val_mean_absolute_error: 0.4946
Epoch 3/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m529s[0m 611ms/step - loss: 0.3657 - mean_absolute_error: 0.4702 - val_loss: 0.4047 - val_mean_absolute_error: 0.4877
Epoch 4/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m533s[0m 615ms/step - loss: 0.3124 - mean_absolute_error: 0.4339 - val_loss: 0.3708 - val_mean_absolute_error: 0.4658
Epoch 5/5
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m519s[0m 600ms/step - loss: 0.2543 - mean_absolute_error: 0.3896 - val_loss: 0.5097 - val_mean_absolute_error: 0.5503
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 121ms

In [9]:
# Save the submission file
# index=False keeps the DataFrame index out of the CSV, so the file contains
# only the columns of submission_df.
submission_file_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_file_path, index=False)
print(f"Submission file saved to {submission_file_path}")

Submission file saved to /kaggle/working/submission.csv
