## Import

In [None]:
import re
import copy
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
from tqdm.auto import tqdm, trange
from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score, accuracy_score
import spacy
import en_core_web_sm
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
# Fetch the NLTK data packages; presumably these back the tokenizers and the
# stop-word list used by the feature functions below.
nltk.download('punkt') # Only needed if not already downloaded
nltk.download('stopwords') # Only needed if not already downloaded

# Shared spaCy pipeline object; `named_entities` reads this global.
nlp = en_core_web_sm.load()

## Load Data

In [None]:
# Directory holding the competition CSV files.
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
# Read both splits as pandas DataFrames.
train, test = (pd.read_csv(f"{PATH}{split}.csv") for split in ("train", "test"))
train.head(10)

In [None]:
def removeHTML(x):
    """Return `x` with every `<...>` span deleted.

    The match is non-greedy and `.` does not cross newlines, so a tag that
    spans several lines is left untouched.
    """
    return re.sub(r'<.*?>', r'', x)

def dataPreprocessing(x):
    """Normalise one essay (or paragraph) string for feature extraction.

    Steps, in order: lower-case, strip `<...>` tags via `removeHTML`, drop
    `@word` mentions, drop apostrophe-prefixed and plain digit runs, drop
    `http` followed by word characters, collapse whitespace runs to a single
    space, collapse repeated `.` and `,`, and trim both ends.

    Args:
        x: The raw text.

    Returns:
        The cleaned text.
    """
    x = x.lower()
    x = removeHTML(x)
    # Raw strings: "\w" / "\d" in a normal literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    x = re.sub(r"@\w+", '', x)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # NOTE(review): `\w+` stops at ':' so only the scheme ("https") of a URL
    # is removed, not the whole address. Left as-is to keep features stable.
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

In [None]:
# Clean every training essay, then discard the raw text column.
train['preprocessed'] = train['full_text'].map(
    lambda essay: dataPreprocessing(removeHTML(essay))
)
train.drop(columns=['full_text'], inplace=True)


In [None]:
def get_word_count(text):
    """Count the whitespace-separated tokens of `text` (coerced via `str`)."""
    words = str(text).split()
    return len(words)

def get_sentence_length_avg(text):
    """Return the mean number of word tokens per sentence of `text`.

    Sentences come from `sent_tokenize`, token counts from `word_tokenize`.
    """
    token_counts = []
    for sentence in sent_tokenize(text):
        token_counts.append(len(word_tokenize(sentence)))
    return np.mean(token_counts)

def get_vocab_richness(text):
    """Return the type-token ratio of the content words in `text`.

    Content words are the `word_tokenize` tokens that are purely alphabetic
    and not in NLTK's English stop-word list. The ratio is
    (number of distinct content words) / (number of content words).

    Returns:
        The ratio as a float, or 0.0 when the text has no content words
        (previously this case raised ZeroDivisionError).
    """
    stop_words = set(stopwords.words('english'))
    content_words = [
        word for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    ]
    if not content_words:
        return 0.0
    return len(set(content_words)) / len(content_words)

In [None]:
import spacy
# NOTE(review): the pipeline returned by this call is not assigned to
# anything; `named_entities` below uses the global `nlp` object instead.
spacy.load('en_core_web_sm')

def named_entities(text):
    """Run the global `nlp` pipeline on `text` and list its entities.

    Returns:
        A list of dicts, one per entity, with keys 'entity' (the entity
        converted with `str`) and 'label' (its `label_` attribute).
    """
    return [
        {'entity': str(span), 'label': span.label_}
        for span in nlp(text).ents
    ]

In [None]:
# Fetch the VADER lexicon; presumably required by the analyzer built below.
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared analyzer instance; `vader_sentiment` reads it as a global.
sid = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    """Return the 'compound' entry of the scores `sid` produces for `text`."""
    scores = sid.polarity_scores(text)
    return scores['compound']

In [None]:
# Phrases looked up by `check_cohesiveness`; some are multi-word.
discourse_markers = ['so', 'because', 'now', 'then', 'thus', 'therefore', 'hence', 'consequently', 'accordingly', 'instead', 'otherwise', 'moreover', 'furthermore', 'nevertheless', 'nonetheless', 'notwithstanding', 'anyway', 'despite', 'regardless', 'though', 'even though', 'although', 'despite that', 'in spite of', 'due to', 'owing to', 'because of', 'as a result', 'since', 'after', 'before', 'until', 'when', 'while', 'once', 'whenever', 'if', 'provided that', 'suppose', 'assume', 'given', 'granted', 'implying', 'whereupon', 'following', 'namely', 'that is', 'specifically', 'to be precise', 'indeed', 'certainly', 'surely']
connector_words = ['but', 'or', 'nor', 'for', 'and', 'yet', 'so', 'although', 'besides', 'because', 'except', 'than', 'as', 'when', 'while', 'until', 'about', 'before', 'after', 'above', 'below', 'to', 'of', 'at', 'in', 'into', 'throughout', 'during', 'against', 'underneath', 'between', 'among', 'behind', 'across', 'via', 'from', 'onto', 'off', 'over', 'aboard', 'away', 'near', 'by', 'alongside', 'inside', 'outside', 'next', 'under', 'opposite', 'around', 'beneath', 'within', 'without', 'aboveground', 'underground', 'upstairs', 'downstairs', 'here', 'there', 'where', 'why', 'how', 'whom', 'whatever', 'whenever', 'wherever', 'whoever', 'whichever']


def _count_phrases_present(text, phrases):
    """Count how many of `phrases` occur in `text` as whole words/phrases."""
    return sum(
        1 for phrase in phrases
        if re.search(r"\b" + re.escape(phrase) + r"\b", text)
    )


def check_cohesiveness(text):
    """Count the distinct discourse markers and connector words in `text`.

    Each list entry is counted at most once (presence, not frequency).
    Matching is case-sensitive and anchored on word boundaries, so 'so' no
    longer matches inside "also" and 'if' no longer matches inside "life",
    as the previous plain substring test did.

    Args:
        text: The text to inspect.

    Returns:
        A dict with keys 'dm_count' and 'cw_count'.
    """
    return {
        'dm_count': _count_phrases_present(text, discourse_markers),
        'cw_count': _count_phrases_present(text, connector_words),
    }

In [None]:
# Add feature engineering columns: one new column per (name, extractor) pair,
# each computed from the cleaned essay text.
for feature_name, extractor in (
    ("word_count", get_word_count),
    ("sentence_length_avg", get_sentence_length_avg),
    ("vocab_richness", get_vocab_richness),
    ('vader_sentiment', vader_sentiment),
):
    train[feature_name] = train["preprocessed"].apply(extractor)

# The named_entities and cohesiveness features are intentionally not computed.
train.head(5)

In [None]:
# Clean the test essays and drop the raw text, mirroring the training cells.
test['preprocessed'] = test['full_text'].map(
    lambda essay: dataPreprocessing(removeHTML(essay))
)
test.drop(columns=['full_text'], inplace=True)

# Same engineered columns as for train, in the same order.
for feature_name, extractor in (
    ("word_count", get_word_count),
    ("sentence_length_avg", get_sentence_length_avg),
    ("vocab_richness", get_vocab_richness),
    ('vader_sentiment', vader_sentiment),
):
    test[feature_name] = test["preprocessed"].apply(extractor)

# The named_entities and cohesiveness features are intentionally not computed.
test.head(5)

In [None]:
# Keep pandas copies of the engineered frames with the cleaned text exposed
# under the original 'full_text' column name.
train_engineered, test_engineered = (
    frame.rename(columns={'preprocessed': 'full_text'})
    for frame in (train, test)
)

# Persist both frames to the working directory.
train_engineered.to_csv('train_engineered.csv')
test_engineered.to_csv('test_engineered.csv')

In [None]:
# Extra column added on load: the essay split on blank lines into paragraphs.
columns = [pl.col("full_text").str.split(by="\n\n").alias("paragraph")]
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
# Re-read both splits with polars; this rebinds `train` and `test`.
train, test = (
    pl.read_csv(f"{PATH}{split}.csv").with_columns(columns)
    for split in ("train", "test")
)
train.head(1)

In [None]:
def removeHTML(x):
    """Return `x` with every `<...>` span deleted.

    The match is non-greedy and `.` does not cross newlines, so a tag that
    spans several lines is left untouched.
    """
    return re.sub(r'<.*?>', r'', x)

def dataPreprocessing(x):
    """Normalise one essay (or paragraph) string for feature extraction.

    Steps, in order: lower-case, strip `<...>` tags via `removeHTML`, drop
    `@word` mentions, drop apostrophe-prefixed and plain digit runs, drop
    `http` followed by word characters, collapse whitespace runs to a single
    space, collapse repeated `.` and `,`, and trim both ends.

    Args:
        x: The raw text.

    Returns:
        The cleaned text.
    """
    x = x.lower()
    x = removeHTML(x)
    # Raw strings: "\w" / "\d" in a normal literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    x = re.sub(r"@\w+", '', x)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # NOTE(review): `\w+` stops at ':' so only the scheme ("https") of a URL
    # is removed, not the whole address. Left as-is to keep features stable.
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

In [None]:
def Paragraph_Preprocess(tmp):
    """Expand a frame to one cleaned paragraph per row with size columns.

    The list column 'paragraph' is exploded, each paragraph is passed through
    `dataPreprocessing`, and three columns are appended:
    'paragraph_len' (character count), 'paragraph_sentence_cnt' (pieces when
    split on '.') and 'paragraph_word_cnt' (pieces when split on ' ').
    """
    paragraph = pl.col('paragraph')
    cleaned = tmp.explode('paragraph').with_columns(
        paragraph.map_elements(dataPreprocessing)
    )
    return cleaned.with_columns(
        paragraph.map_elements(lambda p: len(p)).alias("paragraph_len"),
        paragraph.map_elements(lambda p: len(p.split('.'))).alias("paragraph_sentence_cnt"),
        paragraph.map_elements(lambda p: len(p.split(' '))).alias("paragraph_word_cnt"),
    )

# Per-paragraph numeric columns that get summarised per essay.
paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']

def Paragraph_Eng(train_tmp):
    """Aggregate paragraph rows into one feature row per essay.

    For each 'essay_id' this counts paragraphs whose length is at least each
    of several lower bounds, counts paragraphs whose length is at most 25 and
    at most 49, and takes max/mean/min/first/last of every column listed in
    `paragraph_fea`. The result is sorted by 'essay_id' and returned as a
    pandas DataFrame.
    """
    text = pl.col('paragraph')
    length = pl.col('paragraph_len')

    aggs = []
    for bound in [50,75,100,125,150,175,200,250,300,350,400,500,600,700]:
        aggs.append(text.filter(length >= bound).count().alias(f"paragraph_{bound}_cnt"))
    for bound in [25,49]:
        aggs.append(text.filter(length <= bound).count().alias(f"paragraph_{bound}_cnt"))
    # Summary statistics, grouped by statistic and then by column.
    for stat in ("max", "mean", "min", "first", "last"):
        for fea in paragraph_fea:
            aggs.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()
tmp = Paragraph_Preprocess(train)
train_feats = Paragraph_Eng(tmp)
# Attach the target by essay_id rather than by row position: Paragraph_Eng
# returns rows sorted by essay_id, so a positional assignment from `train`
# is only correct when the input file already has that order.
train_feats = train_feats.merge(
    train.select(['essay_id', 'score']).to_pandas(),
    on='essay_id',
    how='left',
)

# Every column except the key and the target is a model feature.
feature_names = [col for col in train_feats.columns if col not in ('essay_id', 'score')]
print('Features Number: ',len(feature_names))
train_feats.head(3)

In [None]:
def Sentence_Preprocess(tmp):
    """Expand a frame to one sentence per row with size columns.

    'full_text' is cleaned with `dataPreprocessing` and split on '.', the
    pieces are exploded into a 'sentence' column, pieces shorter than 15
    characters are dropped, and 'sentence_len' (character count) and
    'sentence_word_cnt' (pieces when split on ' ') are appended.
    """
    sentences = (
        pl.col('full_text')
        .map_elements(dataPreprocessing)
        .str.split(by=".")
        .alias("sentence")
    )
    exploded = tmp.with_columns(sentences).explode('sentence')
    measured = exploded.with_columns(
        pl.col('sentence').map_elements(lambda s: len(s)).alias("sentence_len")
    )
    kept = measured.filter(pl.col('sentence_len') >= 15)
    return kept.with_columns(
        pl.col('sentence').map_elements(lambda s: len(s.split(' '))).alias("sentence_word_cnt")
    )
# Per-sentence numeric columns that get summarised per essay.
sentence_fea = ['sentence_len','sentence_word_cnt']
def Sentence_Eng(train_tmp):
    """Aggregate sentence rows into one feature row per essay.

    For each 'essay_id' this counts sentences whose length is at least each
    of several lower bounds and takes max/mean/min/first/last of every column
    listed in `sentence_fea`. The result is sorted by 'essay_id' and returned
    as a pandas DataFrame.
    """
    text = pl.col('sentence')
    length = pl.col('sentence_len')

    aggs = []
    for bound in [15,50,100,150,200,250,300]:
        aggs.append(text.filter(length >= bound).count().alias(f"sentence_{bound}_cnt"))
    # Summary statistics, grouped by statistic and then by column.
    for stat in ("max", "mean", "min", "first", "last"):
        for fea in sentence_fea:
            aggs.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()

# Build the sentence-level features and join them onto the running table.
tmp = Sentence_Preprocess(train)
train_feats = train_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')

# Every column except the key and the target is a model feature.
feature_names = [col for col in train_feats.columns if col not in ['essay_id','score']]
print('Features Number: ',len(feature_names))
train_feats.head(3)

In [None]:
def Word_Preprocess(tmp):
    """Expand a frame to one word per row with a 'word_len' column.

    'full_text' is cleaned with `dataPreprocessing` and split on ' ', the
    pieces are exploded into a 'word' column, their character counts are
    stored in 'word_len', and zero-length pieces are dropped.
    """
    words = (
        pl.col('full_text')
        .map_elements(dataPreprocessing)
        .str.split(by=" ")
        .alias("word")
    )
    exploded = tmp.with_columns(words).explode('word')
    measured = exploded.with_columns(
        pl.col('word').map_elements(lambda w: len(w)).alias("word_len")
    )
    return measured.filter(pl.col('word_len') != 0)
def Word_Eng(train_tmp):
    """Aggregate word rows into one feature row per essay.

    For each 'essay_id' this counts words of length >= k for k = 1..15 and
    computes max, mean, standard deviation and the 0.25/0.50/0.75 quantiles
    of 'word_len'. The result is sorted by 'essay_id' and returned as a
    pandas DataFrame.
    """
    word = pl.col('word')
    word_len = pl.col('word_len')

    aggs = [
        word.filter(word_len >= min_len).count().alias(f"word_{min_len}_cnt")
        for min_len in range(1, 16)
    ]
    aggs.extend([
        word_len.max().alias("word_len_max"),
        word_len.mean().alias("word_len_mean"),
        word_len.std().alias("word_len_std"),
        word_len.quantile(0.25).alias("word_len_q1"),
        word_len.quantile(0.50).alias("word_len_q2"),
        word_len.quantile(0.75).alias("word_len_q3"),
    ])

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()

# Build the word-level features and join them onto the running table.
tmp = Word_Preprocess(train)
train_feats = train_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

# Every column except the key and the target is a model feature.
feature_names = [col for col in train_feats.columns if col not in ['essay_id','score']]
print('Features Number: ',len(feature_names))
train_feats.head(3)

In [None]:
# Keep only the id and engineered columns; the target and the text are
# dropped from this frame before it is merged into the feature table.
train_engineered.drop(columns=['score','full_text'], inplace=True)
train_engineered.head(3)


In [None]:
#train_engineered.drop(['Unnamed: 0','preprocessed','score'],axis =1 , inplace = True)
#test_engineered.drop(['Unnamed: 0','preprocessed'],axis =1 , inplace = True)

In [None]:
# Join the engineered columns with the aggregate features on essay_id.
train_feats = train_engineered.merge(train_feats, on='essay_id')
train_feats.head(3)

In [None]:
#train_feats.drop(['named_entities','cohesiveness','preprocessed','Unnamed: 0'], axis=1, inplace=True)

In [None]:
# TF-IDF over 1- to 3-grams, keeping terms present in 5%..95% of essays.
# NOTE(review): with identity `tokenizer`/`preprocessor` lambdas applied to
# raw strings, the tokens appear to be single characters rather than words -
# confirm this is intended before changing it (saved models depend on it).
vectorizer = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(1,3),
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)

train_tfid = vectorizer.fit_transform(train['full_text'].to_list())

# Densify so the TF-IDF columns can be merged into the pandas feature table.
dense_matrix = train_tfid.toarray()

df = pd.DataFrame(dense_matrix)
tfid_columns = [f'tfid_{i}' for i in range(df.shape[1])]
df.columns = tfid_columns
# Key each TF-IDF row with the id of the essay it was computed from (`train`),
# instead of borrowing ids by position from a differently-built frame.
df['essay_id'] = train['essay_id'].to_list()

train_feats = train_feats.merge(df, on='essay_id', how='left')

feature_names = [
    col for col in train_feats.columns
    if col not in ('essay_id', 'score', 'full_text')
]
print('Features Number: ',len(feature_names))
train_feats.head(3)

## Train
* The model has already been trained and saved.
* Set `LOAD` below to choose between loading the saved model and retraining.

In [None]:
# idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
def quadratic_weighted_kappa(y_true, y_pred):
    """Evaluation metric: quadratic weighted kappa on the original score scale.

    Both inputs are on the shifted scale (score minus the global `a`), so `a`
    is added back before scoring. Predictions are clipped to [1, 6] and
    rounded. Labels are rounded as well, so that floating-point residue from
    the subtract/add round trip cannot turn an integer score into a separate
    label value.

    Args:
        y_true: Shifted true scores.
        y_pred: Shifted raw predictions.

    Returns:
        The tuple ('QWK', value, True); the last element presumably marks the
        metric as higher-is-better for LightGBM.
    """
    y_true = np.round(y_true + a)
    y_pred = (y_pred + a).clip(1, 6).round()
    qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return 'QWK', qwk, True



def qwk_obj(y_true, y_pred):
    """Custom training objective used in place of a built-in loss.

    Inputs are on the shifted scale (score minus the global `a`). With
    f = 1/2 * sum((preds - labels)^2) and g = 1/2 * sum((preds - a)^2 + b),
    the returned gradient is that of f / g with respect to each prediction,
    scaled by the number of samples. Predictions are clipped to [1, 6] after
    shifting back.

    Returns:
        (grad, hess) where `hess` is an array of ones.
    """
    n_samples = len(y_true)
    labels = y_true + a
    preds = (y_pred + a).clip(1, 6)

    residual = preds - labels
    offset = preds - a
    f = 1/2*np.sum(residual**2)
    g = 1/2*np.sum(offset**2+b)

    grad = (residual/g - f*offset/g**2)*n_samples
    hess = np.ones(n_samples)
    return grad, hess
# Constants of the objective: `a` is the shift applied to the scores,
# `b` is the additive term inside the denominator `g`.
a = 2.948
b = 1.092

In [None]:
# Switch: True loads five saved boosters from disk, False retrains them.
LOAD = False
# One LightGBM booster per fold, filled by either branch below.
models = []
if LOAD:
    for i in range(5):
        models.append(lgb.Booster(model_file=f'../input/lal-lgb-baseline-4/fold_{i}.txt'))
else:
    # Per-fold out-of-fold prediction frames, concatenated after the loop.
    oof = []
    x= train_feats
    y= train_feats['score'].values
    # 5-fold shuffled split with a fixed seed.
    # NOTE(review): plain KFold is used, so the stringified `y` passed to
    # split() presumably has no effect on the folds - confirm.
    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    # Log every 25 rounds; stop after 75 rounds without improvement.
    # The same callback objects are reused for every fold.
    callbacks = [log_evaluation(period=25), early_stopping(stopping_rounds=75,first_metric_only=True)]
    for fold_id, (trn_idx, val_idx) in tqdm(enumerate(kfold.split(x.copy(), y.copy().astype(str)))):
            # Fresh regressor per fold, trained with the custom objective.
            model = lgb.LGBMRegressor(
                objective = qwk_obj,
                metrics = 'None',
                learning_rate = 0.05,
                max_depth = 5,
                num_leaves = 10,
                colsample_bytree=0.5,
                reg_alpha = 0.1,
                reg_lambda = 0.8,
                n_estimators=1050,
                random_state=42,
                verbosity = - 1)
            # Targets are shifted by `a`; qwk_obj and the eval metric add it back.
            X_train = train_feats.iloc[trn_idx][feature_names]
            Y_train = train_feats.iloc[trn_idx]['score'] - a

            X_val = train_feats.iloc[val_idx][feature_names]
            Y_val = train_feats.iloc[val_idx]['score'] - a
            print('\nFold_{} Training ================================\n'.format(fold_id+1))
            lgb_model = model.fit(X_train,
                                  Y_train,
                                  eval_names=['train', 'valid'],
                                  eval_set=[(X_train, Y_train), (X_val, Y_val)],
                                  eval_metric=quadratic_weighted_kappa,
                                  callbacks=callbacks,)
            # Predict the held-out rows at the best iteration found.
            pred_val = lgb_model.predict(
                X_val, num_iteration=lgb_model.best_iteration_)
            df_tmp = train_feats.iloc[val_idx][['essay_id', 'score']].copy()
            # Undo the target shift so 'pred' is on the original score scale.
            df_tmp['pred'] = pred_val + a
            oof.append(df_tmp)
            models.append(model.booster_)
            # Persist the fold's booster to the working directory.
            lgb_model.booster_.save_model(f'fold_{fold_id}.txt')
    # Out-of-fold predictions for all rows; only defined when LOAD is False.
    df_oof = pd.concat(oof)

### CV

In [None]:
# Report accuracy and quadratic weighted kappa. When models were retrained the
# values come from the out-of-fold predictions; otherwise fixed numbers are
# printed.
if not LOAD:
    oof_labels = df_oof['pred'].clip(1, 6).round()
    acc = accuracy_score(df_oof['score'], oof_labels)
    kappa = cohen_kappa_score(df_oof['score'], oof_labels, weights="quadratic")
    print('acc: ',acc)
    print('kappa: ',kappa)
else:
    print('acc: ',0.6275495464263015)
    print('kappa: ',0.7990509565910948)

## Submission

In [None]:
# Rebuild the same feature table for the test essays: paragraph, sentence and
# word aggregates, then the engineered columns, then TF-IDF.
tmp = Paragraph_Preprocess(test)
test_feats = Paragraph_Eng(tmp)
tmp = Sentence_Preprocess(test)
test_feats = test_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')
tmp = Word_Preprocess(test)
test_feats = test_feats.merge(Word_Eng(tmp), on='essay_id', how='left')
test_feats = pd.merge(test_engineered, test_feats, on='essay_id')

# Reuse the vectorizer fitted on train; only transform here.
test_tfid = vectorizer.transform(test['full_text'].to_list())
dense_matrix = test_tfid.toarray()
df = pd.DataFrame(dense_matrix)
tfid_columns = [f'tfid_{i}' for i in range(df.shape[1])]
df.columns = tfid_columns
# Key each TF-IDF row with the id of the essay it was computed from (`test`),
# instead of borrowing ids by position from a differently-built frame.
df['essay_id'] = test['essay_id'].to_list()
test_feats = test_feats.merge(df, on='essay_id', how='left')

In [None]:
# Display the assembled test feature table (notebook cell output).
test_feats

In [None]:
# Features number
feature_names = list(filter(lambda x: x not in ['essay_id','score','full_text'], test_feats.columns))
print('Features number: ',len(feature_names))
test_feats.head(3)

In [None]:
prediction = test_feats[['essay_id']].copy()
prediction['score'] = 0

if not models:
    raise ValueError("no models available for prediction")

# Select the feature columns once instead of once per model.
test_matrix = test_feats[feature_names]
# Average the per-model predictions (shifted back by `a`) over however many
# models were loaded or trained, rather than assuming exactly five.
pred_test = sum(model.predict(test_matrix) + a for model in models) / len(models)
print(pred_test)

In [None]:
# Snap the averaged predictions to the score range 1..6 and write the file.
pred_test = np.round(np.clip(pred_test, 1, 6))
prediction['score'] = pred_test
prediction.to_csv('submission.csv', index=False)
prediction.head(3)