In [41]:
import math
import os
import re
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import spacy

In [42]:
# Load the subtask-1 training release. Later cells read its user_id, timestamp,
# text, is_words, text_id, valence and arousal columns.
df = pd.read_csv('TRAIN_RELEASE_3SEP2025/train_subtask1.csv')

In [43]:
def data_split(df, seed=11, u_val=0.10, u_test=0.15, tr=0.75, vr=(0.4,0.6), ms=3, mt=1):
    """Label every row with a split, holding out whole users as "unseen".

    The input is copied, its ``timestamp`` column is parsed to datetimes and the
    rows are sorted by ``(user_id, timestamp)`` with a fresh 0..n-1 index.

    Users are divided into three kinds:

    * ``val_unseen`` / ``test_unseen`` -- every row of the user goes to that split.
      Users with fewer than ``ms`` rows ("tiny" users) are placed at the front of
      the hold-out pool, in ascending ``user_id`` order, followed by the remaining
      users in seeded random order.
    * seen users -- rows are cut chronologically into ``train``, ``val_seen`` and
      ``test_seen``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``timestamp`` columns.
    seed : int
        Seed for the NumPy generator that shuffles users and draws the val share.
    u_val, u_test : float
        Fraction of users held out for ``val_unseen`` / ``test_unseen``
        (at least one user each).
    tr : float
        Fraction of a seen user's rows assigned to ``train`` (at least one row).
    vr : tuple(float, float)
        Range from which the share of the remaining rows given to ``val_seen``
        is drawn uniformly, once per seen user.
    ms : int
        Users with fewer rows than this are treated as tiny.
    mt : int
        Rows to reserve for ``test_seen`` when the validation cut would leave
        fewer; rows already assigned to ``train`` are never taken back.

    Returns
    -------
    pandas.DataFrame
        The sorted copy with two extra columns: ``split`` and ``group``
        (``'unseen'`` for the two hold-out splits, ``'seen'`` otherwise).
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['user_id','timestamp']).reset_index(drop=True)

    rng = np.random.default_rng(seed)
    users = df['user_id'].unique()

    # groupby sorts its keys, so this list has a stable order. Iterating a set of
    # ids instead would make the hold-out assignment depend on hash order.
    rows_per_user = df.groupby('user_id').size()
    tiny = rows_per_user[rows_per_user < ms].index.tolist()
    tiny_lookup = set(tiny)

    n_users = len(users)
    n_val = max(1, int(round(n_users * u_val)))
    n_tst = max(1, int(round(n_users * u_test)))

    cand = [u for u in users if u not in tiny_lookup]
    rng.shuffle(cand)
    pool = tiny + cand
    u_val_set  = set(pool[:n_val])
    u_test_set = set(pool[n_val:n_val+n_tst])

    def labels_for(uid, n):
        """Return the list of ``n`` split labels for one user's time-ordered rows."""
        if uid in u_val_set:
            return ['val_unseen'] * n
        if uid in u_test_set:
            return ['test_unseen'] * n
        tr_end = max(1, int(np.floor(n * tr)))
        remainder = n - tr_end
        p = float(rng.uniform(vr[0], vr[1]))
        val_end = tr_end + int(np.floor(remainder * p))
        if n - val_end < mt:
            # Give rows back to test_seen, but never shrink the train part.
            val_end = max(tr_end, n - mt)
        return (['train'] * tr_end +
                ['val_seen'] * max(0, val_end - tr_end) +
                ['test_seen'] * (n - val_end))

    # Build the labels group by group (sorted by user_id, so the random draws happen
    # in a fixed order) instead of mutating each group inside groupby.apply.
    row_index, row_labels = [], []
    for uid, g in df.groupby('user_id', sort=True):
        row_index.extend(g.index)
        row_labels.extend(labels_for(uid, len(g)))
    df['split'] = pd.Series(row_labels, index=row_index, dtype=object)

    # groupby skips rows whose user_id is missing; drop them so every returned row
    # carries a split label.
    df = df[df['split'].notna()].copy()
    df['group'] = np.where(df['split'].isin(['val_unseen','test_unseen']), 'unseen', 'seen')
    return df


In [44]:
# Label every row once, then materialise the three partitions. Seen and unseen
# users are pooled inside val and inside test; the `group` column keeps them apart.
df = data_split(df, seed=11)

train = df[df['split'] == 'train']
val   = df[df['split'].isin(['val_seen','val_unseen'])]
test  = df[df['split'].isin(['test_seen','test_unseen'])]

# to_csv does not create missing parent directories, so make sure split/ exists.
os.makedirs("split", exist_ok=True)
train.to_csv("split/train.csv", index=False)
val.to_csv("split/val.csv", index=False)
test.to_csv("split/test.csv", index=False)

  df = df.groupby('user_id', group_keys=False).apply(split_user)


In [45]:
def analyze(df: pd.DataFrame):
    """Print and return summary tables describing a labelled split.

    A working copy of ``df`` gets a coarse ``set`` column (``train`` / ``val`` /
    ``test``) derived from ``split``. The function then tabulates row counts,
    distinct-user counts and the ``is_words`` distribution at several levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``split``, ``group``, ``user_id``, ``is_words`` and
        ``text_id``.

    Returns
    -------
    dict
        Table name -> DataFrame, in the order the tables are printed.
    """
    data = df.copy()

    def _to_set(label):
        # 'train' stays as is; any label containing 'val_' is validation, the rest is test.
        if label == 'train':
            return 'train'
        return 'val' if 'val_' in label else 'test'

    data['set'] = data['split'].map(_to_set)

    def _row_counts(level):
        return data[level].value_counts().rename('rows').to_frame()

    def _user_counts(keys):
        return data.groupby(keys)['user_id'].nunique()

    def _is_words_counts(index):
        return data.pivot_table(index=index, columns='is_words', values='text_id',
                                aggfunc='count', fill_value=0)

    out = {}

    # Fine-grained view: one entry per split label.
    out['rows_per_split'] = _row_counts('split')
    out['users_per_split'] = _user_counts('split').rename('users').to_frame()
    out['users_seen_unseen_per_split'] = (
        _user_counts(['split','group']).rename('users').reset_index()
    )

    # Coarse view: train / val / test.
    out['rows_per_set'] = _row_counts('set')
    out['users_per_set'] = _user_counts('set').rename('users').to_frame()
    out['users_seen_unseen_per_set'] = (
        _user_counts(['set','group'])
        .unstack('group', fill_value=0)
        .rename_axis(None, axis=1)
    )

    # is_words counts and row-normalised shares at each grouping level.
    pivots = (
        ('is_words_counts_split',       'is_words_pct_split',       'split'),
        ('is_words_counts_set',         'is_words_pct_set',         'set'),
        ('is_words_counts_group_split', 'is_words_pct_group_split', ['group','split']),
        ('is_words_counts_group_set',   'is_words_pct_group_set',   ['group','set']),
    )
    for counts_key, pct_key, index in pivots:
        counts = _is_words_counts(index)
        out[counts_key] = counts
        out[pct_key] = counts.div(counts.sum(axis=1), axis=0).round(3)

    for name, table in out.items():
        print(f"\n== {name} ==")
        print(table)

    return out

# Print the split diagnostics for the labelled frame and keep the tables for later inspection.
res = analyze(df)


== rows_per_split ==
             rows
split            
train        1529
test_unseen   539
test_seen     332
val_seen      229
val_unseen    135

== users_per_split ==
             users
split             
test_seen      102
test_unseen     21
train          102
val_seen        83
val_unseen      14

== users_seen_unseen_per_split ==
         split   group  users
0    test_seen    seen    102
1  test_unseen  unseen     21
2        train    seen    102
3     val_seen    seen     83
4   val_unseen  unseen     14

== rows_per_set ==
       rows
set        
train  1529
test    871
val     364

== users_per_set ==
       users
set         
test     123
train    102
val       97

== users_seen_unseen_per_set ==
       seen  unseen
set                
test    102      21
train   102       0
val      83      14

== is_words_counts_split ==
is_words     False  True 
split                    
test_seen      169    163
test_unseen    306    233
train          687    842
val_seen       115    1

In [None]:
# When a negator is found in a word's left context, adjust_va replaces its
# valence v by -NEG_BETA * v.
NEG_BETA = 0.8
# Decay applied to a modifier 1, 2 or 3 tokens to the left of the scored word.
DECAY = [1.0, 0.95, 0.90]
# Part-of-speech weights, currently disabled (nothing below reads them).
# POS_W  = {'ADJ':1.4,'VERB':1.2,'NOUN':1.0,'ADV':1.0}
# POS_DEF = 0.5
# Contrast markers: aggregate_doc down-weights tokens before the first one (x0.3)
# relative to tokens from the marker onward (x0.7).
BUT = {'but','however','though','although','yet'}
# Negators looked up among the lemmas of the left-context window.
NEG = {'not',"n't",'never','no','without','none','hardly','scarcely','barely'}
# Intensifiers add +0.3 * decay to the gain; downtoners add -0.2 * decay.
INT  = {'very','really','extremely','so','super','absolutely','totally','too','quite','highly','truly','deeply','incredibly','terribly','awfully'}
DOWN = {'slightly','somewhat','rather','kinda','a bit','bit','mildly','partly'}
# Arousal added per exclamation mark; bump_excl counts at most three marks.
EXCL_BUMP = 0.05

# spaCy English pipeline with the NER component disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
# Punctuation that ends the left-context window in _left_idxs.
PUNCT_STOP = {'.',',',';','?','!','—','-','…',':','–'}

def _norm_elong(s): return re.sub(r'(.)\1{2,}', r'\1\1', s)

def load_lexicon(path):
    """Load a tab-separated valence/arousal lexicon into a ``{term: (v, a)}`` dict.

    The file must have a header row containing (case-insensitively) the columns
    ``term``, ``valence`` and ``arousal``; other columns are ignored.

    Processing steps:

    * terms are stripped and lower-cased; empty terms are discarded;
    * valence and arousal are parsed as numbers and clipped to ``[-1, 1]``;
      rows where either value is not numeric are discarded;
    * duplicate terms (after normalisation) are averaged.

    Parameters
    ----------
    path : str or path-like
        Location of the lexicon file.

    Returns
    -------
    dict[str, tuple[float, float]]
        Term -> ``(valence, arousal)``.
    """
    # keep_default_na=False: with pandas' default NA parsing, ordinary lexicon words
    # such as "null", "nan" or "none" are read as missing values and would be
    # silently dropped. Missing numbers are handled by to_numeric below instead.
    df = pd.read_csv(path, sep="\t", engine="python", keep_default_na=False)
    df.columns = [c.strip().lower() for c in df.columns]
    df = df.rename(columns={"valence":"v","arousal":"a"})[["term","v","a"]].copy()
    df["term"] = df["term"].astype(str).str.strip().str.lower()
    df["v"] = pd.to_numeric(df["v"], errors="coerce").clip(-1, 1)
    df["a"] = pd.to_numeric(df["a"], errors="coerce").clip(-1, 1)
    df = (
        df[df["term"] != ""]
        .dropna(subset=["v","a"])
        .groupby("term", as_index=False)
        .mean()
    )
    return {r.term:(float(r.v), float(r.a)) for r in df.itertuples(index=False)}

def parse_text(x, is_words=False):
    """Tokenise one entry and count its exclamation marks.

    Parameters
    ----------
    x : str or None
        Raw text. ``None`` and float NaN (pandas' missing-value marker) are
        treated as an empty string; any other non-string value is converted
        with ``str``.
    is_words : bool
        If true, ``x`` is a list of words separated by ``,``, ``|``, ``;`` or
        ``/``. Each non-empty item becomes one token whose lemma is the
        stripped, lower-cased item and whose POS tag is fixed to ``'ADJ'``.
        Otherwise the text is run through the spaCy pipeline ``nlp``.

    Returns
    -------
    (list[dict], int)
        Tokens as dicts with keys ``lemma``, ``pos``, ``txt`` and ``punct``, and
        the number of ``'!'`` characters counted after elongation normalisation.
    """
    # A NaN cell is truthy, so `x or ""` would hand a float to the regex and raise.
    if x is None or (isinstance(x, float) and math.isnan(x)):
        x = ""
    x = _norm_elong(str(x))
    ex = x.count('!')
    if is_words:
        items = [w.strip().lower() for w in re.split(r'[,\|;/]+', x) if w.strip()]
        toks = [{'lemma':w, 'pos':'ADJ', 'txt':w, 'punct':False} for w in items]
        return toks, ex
    doc = nlp(x)
    toks = [
        {'lemma':t.lemma_.lower(), 'pos':t.pos_, 'txt':t.text, 'punct':t.is_punct}
        for t in doc if not t.is_space
    ]
    return toks, ex

def build_idf(texts, flags, LEX):
    """Compute a smoothed IDF weight for every lexicon term.

    Each text is parsed with ``parse_text`` (``flags`` supplies the matching
    ``is_words`` value). A term's document frequency is the number of texts in
    which it occurs at least once as a token lemma. The weight is
    ``log((N + 1) / (doc_freq + 1)) + 1`` with ``N = len(texts)``.

    Returns a dict with one entry per key of ``LEX``, including terms that never
    occur in ``texts``.
    """
    n_docs = len(texts)
    doc_freq = Counter()
    for text, word_flag in zip(texts, flags):
        tokens, _ = parse_text(text, bool(word_flag))
        # A set, so repeated occurrences within one text count once.
        doc_freq.update({tok['lemma'] for tok in tokens if tok['lemma'] in LEX})
    return {
        term: math.log((n_docs + 1) / (doc_freq[term] + 1)) + 1.0
        for term in LEX.keys()
    }

def _left_idxs(toks, i, k=3):
    out, j = [], i-1
    while j >= 0 and len(out) < k:
        if toks[j]['punct'] and toks[j]['txt'] in PUNCT_STOP: break
        out.append(j); j -= 1
    return out

def adjust_va(v, a, toks, i):
    """Adjust a word's lexicon valence/arousal using its left context.

    The up-to-three tokens returned by ``_left_idxs`` are inspected, nearest
    first. Intensifiers (``INT``) raise and downtoners (``DOWN``) lower a gain,
    each scaled by the position's ``DECAY`` factor; both ``v`` and ``a`` are
    multiplied by ``1 + gain``. If any window lemma is in ``NEG``, valence is
    then multiplied by ``-NEG_BETA``.

    Returns ``(v, a)`` as floats, clipped to ``[-2, 2]`` and ``[-1, 1]``.
    """
    gain = 0.0
    negated = False
    for rank, idx in enumerate(_left_idxs(toks, i, 3)):
        lemma = toks[idx]['lemma']
        # Positions beyond the DECAY table reuse its last entry.
        decay = DECAY[rank] if rank < len(DECAY) else DECAY[-1]
        if lemma in INT:
            gain += 0.3 * decay
        if lemma in DOWN:
            gain += (-0.2) * decay
        if lemma in NEG:
            negated = True

    v_adj = (1.0 + gain) * v
    a_adj = (1.0 + gain) * a
    if negated:
        v_adj = -NEG_BETA * v_adj

    v_out = float(np.clip(v_adj, -2.0, 2.0))
    a_out = float(np.clip(a_adj, -1.0, 1.0))
    return v_out, a_out

def _but_idx(toks):
    for i,t in enumerate(toks):
        if t['lemma'] in BUT: return i
    return None

def _weights(toks, tf, idf, LEX):
    wV, wA, vs, as_ = [], [], [], []
    for i, t in enumerate(toks):
        lem, pos = t['lemma'], t['pos']
        if lem not in LEX:
            wV.append(0.0); wA.append(0.0); vs.append(0.0); as_.append(0.0)
            continue
        v0, a0 = LEX[lem]
        v, a = adjust_va(v0, a0, toks, i)
        wb = idf.get(lem, 1.0)
        wV.append(wb * (1 + 0.3 * abs(a)))
        wA.append(wb)
        vs.append(v); as_.append(a)
    return np.asarray(wV, float), np.asarray(wA, float), np.asarray(vs, float), np.asarray(as_, float)

def aggregate_doc(toks, idf, LEX):
    """Combine token scores into one ``(valence, arousal)`` pair for a document.

    Returns ``(0.0, 0.0)`` when no token lemma is in ``LEX``. Otherwise the
    result is the weighted mean of the per-token scores from ``_weights``. If a
    contrast marker is present (``_but_idx``), weights before it are multiplied
    by 0.3 and weights from it onward by 0.7. Valence is clipped to ``[-2, 2]``
    and arousal to ``[-1, 1]``.
    """
    matched = [tok['lemma'] for tok in toks if tok['lemma'] in LEX]
    if len(matched) == 0:
        return 0.0, 0.0

    tf = Counter(matched)
    wV, wA, vs, as_ = _weights(toks, tf, idf, LEX)

    pivot = _but_idx(toks)
    if pivot is not None:
        before = np.arange(len(toks)) < pivot
        after = ~before
        for w in (wV, wA):
            w[before] *= 0.3
            w[after] *= 0.7

    eps = 1e-8  # keeps the division defined when all weights are zero

    def _weighted_mean(weights, values):
        return float(np.nan_to_num(np.nansum(weights * values) / (np.nansum(weights) + eps)))

    V = float(np.clip(_weighted_mean(wV, vs), -2.0, 2.0))
    A = float(np.clip(_weighted_mean(wA, as_), -1.0, 1.0))
    return V, A

def bump_excl(A, ex):
    """Raise arousal by ``EXCL_BUMP`` per exclamation mark (at most three), clipped to ``[-1, 1]``."""
    counted = min(3, int(ex))
    bumped = A + EXCL_BUMP*counted
    return float(np.clip(bumped, -1.0, 1.0))

def fit_idf_from_train(train_df, LEX):
    """Build the IDF table from the ``text`` and ``is_words`` columns of ``train_df``."""
    texts = train_df['text'].tolist()
    word_flags = train_df['is_words'].tolist()
    return build_idf(texts, word_flags, LEX)

def score_df(df, idf, LEX):
    """Return a copy of ``df`` with lexicon scores in new columns ``V`` and ``A``.

    Each row's ``text`` is converted with ``str``, parsed according to its
    ``is_words`` flag and aggregated with ``aggregate_doc``. The arousal score is
    then bumped for exclamation marks via ``bump_excl``.
    """
    def _score_row(text, word_flag):
        toks, ex = parse_text(str(text), bool(word_flag))
        v, a = aggregate_doc(toks, idf, LEX)
        return v, bump_excl(a, ex)

    scores = [_score_row(x, iw) for x, iw in zip(df['text'], df['is_words'])]

    out = df.copy()
    out['V'] = [v for v, _ in scores]
    out['A'] = [a for _, a in scores]
    return out

def fit_lin(y_true, y_pred):
    """Least-squares fit of ``y_true ~ w0 + w1 * y_pred``.

    Returns the intercept ``w0`` and slope ``w1`` as Python floats.
    """
    preds = np.asarray(y_pred)
    target = np.asarray(y_true)
    # Design matrix with a leading column of ones for the intercept.
    design = np.c_[np.ones_like(preds), preds]
    coef = np.linalg.lstsq(design, target, rcond=None)[0]
    intercept = float(coef[0])
    slope = float(coef[1])
    return intercept, slope

def apply_lin(x, w0, w1, lo, hi):
    """Apply the linear map ``w0 + w1 * x`` and clip the result to ``[lo, hi]``."""
    values = np.asarray(x)
    mapped = w0 + w1*values
    return np.clip(mapped, lo, hi)


In [None]:
# Term -> (valence, arousal) lookup, built by load_lexicon from the NRC VAD lexicon file.
LEX = load_lexicon("NRC-VAD-Lexicon-v2.1/NRC-VAD-Lexicon-v2.1.txt")

In [None]:
def metrics(df, v_col_pred, a_col_pred, v_col_true='valence', a_col_true='arousal'):
    """Compute error and correlation metrics for valence and arousal predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, the two prediction columns and the two
        ground-truth columns.
    v_col_pred, a_col_pred : str
        Names of the predicted valence / arousal columns.
    v_col_true, a_col_true : str
        Names of the ground-truth valence / arousal columns.

    Returns
    -------
    dict
        ``V_MAE``, ``V_RMSE``, ``V_r``, ``A_MAE``, ``A_RMSE``, ``A_r`` (floats),
        ``n`` (row count) and ``users`` (distinct ``user_id`` count). ``*_r`` is
        the Pearson correlation between truth and prediction. It is NaN when
        there are fewer than two rows or either side is constant.
    """
    def _pearson(y_true, y_pred):
        t = np.asarray(y_true, dtype=float)
        p = np.asarray(y_pred, dtype=float)
        # Correlation is undefined without variance on both sides.
        if t.size < 2 or np.std(t) == 0 or np.std(p) == 0:
            return float('nan')
        return float(np.corrcoef(t, p)[0, 1])

    dv = df[v_col_true] - df[v_col_pred]
    da = df[a_col_true] - df[a_col_pred]
    out = {
        'V_MAE': float(np.mean(np.abs(dv))),
        'V_RMSE': float(np.sqrt(np.mean(dv**2))),
        # The report printed by run_all_approaches selects V_r / A_r.
        'V_r': _pearson(df[v_col_true], df[v_col_pred]),
        'A_MAE': float(np.mean(np.abs(da))),
        'A_RMSE': float(np.sqrt(np.mean(da**2))),
        'A_r': _pearson(df[a_col_true], df[a_col_pred]),
        'n': int(len(df)),
        'users': int(df['user_id'].nunique())
    }
    return out

def metrics_by_group(df, v_col_pred, a_col_pred):
    """Tabulate ``metrics`` for all rows and for each value of the ``group`` column.

    Returns a DataFrame with one row per scope: ``ALL`` first, then each group
    name upper-cased. There is one column per metric.
    """
    table = {'ALL': metrics(df, v_col_pred, a_col_pred)}
    table.update({
        name.upper(): metrics(part, v_col_pred, a_col_pred)
        for name, part in df.groupby('group')
    })
    return pd.DataFrame(table).T

def run_all_approaches(train, val, test, LEX, min_user_n=6):
    """Score all partitions and compare four calibration strategies.

    Strategies (prediction columns in parentheses):

    * ``no_cal`` (``V_nc``, ``A_nc``) -- raw lexicon scores.
    * ``global`` (``V_g``, ``A_g``) -- one linear map fitted on the training
      scores, clipped to ``[-2, 2]`` for valence and ``[-1, 1]`` for arousal.
    * ``user`` (``V_u``, ``A_u``) -- a linear map per user, fitted on the raw
      scores of users with at least ``min_user_n`` training rows. Other users
      keep their raw scores.
    * ``glob+user`` (``V_gu``, ``A_gu``) -- a per-user linear map fitted on top
      of the globally calibrated scores. Other users keep the global scores.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Partitions with ``text``, ``is_words``, ``user_id``, ``group``,
        ``valence`` and ``arousal`` columns.
    LEX : dict
        Term -> ``(valence, arousal)`` lexicon.
    min_user_n : int
        Minimum number of training rows a user needs for a per-user fit.

    Returns
    -------
    (rep, scored)
        ``rep`` maps ``(split, strategy)`` to the ``metrics_by_group`` table;
        ``scored`` maps a descriptive name to the scored DataFrame.
    """
    idf = fit_idf_from_train(train, LEX)

    train_s = score_df(train, idf, LEX)
    val_s   = score_df(val,   idf, LEX)
    test_s  = score_df(test,  idf, LEX)

    # --- no calibration: raw scores under dedicated column names ---------------
    val_nc  = val_s.rename(columns={'V':'V_nc','A':'A_nc'})
    test_nc = test_s.rename(columns={'V':'V_nc','A':'A_nc'})

    # --- global calibration ----------------------------------------------------
    w0v, w1v = fit_lin(train['valence'], train_s['V'])
    w0a, w1a = fit_lin(train['arousal'], train_s['A'])

    def apply_global(dfsc):
        """Return a copy of ``dfsc`` with globally calibrated ``V_g`` / ``A_g``."""
        dfsc = dfsc.copy()
        dfsc['V_g'] = apply_lin(dfsc['V'], w0v, w1v, -2,  2)
        dfsc['A_g'] = apply_lin(dfsc['A'], w0a, w1a, -1,  1)
        return dfsc

    train_g = apply_global(train_s)
    val_g   = apply_global(val_s)
    test_g  = apply_global(test_s)

    # --- per-user calibration (shared by 'user' and 'glob+user') ---------------
    def fit_user_lines(scored_train, v_col, a_col):
        """Fit one (intercept, slope) pair per user with enough training rows."""
        cal_v, cal_a = {}, {}
        for uid, g in scored_train.groupby('user_id'):
            if len(g) >= min_user_n:
                cal_v[uid] = fit_lin(g['valence'], g[v_col])
                cal_a[uid] = fit_lin(g['arousal'], g[a_col])
        return cal_v, cal_a

    def apply_user_lines(dfsc, cal_v, cal_a, src_V, src_A, outV, outA):
        """Apply per-user maps to ``src_V`` / ``src_A``.

        Users without a fitted map keep their source values unchanged.
        """
        dfsc = dfsc.copy()
        Vh, Ah = [], []
        for uid, v_in, a_in in zip(dfsc['user_id'], dfsc[src_V], dfsc[src_A]):
            if uid in cal_v and uid in cal_a:
                w0, w1 = cal_v[uid]
                vv = apply_lin(v_in, w0, w1, -2, 2)
                w0, w1 = cal_a[uid]
                aa = apply_lin(a_in, w0, w1, -1, 1)
            else:
                vv, aa = v_in, a_in
            Vh.append(vv)
            Ah.append(aa)
        dfsc[outV] = Vh
        dfsc[outA] = Ah
        return dfsc

    cal_v_u, cal_a_u = fit_user_lines(train_s, 'V', 'A')
    val_u  = apply_user_lines(val_s,  cal_v_u, cal_a_u, 'V', 'A', 'V_u', 'A_u')
    test_u = apply_user_lines(test_s, cal_v_u, cal_a_u, 'V', 'A', 'V_u', 'A_u')

    # The globally calibrated frames already carry V_g / A_g, so reuse them
    # instead of recomputing the global map for val and test.
    cal_v_gu, cal_a_gu = fit_user_lines(train_g, 'V_g', 'A_g')
    val_gu  = apply_user_lines(val_g,  cal_v_gu, cal_a_gu, 'V_g', 'A_g', 'V_gu', 'A_gu')
    test_gu = apply_user_lines(test_g, cal_v_gu, cal_a_gu, 'V_g', 'A_g', 'V_gu', 'A_gu')

    # --- evaluation ------------------------------------------------------------
    variants = (
        ('no_cal',    'V_nc', 'A_nc', {'val': val_nc, 'test': test_nc}),
        ('global',    'V_g',  'A_g',  {'val': val_g,  'test': test_g}),
        ('user',      'V_u',  'A_u',  {'val': val_u,  'test': test_u}),
        ('glob+user', 'V_gu', 'A_gu', {'val': val_gu, 'test': test_gu}),
    )
    rep = {}
    for split in ('val', 'test'):
        for name, v_col, a_col, frames in variants:
            rep[(split, name)] = metrics_by_group(frames[split], v_col, a_col)

    report_cols = ['V_MAE','V_RMSE','V_r','A_MAE','A_RMSE','A_r','n','users']
    for (split, name), dfm in rep.items():
        print(f"\n=== {split.upper()} — {name} ===")
        # Print only the metrics that were actually computed, so a metric missing
        # from the table cannot abort the whole report with a KeyError.
        shown = [c for c in report_cols if c in dfm.columns]
        print(dfm[shown].round(4).to_string())

    scored = {
        'train_raw': train_s,
        'val_no_cal': val_nc, 'test_no_cal': test_nc,
        'val_global': val_g,  'test_global': test_g,
        'val_user': val_u,    'test_user': test_u,
        'val_glob_user': val_gu, 'test_glob_user': test_gu
    }
    return rep, scored

# Module-level IDF table and raw-scored partitions.
# NOTE(review): run_all_approaches recomputes the same IDF table and scored frames
# internally, so these four lines repeat that work. They are only needed if other
# cells use idf / train_s / val_s / test_s -- TODO confirm.
idf = fit_idf_from_train(train, LEX)
train_s = score_df(train, idf, LEX)
val_s   = score_df(val,   idf, LEX)
test_s  = score_df(test,  idf, LEX)

# Evaluate every calibration strategy on val and test; prints one table per pair.
rep, scored = run_all_approaches(train, val, test, LEX, min_user_n=6)


=== VAL — no_cal ===
         V_MAE  V_RMSE     V_r   A_MAE  A_RMSE     A_r      n  users
ALL     0.8763  1.0789  0.4950  0.9117  1.1357  0.2972  364.0   97.0
SEEN    0.8181  1.0229  0.5877  1.0393  1.2385  0.3675  229.0   83.0
UNSEEN  0.9751  1.1678  0.3477  0.6953  0.9359  0.2343  135.0   14.0

=== VAL — global ===
         V_MAE  V_RMSE     V_r   A_MAE  A_RMSE     A_r      n  users
ALL     0.8537  1.0555  0.4950  0.6057  0.7205  0.3273  364.0   97.0
SEEN    0.7797  0.9632  0.5877  0.5914  0.7220  0.4021  229.0   83.0
UNSEEN  0.9792  1.1958  0.3477  0.6300  0.7179  0.2630  135.0   14.0

=== VAL — user ===
         V_MAE  V_RMSE     V_r   A_MAE  A_RMSE     A_r      n  users
ALL     0.7910  1.0331  0.5315  0.6188  0.8158  0.4208  364.0   97.0
SEEN    0.6826  0.9448  0.6247  0.5736  0.7359  0.4348  229.0   83.0
UNSEEN  0.9751  1.1678  0.3477  0.6953  0.9359  0.2343  135.0   14.0

=== VAL — glob+user ===
         V_MAE  V_RMSE     V_r   A_MAE  A_RMSE     A_r      n  users
ALL     0.7875