# transfer learning

In [None]:
from tqdm import tqdm
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path
from io import StringIO
import json
import pickle
import matplotlib.pyplot as plt
import re
from collections import Counter


In [None]:
# !unzip datasets.zip

## fine tuning

In [None]:
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)
from datasets import load_dataset
import datasets


In [None]:
# !pip install --upgrade pandas transformers datasets

In [None]:
import torch
# Show the name of CUDA device 0 as a quick check that a GPU is visible.
# This call raises when no CUDA device is available.
torch.cuda.get_device_name(0)


- https://github.com/rafalposwiata/depression-detection-lt-edi-2022

dataset

In [None]:
# Concatenate the raw text of the three task-0 TSV files (in sorted name order)
# and parse the result as a single tab-separated table.
task_dir = Path("datasets", "task_0")
data_text = "".join(
    (task_dir / f"{filename}.tsv").read_text(encoding="utf8")
    for filename in sorted(["ds", "ts_hs", "ts_ht"])
)

df = pd.read_csv(StringIO(data_text), sep="\t")
# dup_idx = df.index.difference(df.drop_duplicates().index)
# df.loc[dup_idx]

# Keep both numberings: "old_idx" is the row position before de-duplication,
# "new_idx" the position afterwards.
df = (
    df.drop_duplicates()
    .reset_index(names="old_idx")
    .reset_index(names="new_idx")
)


train test indexes

In [None]:
# Translate the row numbers stored in the split file from the
# pre-deduplication numbering ("old_idx") to the numbering of `df` ("new_idx").
lookup = df.set_index("old_idx")["new_idx"]
pth = Path("datasets", "task_0", "train_test_splitting.json")
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(pth, "rt") as f:
    idx = json.load(f)
# Ids with no entry in `lookup` (rows dropped as duplicates) are skipped.
idx = {k: [lookup[i] for i in lst if i in lookup] for k, lst in idx.items()}


In [None]:
# Select texts and labels for each side of the split by row label.
x, y = df["pp_text"], df["label"]
train_ids, test_ids = idx["train"], idx["test"]
x_train, y_train = x.loc[train_ids], y.loc[train_ids]
x_test, y_test = x.loc[test_ids], y.loc[test_ids]


In [None]:
def _as_records(texts, labels):
    """Pair every text with its label as a list of {"label", "text"} dicts."""
    return [{"label": int(lab), "text": str(txt)} for lab, txt in zip(labels, texts)]


# Wrap both splits in a single DatasetDict keyed by split name.
dataset = datasets.DatasetDict(
    {
        "train": datasets.Dataset.from_list(_as_records(x_train, y_train)),
        "test": datasets.Dataset.from_list(_as_records(x_test, y_test)),
    }
)


In [None]:
dataset["test"][-1], dataset["test"][0]

tokens

In [None]:
# Load the tokenizer from the "ShreyaR/finetuned-roberta-depression" checkpoint,
# the same checkpoint name the model cell uses.
tokenizer_hf = AutoTokenizer.from_pretrained("ShreyaR/finetuned-roberta-depression")

In [None]:
def to_hf_tokens(examples):
    """Tokenize the "text" field of a batch, padding to max length and truncating."""
    texts = examples["text"]
    encoded = tokenizer_hf(texts, padding="max_length", truncation=True)
    return encoded


# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = dataset.map(to_hf_tokens, batched=True)


In [None]:
tokenized_datasets.items()

In [None]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

finetuning

In [None]:
# Load the sequence-classification model from the same checkpoint as the
# tokenizer; the commented-out name is an alternative checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "ShreyaR/finetuned-roberta-depression",
    # "rafalposwiata/deproberta-large-depression",
)


In [None]:
# Training configuration: "test-trainer" is passed as the first positional
# argument and the seed is fixed at 42; the commented options are left disabled.
training_args = TrainingArguments(
    "test-trainer",
    # no_cuda=True,
    seed=42,
    # per_device_train_batch_size=1,
    # gradient_accumulation_steps=8,
    # gradient_checkpointing=True,
)


In [None]:
# Build the Trainer on the tokenized training split only; no evaluation set,
# data collator or tokenizer is passed (those arguments are commented out).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["eval"],
    # data_collator=data_collator,
    # tokenizer=tokenizer,
)


In [None]:
# trainer.train()
# predictions = trainer.predict(tokenized_datasets["test"])

![image.png](attachment:image.png)

final hidden embedding

In [None]:
# NOTE(review): get_input_embeddings() returns the model's input (token)
# embedding layer, i.e. a lookup applied before any transformer layer. It does
# not produce the final hidden states that the heading above refers to.
voc2hidden = trainer.model.get_input_embeddings()

In [None]:
# te_lst = []
# for text in dataset["test"]["text"]:
#   input_ids = tokenizer.encode(text, return_tensors="pt").to("cuda")
#   hid = voc2hidden(input_ids)
#   te_lst.append((text, hid))

In [None]:
# te_lst = pickle.load(open("text.w.hidden.pkl", "rb"))

metrics

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Load the two stored prediction objects; presumably "fine" is the fine-tuned
# model and "raw" the unmodified checkpoint (TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open("predictions/fine.pkl", "rb") as f:
    y_fine = pickle.load(f)
with open("predictions/raw.pkl", "rb") as f:
    y_raw = pickle.load(f)

In [None]:
# Use element 1 of the raw prediction object as the reference labels.
# This has to run before the following cell rebinds y_raw to a list of class ids.
# presumably these are the label ids stored alongside the predictions (TODO confirm)
y_true = y_raw[1]

In [None]:
# Collapse each score pair into a class id: 1 when the second score is strictly
# larger than the first, otherwise 0.
y_fine = [int(first < second) for first, second in y_fine[0]]
y_raw = [int(first < second) for first, second in y_raw[0]]

In [None]:
# Confusion matrix of the raw predictions, drawn as an annotated grid.
conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_raw)

fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every cell count at its (column, row) position.
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
plt.show()

In [None]:
# Print the four summary metrics of the raw predictions.
for label, metric in (
    ('Precision:', precision_score),
    ('Recall: ', recall_score),
    ('Accuracy: ', accuracy_score),
    ('F1 Score: ', f1_score),
):
    print(label, metric(y_true, y_raw))

In [None]:
# Confusion matrix of the fine predictions, drawn as an annotated grid.
conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_fine)

fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write every cell count at its (column, row) position.
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
plt.show()

In [None]:
# Print the four summary metrics of the fine predictions.
for label, metric in (
    ('Precision:', precision_score),
    ('Recall: ', recall_score),
    ('Accuracy: ', accuracy_score),
    ('F1 Score: ', f1_score),
):
    print(label, metric(y_true, y_fine))

In [None]:
group = [ ]
for i, v in enumerate(zip(y_fine, y_true)):
    match v:
        case (0,0):
            group.append("TN")
        case (1,1):
            group.append("TP")
        case (0,1):
            group.append("FN")
        case (1,0):
            group.append("FP")
        case _:
            raise ValueError("!")            

In [None]:
Counter(group)

## what do the models do?

tf idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.spatial import distance

In [None]:
# Fit a TF-IDF vocabulary on the test texts and vectorize those same texts.
vect = TfidfVectorizer()
tfidf_mx = vect.fit_transform(x_test)
# Feature (term) names as reported by the fitted vectorizer.
words = vect.get_feature_names_out()

In [None]:
type(tfidf_mx)

In [None]:
# a, b = tfidf_mx.shape
# tiv = np.zeros((a,b))
# for i in tqdm(range(a)):
#     for j in range(b):
#         tiv = tfidf_mx[i,j]

In [None]:
# cosine_tfidf = np.array([ [ distance.cosine(tmp[i],tmp[j]) for i in range(a)] for i in tqdm(range(a))])

syntactic

In [None]:
"like	(02 134)125/464	(02 134)126	(02 134)126	253"

In [None]:
def to_liwc_tokens(text, lowercase=True):
    """Yield the word tokens of ``text`` in order, ready for LIWC lookup.

    A token is a maximal run of Unicode word characters (``\\w+``).

    Parameters
    ----------
    text : str
        Text to split.
    lowercase : bool, default True
        Lower-case every token before yielding it. LIWC lexicons only match
        lower-case strings, so mixed-case tokens would otherwise be missed
        silently. Pass ``False`` to get the tokens exactly as written.

    Yields
    ------
    str
        One token at a time.
    """
    for match in re.finditer(r'\w+', text, re.UNICODE):
        token = match.group(0)
        yield token.lower() if lowercase else token

In [None]:
import liwc
# Load the LIWC 2007 English dictionary. `parse` is called on single tokens and
# its results are used as category names; `category_names` holds the names that
# the category lookup table is built from.
parse, category_names = liwc.load_token_parser('dic/LIWC2007_English080730.dic')
# parse, category_names = liwc.load_token_parser('dic/LIWC2001_English.dic')

In [None]:
# Sizes used below: number of LIWC categories and number of test texts.
K = len(category_names)
N = len(x_test)

# Column position of every category name in the frequency vectors.
kat_lookup = {name: pos for pos, name in enumerate(category_names)}

In [None]:
np.array([2,3,4]) / 7

In [None]:
# For every usable test text build
#   * y_snt: its LIWC category frequencies, normalised to sum to 1, and
#   * x_sem: the last-layer hidden state of its first token.
# The input-embedding lookup alone cannot serve as a feature: the first token
# is the same special token for every text, so its input embedding is one
# constant vector. The hidden state after the encoder depends on the whole text.
MAX_TOKENS = 512  # texts that tokenize to more ids than this are skipped

y_snt = []
x_sem = []
w_grp = []

encoder = trainer.model
encoder.eval()  # disable dropout so repeated runs give the same vectors

for n in tqdm(range(N)):
    # str() mirrors the conversion applied when the HF dataset was built.
    text = str(x_test.iloc[n])
    kat_freq = np.zeros(K)

    for t in to_liwc_tokens(text):
        for m in parse(t):
            kat_freq[kat_lookup[m]] += 1

    s = kat_freq.sum()
    if not s:
        # No LIWC category matched, so there is nothing to normalise.
        continue

    kat_freq /= s

    input_ids = tokenizer_hf.encode(text, return_tensors="pt")
    # Check the length before the forward pass: over-long inputs are skipped
    # rather than fed to the model.
    if input_ids.shape[1] > MAX_TOKENS:
        continue
    input_ids = input_ids.to(encoder.device)

    with torch.no_grad():
        outputs = encoder(input_ids, output_hidden_states=True)
    # hidden_states[-1] has shape (batch, tokens, hidden); take text 0, token 0.
    cls_token = outputs.hidden_states[-1][0, 0]

    x_sem.append(cls_token.cpu().numpy())
    y_snt.append(kat_freq)
    # assumes `group` is ordered like x_test (TODO confirm against the
    # stored predictions)
    w_grp.append(group[n])


y_snt = np.array(y_snt)
x_sem = np.array(x_sem)

In [None]:
from sklearn.linear_model import ElasticNet, Ridge, LinearRegression

In [None]:
# Elastic-net regression from the semantic vectors to the LIWC frequencies.
e_reg = ElasticNet(random_state=42).fit(x_sem, y_snt)

print(e_reg.coef_)
# print(e_reg.intercept_)

In [None]:
# Ridge regression from the semantic vectors to the LIWC frequencies.
ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(x_sem, y_snt)

# Show the parameters of the ridge model fitted in this cell.
print(ridge_reg.coef_)
print(ridge_reg.intercept_)

In [None]:
category_names

In [None]:
def plot_coeff(category, shape=(32, -1), reg=None):
    """Draw the regression coefficients of one LIWC category as a heatmap.

    Parameters
    ----------
    category : str
        A name contained in the notebook-level ``category_names``.
    shape : tuple of int, default (32, -1)
        Grid the coefficient vector is reshaped to before plotting. ``-1``
        lets NumPy infer that dimension, so a 768-long vector still gives the
        32 x 24 grid while other vector lengths divisible by 32 also work.
    reg : fitted estimator exposing ``coef_``, optional
        Model whose coefficients are shown. Defaults to the notebook-level
        ``e_reg``.

    Raises
    ------
    ValueError
        If ``category`` is not in ``category_names`` or the coefficient vector
        cannot be reshaped to ``shape``.
    """
    model = e_reg if reg is None else reg
    i = category_names.index(category)
    ax = sns.heatmap(
        model.coef_[i].reshape(shape), cmap=sns.color_palette("vlag", as_cmap=True)
    )
    # The grid positions carry no meaning, so ticks and tick labels are hidden.
    ax.set(
        xticklabels=[],
        yticklabels=[],
        title=category.upper(),
    )
    ax.tick_params(bottom=False, left=False)


In [None]:
plot_coeff("family")

In [None]:
plot_coeff("death")

In [None]:
plot_coeff("adverb")

similarities

In [None]:
# Pairwise cosine distance (1 - cosine similarity) between the LIWC frequency
# vectors, computed in one vectorised call instead of an M x M Python loop.
M = len(y_snt)
# cdist needs 2-D input, so the empty case is handled separately.
cosine_mx = distance.cdist(y_snt, y_snt, metric="cosine") if M else np.zeros((0, 0))

In [None]:
# THRESHOLD = 0.5
# np.sign(cosine_mx - THRESHOLD)

In [None]:
NC = 5

In [None]:
# from sklearn.manifold import TSNE
# X_embedded = TSNE(n_components=NC, learning_rate='auto',
#                   init='random', perplexity=3).fit_transform(cosine_mx)
# X_embedded.shape


In [None]:
from sklearn.decomposition import PCA

# Project the distance-matrix rows of every sample that is not a true negative.
# random_state makes the projection reproducible if scikit-learn selects a
# randomized SVD solver; 42 matches the seed used elsewhere in this notebook.
pca = PCA(n_components=NC, random_state=42)
new_coord = pca.fit_transform([e for e, g in zip(cosine_mx, w_grp) if g != "TN"])


In [None]:
import plotly.express as px

In [None]:
# Axis captions: component number plus its share of explained variance.
explained_pct = pca.explained_variance_ratio_ * 100
labels = {}
for i, var in enumerate(explained_pct):
    labels[str(i)] = f"PC {i+1} ({var:.1f}%)"

# Colour the points by group, using the same non-TN filter as the PCA input.
non_tn_groups = [g for g in w_grp if g != "TN"]

fig = px.scatter_matrix(
    new_coord,
    labels=labels,
    dimensions=range(NC),
    color=non_tn_groups,
    height=1000,
)
fig.update_traces(diagonal_visible=False)
fig.show()