# Predicting whether or not to meet again from a three-word description

## Data processing

In [1]:
from fastai.imports import *

In [2]:
# Load the hand-corrected spreadsheet into a DataFrame.
# `pd` is not imported explicitly; presumably it comes from the star-import above — TODO confirm.
df = pd.read_excel('processed_data hand-corrected.xlsx')

In [3]:
# Stack both participants' answers into one long table with a shared
# ('text', 'result') schema: A's rows first, then B's.
df_a = df[['A_describe_X_in_three_words', 'A_would_you_meet_again']].rename(
    columns={
        'A_describe_X_in_three_words': 'text',
        'A_would_you_meet_again': 'result',
    }
)
df_b = df[['B_describe_X_in_three_words', 'B_would_you_meet_again']].rename(
    columns={
        'B_describe_X_in_three_words': 'text',
        'B_would_you_meet_again': 'result',
    }
)
new_df = pd.concat([df_a, df_b], axis=0)
new_df

Unnamed: 0,text,result
0,"Good-humoured, creative, curious.","Yeah, I’d definitely go for another drink next time he’s in London. But only if his dog Luna can come too."
1,"Bubbly, fun, intelligent.","As friends, absolutely."
2,"Enthusiastic, energetic, musical.","Yes. We spoke of it, as she does come to London and I will be house hunting in the north over the next year."
3,"Smart, American, sweet.",Hopefully. We talked about it and exchanged numbers.
4,"Lively, open and vivacious.",I would love the chance to. We talked of visiting the Winslow Homer exhibition at the National Gallery but we did not exchange contact details. Silly me – or was it deliberate on Nicole’s part? Only time will tell.
...,...,...
714,,"She didn't seem that interested, so I doubt it."
715,,I didn't detect any chemistry and we didn't swap numbers.
716,,Yes - for a drink and see how things developed.
717,,"Yes. It would be interesting to be in our own environment, where we could have a bit more craic. I reckon we could have a laugh."


In [None]:
# Discard every row with a missing value, then renumber the rows from 0
# (the concatenation above left duplicated index labels).
new_df = new_df.dropna()
new_df = new_df.reset_index(drop=True)
new_df

In [None]:
# Count the missing values per column (display only).
new_df.isna().sum()

In [None]:
# Derive the binary target: True where 'score' is at least 8.
# NOTE(review): the cells above keep only the 'text' and 'result' columns in
# new_df, so 'score' is not visibly present here — confirm where it is added.
new_df.loc[:, 'bool'] = new_df.loc[:, 'score'] >= 8
new_df

## From-scratch model

In [None]:
from torch import tensor

# Target vector: the 'bool' column converted to a float tensor.
y = tensor(new_df['bool'].values).float()
y.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features over the 'text' column, with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
# Convert the fitted count matrix to a dense array so it can be wrapped in a float tensor.
X = tensor(vectorizer.fit_transform(new_df['text']).toarray()).float()
X.shape

In [None]:
# Sanity check: print the vocabulary words with a non-zero count in the first
# feature row, then display the raw first row for comparison.
print(vectorizer.get_feature_names_out()[X[0].numpy().nonzero()])
new_df.iloc[0]

In [None]:
import torch
# torch.manual_seed(0)
# One coefficient per feature column of X.
n_coeffs = X.shape[1]
# Exploratory initialisation: uniform values shifted by 0.5 to lie in [-0.5, 0.5).
# This `coeffs` is reassigned later by the result of train_model().
coeffs = torch.rand(n_coeffs, 1) - 0.5
coeffs

In [None]:
def calc_pred(X, coeffs):
    """Return sigmoid(X @ coeffs): the linear score of each row squashed into (0, 1)."""
    logits = X @ coeffs
    return torch.sigmoid(logits)

def calc_loss(pred, y):
    """Return the mean absolute error between `pred` and `y` as a 0-dim tensor."""
    abs_err = torch.abs(pred - y)
    return abs_err.mean()

def init_coeffs():
    """Return a fresh (n_coeffs, 1) tensor of small uniform values in [0, 0.1) that tracks gradients.

    Reads the module-level `n_coeffs`.
    """
    weights = torch.rand(n_coeffs, 1) * 0.1
    return weights.requires_grad_()

def update_coeffs(coeffs, lr):
    """Apply one in-place gradient-descent step to `coeffs`, then clear its gradient.

    The only visible caller wraps this call in ``torch.no_grad()``.
    """
    step = lr * coeffs.grad
    coeffs.sub_(step)
    # Zero the gradient so the next backward() starts from a clean slate.
    coeffs.grad.zero_()

def accuracy(coeffs, X, y):
    """Return the fraction of rows whose thresholded prediction matches `y`.

    A prediction above 0.5 counts as positive; `y` is converted to bool for
    the comparison. The result is a 0-dim float tensor.
    """
    predicted_positive = calc_pred(X, coeffs) > 0.5
    is_correct = predicted_positive == y.bool()
    return is_correct.float().mean()

def one_epoch(coeffs, lr, X, y):
    """Run one full-batch gradient-descent step on `coeffs` and print progress.

    The printed loss is the value computed before the update; the printed
    accuracy is evaluated after the update.
    """
    preds = calc_pred(X, coeffs)
    loss = calc_loss(preds, y)
    loss.backward()
    # The update mutates `coeffs` in place, so it must not be recorded by autograd.
    with torch.no_grad():
        update_coeffs(coeffs, lr)
    acc = accuracy(coeffs, X, y)
    print(f"Loss: {loss:.4f}, Accuracy {acc:.4f}", end=' ')

In [None]:
from fastai.data.transforms import RandomSplitter
# Random train/validation split of the rows of new_df.
trn_split, val_split = RandomSplitter()(new_df) # can set seed
X_train, X_val = X[trn_split], X[val_split]
# [:, None] adds a trailing axis so the targets are (n, 1) column vectors,
# the same shape as the (n, 1) result of X @ coeffs.
y_train, y_val = y[trn_split][:, None], y[val_split][:, None]
len(y_train), len(y_val)

In [None]:
def train_model(epochs=30, lr=0.1, X=None, y=None):
    """Train a fresh set of coefficients with full-batch gradient descent.

    Args:
        epochs: number of passes over the data.
        lr: learning rate handed to ``one_epoch``.
        X: feature matrix; when omitted, the module-level ``X_train`` is used.
        y: targets; when omitted, the module-level ``y_train`` is used.

    Returns:
        The coefficient tensor created by ``init_coeffs`` after `epochs` updates.
    """
    # Resolve the defaults at call time. Binding X_train / y_train in the
    # signature would freeze whatever tensors existed when this cell ran, so
    # re-running the split cell would silently keep training on stale data.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    # torch.manual_seed(0)  # uncomment for a reproducible initialisation
    coeffs = init_coeffs()
    for i in range(epochs):
        print(f"Epoch {i+1}: ", end='')
        one_epoch(coeffs, lr, X, y)
        print()
    return coeffs

In [None]:
# Train with the default 30 epochs at learning rate 4 and keep the result
# in the global `coeffs` (read by show_coeffs and the validation cell).
coeffs = train_model(lr=4)

In [None]:
def show_coeffs():
    """Map each vocabulary word to its learned weight.

    Reads the module-level ``vectorizer`` and ``coeffs``.

    Returns:
        dict pairing each feature name with the corresponding entry of the
        flattened ``coeffs``.
    """
    # detach() yields a gradient-free view for the NumPy conversion without
    # flipping requires_grad on the shared `coeffs` tensor, which the previous
    # requires_grad_(False) call did as a hidden side effect.
    weights = coeffs.detach().numpy().flatten()
    return dict(zip(vectorizer.get_feature_names_out(), weights))

In [None]:
# Tabulate the learned weights, largest first, with the word as a regular
# column, then save the table to an Excel file.
weights_df = (
    pd.DataFrame.from_dict(show_coeffs(), orient='index', columns=['weight'])
    .sort_values('weight', ascending=False)
    .reset_index()
    .rename(columns={'index': 'word'})
)
weights_df.to_excel('weights.xlsx', index=False)
weights_df

In [None]:
# Accuracy on the validation split: threshold the predictions at 0.5 and
# compare them element-wise with the boolean targets.
val_preds = calc_pred(X_val, coeffs)
results = y_val.bool() == (val_preds > 0.5)
results.float().mean()

## Using transformers

In [None]:
from datasets import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Wrap the DataFrame as a dataset whose target column is named 'labels' and
# stored as float32, then split it with a fixed seed.
ds = (
    Dataset.from_pandas(new_df)
    .rename_column('bool', 'labels')
    .cast_column('labels', Value('float32'))
)
dds = ds.train_test_split(seed=2005)

In [None]:
# Name of the pretrained checkpoint; reused below when the model is created.
model_name = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_func(examples):
    """Tokenize the 'text' field of a batch with truncation and padding enabled.

    Reads the module-level `tokenizer`.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding=True)


# Apply the tokenizer to every split, one batch of rows at a time.
dds = dds.map(tokenize_func, batched=True)

In [None]:
import evaluate
metric = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    """Compute accuracy from the model outputs and the reference labels.

    Args:
        eval_pred: pair ``(logits, labels)`` supplied by the trainer.

    Returns:
        Whatever ``metric.compute`` returns for the thresholded predictions.
    """
    logits, labels = eval_pred
    # Presumably the logits are shaped (n, 1) because the model is created
    # with num_labels=1; flatten them so they line up with the labels.
    scores = np.asarray(logits).reshape(-1)
    # Threshold at 0.5, as the from-scratch `accuracy` helper does. Rounding
    # could emit values outside {0, 1} (e.g. -1 or 2) for out-of-range scores.
    preds = (scores > 0.5).astype(int)
    refs = np.asarray(labels).reshape(-1).astype(int)
    return metric.compute(predictions=preds, references=refs)


In [None]:
# Single-output classification head on top of the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Training hyperparameters.
bs = 16
lr = 1e-4
epochs = 5

# Evaluate and checkpoint once per epoch, and reload the checkpoint with the
# best accuracy when training finishes.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    num_train_epochs=epochs,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)  # auto_find_batch_size? (requires pip install accelerate)

In [None]:
# Wire the model, arguments, tokenized splits and metric function together.
# NOTE(review): no tokenizer or data collator is passed, so batching appears to
# rely on the padding applied in tokenize_func — confirm all rows end up with
# the same padded length.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Reload the tokenizer and a fine-tuned checkpoint from disk.
# NOTE(review): 'checkpoint-117' is hard-coded; the step number presumably
# depends on dataset size, batch size and epochs — confirm it exists after training.
# NOTE(review): `model` is not handed to the pipeline in the next cell, which
# is given the checkpoint path instead.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
model = AutoModelForSequenceClassification.from_pretrained('outputs/checkpoint-117')

In [None]:
from transformers import pipeline
# Build an inference pipeline from the saved checkpoint and try it on one word.
# (Author's note: you can infer the task if the model hasn't been fine-tuned, I think.)
classifier = pipeline(
    'sentiment-analysis',
    model='outputs/checkpoint-117',
    tokenizer=tokenizer,
)

text = "nice"
classifier(text)

In [None]:
# Show the tokenizer's output for a sample three-word description, returned as PyTorch tensors.
tokenizer("funny, interesting, fun", return_tensors="pt")