In [None]:
%%bash
# Install the project requirements plus sentence-transformers.
# Stop at the first failed install: without this, the script's exit status is
# that of the last command, so a failed requirements install could go unnoticed.
set -e
# Use `python -m pip` so the packages are installed for the same `python`
# interpreter that the later `python -` cells invoke.
python -m pip install -r ../../requirements.txt
python -m pip install sentence-transformers


### Objective
Generate sentence embeddings with models like E5 (here `intfloat/e5-base-v2`) and train a lightweight logistic-regression classifier on them to predict sentiment.


In [None]:
%%bash
set -e
python - <<'PY'
from sentence_transformers import SentenceTransformer
import pandas as pd
from pathlib import Path

df = pd.read_csv('../../data/teacher_course.csv')
df['prompted'] = df['aspect'].str.lower().radd('Aspect: ').str.cat(df['comments'], sep=' | Text: ')

model = SentenceTransformer('intfloat/e5-base-v2')
embeddings = model.encode(df['prompted'].tolist(), batch_size=64, convert_to_numpy=True)

out_dir = Path('../../outputs/sentence_embeddings')
out_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame(embeddings).to_parquet(out_dir / 'e5_embeddings.parquet')
df[['sentiment']].to_csv(out_dir / 'labels.csv', index=False)
PY


In [None]:
%%bash
set -e
python - <<'PY'
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = pd.read_parquet('../../outputs/sentence_embeddings/e5_embeddings.parquet')
y = pd.read_csv('../../outputs/sentence_embeddings/labels.csv')['sentiment']
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)
print(classification_report(y_val, clf.predict(X_val)))
PY
