In [1]:
# a_fixed.py (training + save)
import pandas as pd
import numpy as np
import re
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm

# -------------------------
# Config
# -------------------------
# Input: training CSV and the two columns that are read from it.
DATA_PATH = "ASAP2_train_sourcetexts.csv"
ESSAY_COL, SCORE_COL = "full_text", "score"

# Output: pickle files written once training has finished.
MODEL_PATH, EMBED_PATH = "asap_ridge_model.pkl", "asap_embedding_model.pkl"

# -------------------------
# Load dataset
# -------------------------
# Read the CSV, drop rows missing either the essay or the score, keep only
# those two columns, and give them the fixed names used by the rest of the
# script. The original row index is kept as-is (no reset).
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=[ESSAY_COL, SCORE_COL])[[ESSAY_COL, SCORE_COL]]
    .rename(columns={ESSAY_COL: "essay", SCORE_COL: "score"})
)
print("Columns in dataset:", list(df.columns))

# -------------------------
# Preprocess text
# -------------------------
# Compiled once at import time; clean_text is applied to every essay.
_DISALLOWED_CHARS = re.compile(r'[^A-Za-z0-9.,!?;:()\'"\s]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text: object) -> str:
    """Normalize an essay to a restricted character set with single spaces.

    The value is converted with ``str()``, every character other than ASCII
    letters, digits, the punctuation ``. , ! ? ; : ( ) ' "`` and whitespace
    is deleted, each run of whitespace is collapsed to one space, and
    leading/trailing whitespace is stripped.

    Args:
        text: The raw essay. Non-string values are stringified first
            (e.g. ``None`` becomes ``"None"``).

    Returns:
        The cleaned text.
    """
    text = _DISALLOWED_CHARS.sub('', str(text))
    # Collapse whitespace *after* deleting characters: removing a symbol that
    # sat between two spaces ("a @ b") would otherwise leave a double space.
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Register tqdm's progress_apply on pandas objects before it is used below.
tqdm.pandas()
print("Cleaning essays...")
cleaned_essays = df["essay"].progress_apply(clean_text)
df["essay_clean"] = cleaned_essays

# -------------------------
# Encode essays
# -------------------------
print("Encoding essays with SentenceTransformer...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# One embedding per cleaned essay, in DataFrame row order.
# NOTE(review): confirm how encode() treats essays longer than the model's
# maximum sequence length (they may be truncated).
essay_embeddings = embedding_model.encode(
    df["essay_clean"].to_list(),
    show_progress_bar=True,
)

# -------------------------
# Train Ridge Regression
# -------------------------
# Features are the essay embeddings; targets are the scores cast to float.
X, y = essay_embeddings, df["score"].values.astype(float)

# Hold out 15% of the rows for evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

print("Training Ridge Regression on ASAP embeddings (fast on CPU)...")
# fit() returns the estimator itself, so construction and fitting can be chained.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# -------------------------
# Save model and embedding
# -------------------------
# Each entry: (destination path, object to pickle, message printed afterwards).
# The regressor is written first, then the sentence encoder.
artifacts = (
    (MODEL_PATH, ridge, f"Ridge model saved to {MODEL_PATH}"),
    (EMBED_PATH, embedding_model, f"SentenceTransformer embedding saved to {EMBED_PATH}"),
)
for target_path, artifact, done_message in artifacts:
    with open(target_path, 'wb') as sink:
        pickle.dump(artifact, sink)
    print(done_message)

# -------------------------
# Compute RMSE safely
# -------------------------
# Score the held-out rows and report the root of the mean squared error.
y_pred = ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(test_mse)
print(f"Test RMSE: {rmse:.3f}")


  from .autonotebook import tqdm as notebook_tqdm


Columns in dataset: ['essay', 'score']
Cleaning essays...


100%|██████████| 24728/24728 [00:03<00:00, 6628.20it/s]


Encoding essays with SentenceTransformer...


Batches: 100%|██████████| 773/773 [34:32<00:00,  2.68s/it]   


Training Ridge Regression on ASAP embeddings (fast on CPU)...
Ridge model saved to asap_ridge_model.pkl
SentenceTransformer embedding saved to asap_embedding_model.pkl
Test RMSE: 0.845
