In [1]:
import os
import shutil

# Wipe any JDK directory left in the user's home by an earlier run, so the
# install step that follows starts from an empty location.
jdk_dir = os.path.expanduser(os.path.join("~", ".jdk"))
if os.path.exists(jdk_dir):
    # rmtree removes the directory together with everything beneath it.
    shutil.rmtree(jdk_dir)

In [2]:
import os
import os.path as osp
import shlex
import subprocess

import jdk

# Install JDK 17 via the `jdk` package; the value it returns is used as the
# installation root (JAVA_HOME) below.
java_home = jdk.install('17')

os.environ["JAVA_HOME"] = java_home

# Put the new JDK's bin directory at the FRONT of PATH so that `java` resolves
# to this installation even when another Java is already on the system PATH.
# Any earlier occurrence of the same entry is dropped first, so re-running this
# cell does not keep growing PATH.
java_bin = osp.join(java_home, "bin")
current_path = os.environ.get("PATH", "")
other_entries = (
    [entry for entry in current_path.split(os.pathsep) if entry != java_bin]
    if current_path
    else []
)
os.environ["PATH"] = os.pathsep.join([java_bin, *other_entries])

# Sanity check: show which Java is picked up through PATH. stderr is merged
# into stdout so the version banner is captured whichever stream it goes to.
print("JAVA_HOME:", os.environ["JAVA_HOME"])
print(subprocess.check_output(shlex.split("java -version"), stderr=subprocess.STDOUT).decode())


JAVA_HOME: /home/jovyan/.jdk/jdk-17.0.17+10
openjdk version "17.0.17" 2025-10-21
OpenJDK Runtime Environment Temurin-17.0.17+10 (build 17.0.17+10)
OpenJDK 64-Bit Server VM Temurin-17.0.17+10 (build 17.0.17+10, mixed mode, sharing)



In [3]:
import numpy as np
import pandas as pd
import random
import nltk
from softmax_reg import SoftmaxRegression
from features import text_to_feature_vector

In [4]:
# Read the exam database into a DataFrame; its "topic" column supplies the
# writing prompts used further down.
exam_db_path = "english_exam_database.csv"
df = pd.read_csv(exam_db_path)

In [None]:
# Restore the saved softmax classifier parameters and the feature statistics
# used to standardise inputs before prediction.
# The archive is opened in a `with` block so its file handle is closed as soon
# as the arrays have been read; a bare np.load() would leave it open.
with np.load("cefr_softmax_model.npz") as data:
    W = data["W"]  # weights, expected shape (10, 5)
    b = data["b"]  # biases, expected shape (5,)
    feature_means = data["feature_means"]  # expected shape (10,)
    feature_stds = data["feature_stds"]  # expected shape (10,)

# Derive the model dimensions from the stored weight matrix rather than
# hard-coding them.
n_features, n_classes = W.shape

# The hyperparameters are only there to construct the object (num_epochs=0);
# the parameters actually used for prediction are the loaded W and b assigned
# right after construction.
model = SoftmaxRegression(
    n_features=n_features,
    n_classes=n_classes,
    learning_rate=0.1,
    num_epochs=0,
    reg_lambda=0.0,
)
model.W = W
model.b = b

# Choose one row of the exam table at random and ask the user to write about
# its topic.
if len(df) == 0:
    raise ValueError("The exam database is empty; there is no topic to choose from.")

# randrange(len(df)) yields 0 .. len(df) - 1. randint(0, len(df)) is inclusive
# at both ends and would occasionally return len(df), one past the last row.
n = random.randrange(len(df))
# n is a row position, so index positionally; this stays correct even if the
# DataFrame index is not the default 0..N-1 range.
topic = df["topic"].iloc[n]
print("Topic: " + topic)
print("Write a paragraph " + topic.lower() + ".")
print()
text = input("Respond: ")

# Validate the response, turn it into a standardised feature vector and print
# the predicted CEFR level.
tokens = nltk.word_tokenize(text)
wc = len(tokens)
if wc < 10:
    # Report the problem and skip the prediction. exit() must not be used here:
    # inside a Jupyter kernel it shuts the kernel down and discards all state.
    print(f"Input too short ({wc} words). Need at least 10 words for a reliable prediction.")
else:
    # wc >= 10 in this branch, so the division is always defined.
    unique_ratio = len(set(tokens)) / wc
    if unique_ratio < 0.2:
        # Warning only: the prediction is still produced below.
        print(f"Text has very low lexical variety (unique_ratio={unique_ratio:.2f}). Prediction may not be reliable.")

    x_raw = text_to_feature_vector(text)  # expected shape (10,)
    # Standardise with the statistics stored alongside the model. A stored
    # standard deviation of exactly 0 would give inf/NaN, so such entries are
    # divided by 1 instead.
    # NOTE(review): assumes training treated constant features the same way - confirm.
    safe_stds = np.where(feature_stds == 0, 1.0, feature_stds)
    x_std = (x_raw - feature_means) / safe_stds
    x_std = x_std.reshape(1, -1)  # single-row batch, (1, 10)

    # Predict class
    y_pred = model.predict(x_std)[0]  # class index, expected 0..4

    # Map to CEFR level
    int_level = int(y_pred) + 1  # 1,...,5 as a plain int for the dict lookup
    cefr_map = {1: "A1", 2: "A2", 3: "B1", 4: "B2", 5: "C1"}
    cefr_level = cefr_map[int_level]
    print("Predicted CEFR level: ", cefr_level)

Topic: Making notes for a visitor
Write a paragraph making notes for a visitor.

