In [1]:
import glob, os
from sklearn.feature_extraction.text import TfidfVectorizer

# Check working directory
print("Current working directory:", os.getcwd())

# List concept files
paths = glob.glob("math/train-medium/*.txt")
print("Found concept files:", paths)

# Load into docs list
docs = []
for path in paths:
    with open(path, encoding='utf-8') as f:
        docs.append(f.read())

print("\nLoaded docs count:", len(docs))

# Preview first document
if docs:
    print("\nPreview:", docs[0][:200], "…")

# Build TF-IDF model
concept_vec = TfidfVectorizer(stop_words='english', max_features=10000)
X_concepts = concept_vec.fit_transform(docs)
print("\nConcept matrix shape:", X_concepts.shape)

Current working directory: /home/rc/dataproblems/src/projects/version/sandbox
Found concept files: ['math/train-medium/numbers__place_value_composed.txt', 'math/train-medium/arithmetic__simplify_surd.txt', 'math/train-medium/arithmetic__mul_div_multiple.txt', 'math/train-medium/calculus__differentiate_composed.txt', 'math/train-medium/probability__swr_p_level_set.txt', 'math/train-medium/numbers__lcm.txt', 'math/train-medium/arithmetic__add_or_sub.txt', 'math/train-medium/numbers__gcd_composed.txt', 'math/train-medium/calculus__differentiate.txt', 'math/train-medium/numbers__is_prime_composed.txt', 'math/train-medium/numbers__round_number.txt', 'math/train-medium/polynomials__add.txt', 'math/train-medium/arithmetic__add_sub_multiple.txt', 'math/train-medium/algebra__polynomial_roots_composed.txt', 'math/train-medium/arithmetic__div.txt', 'math/train-medium/measurement__time.txt', 'math/train-medium/numbers__is_factor.txt', 'math/train-medium/numbers__lcm_composed.txt', 'math/train-medi

In [2]:
len(docs)

56

In [3]:
print("Matching CSV files:", glob.glob("**/qa*.csv", recursive=True))

Matching CSV files: []


In [4]:
import os, glob

print("Current Working Directory:", os.getcwd())
print("Top-level contents:", os.listdir("."))

matches = glob.glob("**/qa*.csv", recursive=True)
print("CSV matches:", matches)

Current Working Directory: /home/rc/dataproblems/src/projects/version/sandbox
Top-level contents: ['math', 'draft4.ipynb', 'edl.py', 'draft3.ipynb', 'cwd.py', 'sa_vectorizer.joblib', 'prototype.py', 'prototype2.py', 'esa_vectorizer.joblib', 'bash.sh', 'draft2.ipynb', 'checkembeddings.py', 'notes.sh', 'draft.ipynb', 'edl2.py', 'plots.py', '.ipynb_checkpoints']
CSV matches: []


In [5]:
import os

print("Working directory:", os.getcwd())
print("Listable items in sandbox root:", os.listdir("."))

Working directory: /home/rc/dataproblems/src/projects/version/sandbox
Listable items in sandbox root: ['math', 'draft4.ipynb', 'edl.py', 'draft3.ipynb', 'cwd.py', 'sa_vectorizer.joblib', 'prototype.py', 'prototype2.py', 'esa_vectorizer.joblib', 'bash.sh', 'draft2.ipynb', 'checkembeddings.py', 'notes.sh', 'draft.ipynb', 'edl2.py', 'plots.py', '.ipynb_checkpoints']


In [6]:
import os

print("Contents of 'math':", os.listdir("math"))

Contents of 'math': ['train-easy', 'train-medium', 'train-readme.txt', 'train-hard', 'extrapolate', 'interpolate']


In [7]:
import os

# List all joblib files in current dir
files = [f for f in os.listdir('.') if f.endswith('.joblib')]
print("Found vectorizer files:", files)

Found vectorizer files: ['sa_vectorizer.joblib', 'esa_vectorizer.joblib']


In [8]:
import joblib

sa_vec = joblib.load("sa_vectorizer.joblib")
esa_vec = joblib.load("esa_vectorizer.joblib")

print("sa_vectorizer vocab size:", len(sa_vec.get_feature_names_out()))
print("esa_vectorizer vocab size:", len(esa_vec.get_feature_names_out()))

sa_vectorizer vocab size: 10000
esa_vectorizer vocab size: 10000


In [9]:
# Example usage
sample_text = "Test question"
sa_vec_tfidf = sa_vec.transform([sample_text])
print("TF-IDF shape:", sa_vec_tfidf.shape)

TF-IDF shape: (1, 10000)


In [10]:
import os
from glob import glob

print("Working directory:", os.getcwd())
print("Top-level items:", os.listdir("."))

matches = glob("**/*", recursive=True)
print("All files and folders:")
for item in matches:
    print("  ", item)

Working directory: /home/rc/dataproblems/src/projects/version/sandbox
Top-level items: ['math', 'draft4.ipynb', 'edl.py', 'draft3.ipynb', 'cwd.py', 'sa_vectorizer.joblib', 'prototype.py', 'prototype2.py', 'esa_vectorizer.joblib', 'bash.sh', 'draft2.ipynb', 'checkembeddings.py', 'notes.sh', 'draft.ipynb', 'edl2.py', 'plots.py', '.ipynb_checkpoints']
All files and folders:
   math
   draft4.ipynb
   edl.py
   draft3.ipynb
   cwd.py
   sa_vectorizer.joblib
   prototype.py
   prototype2.py
   esa_vectorizer.joblib
   bash.sh
   draft2.ipynb
   checkembeddings.py
   notes.sh
   draft.ipynb
   edl2.py
   plots.py
   math/train-easy
   math/train-medium
   math/train-readme.txt
   math/train-hard
   math/extrapolate
   math/interpolate
   math/train-easy/numbers__place_value_composed.txt
   math/train-easy/arithmetic__simplify_surd.txt
   math/train-easy/arithmetic__mul_div_multiple.txt
   math/train-easy/calculus__differentiate_composed.txt
   math/train-easy/probability__swr_p_level_set.txt

In [None]:
from ydata_profiling import ProfileReport
import pandas as pd

df = pd.read_csv("math/train-medium/qa.csv")  # adjust to your actual path
profile = ProfileReport(df, title="Q/A Profiling Report", explorative=True)
profile.to_widget("qa_profile.html")

In [None]:
# moving to EDA/EDL draft 3