In [2]:
# Cell 1: imports and plotting theme
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import config, PROJECT_ROOT
# Memory-efficient (sparse) feature helpers defined in src/features/build_features.py
from src.features.build_features import extract_keywords_sparse, calculate_dov_dod_from_sparse

# Select seaborn's "whitegrid" style as the plotting theme for this notebook.
sns.set_theme(style="whitegrid")

In [3]:
# Cell 2: load the processed corpus and compute keyword signals (DoV / DoD).
# Relies on Cell 1 for pd, config, PROJECT_ROOT, extract_keywords_sparse and
# calculate_dov_dod_from_sparse.

# Build the data file path from the project root and the configured location.
processed_data_path = PROJECT_ROOT / config.data.processed_path
print(f"Loading data from: {processed_data_path}")

# Load the processed data from the Parquet file.
df = pd.read_parquet(processed_data_path)
print(f"Successfully loaded {len(df)} records.")

# Fail early, with an actionable message, if the text column used below is
# missing (otherwise the lookup raises a bare KeyError: 'summary').
if "summary" not in df.columns:
    raise KeyError(
        f"Expected a 'summary' column in {processed_data_path}; "
        f"found columns: {list(df.columns)}"
    )

# 1. Extract keywords from the 'summary' text into a sparse matrix.
print("Extracting keywords (sparse)...")
X_sparse, feature_names = extract_keywords_sparse(df['summary'])

# 2. Calculate signals directly from the sparse matrix.
print("Calculating DoV and DoD signals (from sparse)...")
signals_df = calculate_dov_dod_from_sparse(X_sparse, feature_names)

# Sort explicitly so the heading below holds regardless of the row order the
# helper returns; a stable sort leaves an already TF-sorted frame unchanged.
# Assumes signals_df has a 'TF' column, as in this cell's saved output.
# signals_df itself is left untouched for any later cells.
print("\nTop 15 Signals by Term Frequency:")
display(signals_df.sort_values("TF", ascending=False, kind="stable").head(15))

2025-09-25 12:51:37,064 - INFO - Extracting keywords with n-gram range (1, 3)...


Loading data from: C:\Users\lib9\weak-signals-new\data\processed\cv_arxiv_data_2010-2022.parquet
Successfully loaded 8269 records.
Extracting keywords (sparse)...


2025-09-25 12:51:40,846 - INFO - Extracted 1307961 unique keywords into a sparse matrix.


Calculating DoV and DoD signals (from sparse)...


2025-09-25 12:51:41,071 - INFO - Calculated DoV and DoD for 1307961 keywords.



Top 15 Signals by Term Frequency:


Unnamed: 0,TF,DF,DoV,DoD
vision,10611,8205,1.0,0.99226
computer,9715,8188,0.915559,0.990204
computer vision,9301,8185,0.876543,0.989842
learning,8225,4013,0.775139,0.485307
image,7904,3492,0.744887,0.4223
data,6953,3247,0.655263,0.392671
based,6260,3788,0.589954,0.458097
model,5666,2999,0.533974,0.36268
deep,5174,2938,0.487607,0.355303
images,5035,2604,0.474508,0.314911
