In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm.auto import tqdm
import itertools

# Import our custom modules
from src.config import config, PROJECT_ROOT
from src.features.build_features import get_embedding_model, find_representative_ngrams

# Register tqdm's pandas integration so that pandas objects expose
# `.progress_apply`, which a later cell uses to show a progress bar.
# The description given here is only the initial bar label: the keyword
# extraction cell calls `tqdm.pandas` again with a year-specific label.
tqdm.pandas(desc="Processing Documents")

In [7]:
# Load the processed dataset, keep one publication year, and load the
# embedding model used by the keyword-extraction step.

# Year to analyse. NOTE: the later cells still hard-code 2010 in their
# variable names and messages, so changing this value alone is not enough
# to re-target the whole notebook.
TARGET_YEAR = 2010

# 1. Load the full processed dataset
processed_data_path = PROJECT_ROOT / config.data.processed_path
# Report the configured path rather than the resolved one, so the saved
# notebook output does not embed a machine-specific absolute directory.
print(f"Loading full dataset from: {config.data.processed_path}")
full_df = pd.read_parquet(processed_data_path)
print(f"Successfully loaded {len(full_df)} total records.")

# 2. Keep only the rows published in TARGET_YEAR
print(f"\nFiltering for year {TARGET_YEAR}...")
# 'published' is expected to already hold the year as an integer, so a plain
# equality test selects the rows. `.copy()` makes `df` independent of
# `full_df`, so later changes to `df` cannot touch the full frame.
df = full_df[full_df['published'] == TARGET_YEAR].copy()
print(f"Found {len(df)} records for the year {TARGET_YEAR}.")
if df.empty:
    # An equality test against a non-matching dtype (e.g. strings or
    # datetimes) yields no rows without raising, so surface it explicitly.
    print(
        f"WARNING: no rows matched published == {TARGET_YEAR}; "
        f"'published' dtype is {full_df['published'].dtype}."
    )

# 3. Load the embedding model via the project helper (a sentence-transformer
#    model, per the original note in this cell)
embedding_model = get_embedding_model()

Loading full dataset from: C:\Users\lib9\weak-signals-new\data\processed\cv_arxiv_data_2010-2022.parquet
Successfully loaded 8269 total records.

Filtering for year 2010...
Found 25 records for the year 2010.


In [8]:
import itertools
from collections import Counter
from tqdm.auto import tqdm

# Re-register tqdm's pandas hook so the bar shown by `.progress_apply`
# carries a label specific to this run.
tqdm.pandas(desc="Processing Documents for 2010")

print(f"--- Processing {len(df)} documents for year 2010 ---")

# Run the n-gram extractor over every summary; the result is a Series whose
# elements are the per-document keyword collections. This is the slow step,
# hence the progress bar.
list_of_keyword_lists = df['summary'].progress_apply(
    lambda summary_text: find_representative_ngrams(summary_text, embedding_model)
)

# Concatenate the per-document results, in document order, into one flat
# list holding every keyword occurrence for the year.
all_keywords_for_2010 = [
    keyword
    for document_keywords in list_of_keyword_lists
    for keyword in document_keywords
]

# Tally occurrences, then keep just the keyword text of the (at most) 300
# most frequent entries, most frequent first.
keyword_counts = Counter(all_keywords_for_2010)
top_300_keywords = [term for term, _freq in keyword_counts.most_common(300)]

print(f"\nExtracted top {len(top_300_keywords)} keywords for 2010.")

--- Processing 25 documents for year 2010 ---


Processing Documents for 2010:   0%|          | 0/25 [00:00<?, ?it/s]


Extracted top 249 keywords for 2010.


In [9]:
# Import the standard list of English stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# 1. Collapse the top-keyword list into a set of distinct keywords
unique_keywords = set(top_300_keywords)
print(f"Total unique keywords before cleaning: {len(unique_keywords)}")

# 2. Discard a keyword when the whole string is an English stop word, is at
#    most one character long, or is made up solely of digits. The stop-word
#    check is an exact match on the complete keyword, so a multi-word n-gram
#    is not removed merely because one of its words is a stop word.
# 3. `sorted` returns the survivors as an alphabetically ordered list, which
#    keeps the output stable from run to run.
final_keywords_2010 = sorted(
    candidate
    for candidate in unique_keywords
    if not (
        candidate in ENGLISH_STOP_WORDS
        or len(candidate) <= 1
        or candidate.isdigit()
    )
)

print(f"Total keywords for 2010 after cleaning: {len(final_keywords_2010)}")
print("\nSample of final, cleaned keywords for 2010:")
print(final_keywords_2010[:25])

Total unique keywords before cleaning: 249
Total keywords for 2010 after cleaning: 249

Sample of final, cleaned keywords for 2010:
['3d model natural', 'accurately detecting pedestrians', 'alignment point sets', 'alternative online boosting', 'approach point detection', 'approximate spectral analysis', 'asymmetric boosting', 'asymmetric boosting methods', 'automatic video segmentation', 'boosting', 'boosting algorithm', 'boosting algorithms', 'boosting algorithms improve', 'boosting detectors', 'boosting like adaboost', 'bridge computer vision', 'bright mag stars', 'caenorhabditis elegans', 'caenorhabditis elegans elegans', 'calibration multi camera', 'camera positions orientations', 'cameras global coordinate', 'cascade boosting', 'cascade boosting framework', 'catalog real stars']
