In [1]:
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm
import itertools

# Import our custom modules
from src.config import config, PROJECT_ROOT
# Keyword-extraction helpers (embedding model loader, n-gram extraction, keyword normalization)
from src.features.build_features import get_embedding_model, find_representative_ngrams, normalize_keywords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Register tqdm's pandas integration so Series/DataFrame objects gain
# `progress_apply`, which is used below in the per-year keyword extraction loop.
# `desc` is the label shown in front of every progress bar.
tqdm.pandas(desc="Processing Documents")

In [3]:
# Resolve the processed-data location from the project config and read it
processed_data_path = PROJECT_ROOT / config.data.processed_path
df = pd.read_parquet(processed_data_path)

# Report how many rows were read, as a quick sanity check on the input
print(f"Loaded {df.shape[0]} records from {processed_data_path}")

# Embedding model handed to find_representative_ngrams in the extraction loop
embedding_model = get_embedding_model()

Loaded 8269 records from C:\Users\lib9\weak-signals-new\data\processed\cv_arxiv_data_2010-2022.parquet


In [4]:
# Derive a 'year' column (a straight copy of 'published') and split the frame per year
df['year'] = df['published']
yearly_chunks = dict(tuple(df.groupby('year')))
print(f"Sliced data into {len(yearly_chunks)} yearly chunks from {df['year'].min()} to {df['year'].max()}.")

# Maps each year to the normalized keywords produced for that year
yearly_final_keywords = {}

# Walk the years in ascending order so the log output reads chronologically
for year in sorted(yearly_chunks):
    chunk_df = yearly_chunks[year]
    print(f"\n--- Processing {len(chunk_df)} documents for year {year} ---")

    # One collection of candidate n-grams per abstract; progress_apply is the
    # tqdm-instrumented variant of Series.apply
    list_of_keyword_lists = chunk_df['summary'].progress_apply(
        lambda doc: find_representative_ngrams(doc, embedding_model)
    )

    # Flatten the per-document results and count occurrences across the year
    all_keywords_for_year = [kw for doc_keywords in list_of_keyword_lists for kw in doc_keywords]
    keyword_counts = Counter(all_keywords_for_year)

    # Keep only the 300 most frequent candidates (the counts themselves are discarded)
    top_300 = {entry[0] for entry in keyword_counts.most_common(300)}

    # Drop stop words, single-character strings and purely numeric strings.
    # NOTE: filtering happens after the top-300 cut, so fewer than 300 may remain.
    cleaned_keywords = {
        kw for kw in top_300
        if not (kw in ENGLISH_STOP_WORDS or len(kw) <= 1 or kw.isdigit())
    }

    # Run the surviving keywords through the project's normalize_keywords helper
    normalized_keywords = normalize_keywords(cleaned_keywords)

    yearly_final_keywords[year] = normalized_keywords
    print(f"Stored {len(normalized_keywords)} normalized keywords for {year}.")

Sliced data into 13 yearly chunks from 2010 to 2022.

--- Processing 25 documents for year 2010 ---


Processing Documents:   0%|          | 0/25 [00:00<?, ?it/s]

Stored 190 normalized keywords for 2010.

--- Processing 20 documents for year 2011 ---


Processing Documents:   0%|          | 0/20 [00:00<?, ?it/s]

Stored 149 normalized keywords for 2011.

--- Processing 63 documents for year 2012 ---


Processing Documents:   0%|          | 0/63 [00:00<?, ?it/s]

Stored 226 normalized keywords for 2012.

--- Processing 104 documents for year 2013 ---


Processing Documents:   0%|          | 0/104 [00:00<?, ?it/s]

Stored 216 normalized keywords for 2013.

--- Processing 129 documents for year 2014 ---


Processing Documents:   0%|          | 0/129 [00:00<?, ?it/s]

Stored 226 normalized keywords for 2014.

--- Processing 229 documents for year 2015 ---


Processing Documents:   0%|          | 0/229 [00:00<?, ?it/s]

Stored 236 normalized keywords for 2015.

--- Processing 406 documents for year 2016 ---


Processing Documents:   0%|          | 0/406 [00:00<?, ?it/s]

Stored 232 normalized keywords for 2016.

--- Processing 559 documents for year 2017 ---


Processing Documents:   0%|          | 0/559 [00:00<?, ?it/s]

Stored 226 normalized keywords for 2017.

--- Processing 817 documents for year 2018 ---


Processing Documents:   0%|          | 0/817 [00:00<?, ?it/s]

Stored 213 normalized keywords for 2018.

--- Processing 1035 documents for year 2019 ---


Processing Documents:   0%|          | 0/1035 [00:00<?, ?it/s]

Stored 216 normalized keywords for 2019.

--- Processing 1341 documents for year 2020 ---


Processing Documents:   0%|          | 0/1341 [00:00<?, ?it/s]

Stored 216 normalized keywords for 2020.

--- Processing 1710 documents for year 2021 ---


Processing Documents:   0%|          | 0/1710 [00:00<?, ?it/s]

Stored 210 normalized keywords for 2021.

--- Processing 1831 documents for year 2022 ---


Processing Documents:   0%|          | 0/1831 [00:00<?, ?it/s]

Stored 215 normalized keywords for 2022.


In [5]:
# Display the stored keywords year by year, in ascending year order

for year in sorted(yearly_final_keywords):
    keywords = yearly_final_keywords[year]
    # Sort so the printed order is stable from run to run
    sorted_keywords = sorted(keywords)

    print(f"\n--- Keywords for {year} ({len(sorted_keywords)} total) ---")
    # Print at most the first 300 keywords for the year
    print(sorted_keywords[:300])


--- Keywords for 2010 (190 total) ---
['3d model natural', 'accurately detecting pedestrians', 'alignment point sets', 'alternative online boosting', 'approach point detection', 'approximate spectral analysis', 'asymmetric boosting methods', 'automatic video segmentation', 'boosting algorithms improve', 'boosting like adaboost', 'bridge computer vision', 'bright mag stars', 'caenorhabditis elegans elegans', 'calibration multi camera', 'camera positions orientations', 'cameras global coordinate', 'cascade boosting framework', 'catalog real stars', 'centroid locations rigid', 'centroids coherently', 'centroids representing point', 'classic nn classification', 'classification graphical modeling', 'classification rely deforming', 'classifiers learned unn', 'coherent point drift', 'complexity image registration', 'computational understanding appearance', 'computer human vision', 'computer vision empirical', 'computer vision natural', 'conditional random fields', 'conventional boosting like

In [7]:
# Merge the per-year keyword collections into a single de-duplicated set.
# union() is called on an empty set instance rather than as the unbound
# `set.union(...)`: the unbound form raises TypeError when the dictionary is
# empty (no first argument) or when the first value is not an actual `set`,
# whereas `set().union(*iterables)` accepts zero or more arbitrary iterables.
final_unique_keywords = set().union(*yearly_final_keywords.values())

# Sorted list form of the combined vocabulary (sorted() accepts the set directly)
cvkeywords20102022normalized = sorted(final_unique_keywords)

# Report the total, then print only a bounded preview so the cell output stays
# readable; the complete list remains available in `cvkeywords20102022normalized`.
preview_size = 50
print(f"Created final combined list named 'cvkeywords20102022normalized' with {len(cvkeywords20102022normalized)} unique keywords.")
print("\nSample of the final list:")
print(cvkeywords20102022normalized[:preview_size])

Created final combined list named 'cvkeywords20102022normalized' with 2266 unique keywords.

Sample of the final list:
['2d photo object', '3d cnns', '3d convolutional neural', '3d correspondences', '3d detection', '3d face', '3d face reconstruction', '3d hand pose', '3d human', '3d human pose', '3d model natural', '3d motion tracking', '3d object detection', '3d object models', '3d object reconstruction', '3d point cloud', '3d point clouds', '3d pose estimation', '3d pose known', '3d pose object', '3d reconstruction', '3d reconstruction building', '3d reconstruction emerging', '3d recovery gaze', '3d semantic', '3d shape classification', '3d shape recognition', '3d shape representation', '3d shapes', '3d surface models', '6d pose estimation', 'ability lip read', 'able learn rotation', 'abstractions generalization gives', 'accuracy imagenet', 'accurately detecting pedestrians', 'acknowledged face recognition', 'acquiring gesture artificial', 'action classification', 'action detection',