# Domain Analysis & Exploration

This notebook performs domain-specific exploratory data analysis (EDA) on a text corpus: label distribution, top n-grams (overall and per class), a TF-IDF overview, a 2D projection of document embeddings (LSA + t-SNE), and keyword trends over time.

In [None]:
from pathlib import Path
import pandas as pd
from src.analysis import load_data, preprocess_df, top_ngrams, class_top_ngrams, compute_tfidf_matrix, lsa_embeddings, tsne_project, plot_scatter, label_distribution, compute_keyword_trends
# Location of the corpus CSV, relative to the notebook's working directory.
DATA = Path('data', 'synthetic_texts.csv')

# Only when the CSV is missing: call the generator. The import stays lazy so
# the generate_data module is required only on that path.
if not DATA.exists():
    from generate_data import generate
    generate(600)  # presumably the number of rows to synthesize -- TODO confirm

# Load and preprocess in one step; the cells below all read `df`.
df = preprocess_df(load_data(DATA))
print('Rows:', len(df))

In [None]:
# Label distribution: compute the summary first, then print it.
label_dist = label_distribution(df)
print(label_dist)

In [None]:
# Top unigrams and bigrams overall.
# One loop replaces the two copy-pasted print pairs; each entry pairs the
# heading to print with the ngram_range handed to top_ngrams.
for heading, span in (
    ('Top unigrams overall:', (1, 1)),
    ('Top bigrams overall:', (2, 2)),
):
    print(heading)
    print(top_ngrams(df['joined_tokens'], n=15, ngram_range=span))

In [None]:
# Per-class top terms.
# n=10 is requested from the helper, but only the first 8 entries of each
# value are printed below.
per_class = class_top_ngrams(df, n=10, ngram_range=(1, 1))
for class_key, terms in per_class.items():
    print(class_key, terms[:8])

## TF-IDF -> LSA -> t-SNE projection
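This projection helps visualize how documents cluster in 2D. The cell below plots the projection using each document's label; the same approach can be applied to the domain column.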

In [None]:
# Pipeline: TF-IDF matrix -> LSA embedding -> 2-component t-SNE projection -> saved scatter plot.
vec, X = compute_tfidf_matrix(df['joined_tokens'], max_features=2000)
emb, svd = lsa_embeddings(X, n_components=50)
print('LSA shape:', emb.shape)

# NOTE(review): if tsne_project wraps scikit-learn's TSNE, perplexity must be
# smaller than the number of rows -- confirm before running on tiny samples.
proj = tsne_project(emb, n_components=2, perplexity=30)

# One integer code per row, taken from the categorical encoding of the label column.
labels = df['label'].astype('category').cat.codes.to_numpy()

# Keep the output path in one place so the "Saved" message cannot drift from
# the file that is actually written.
tsne_png = Path('results/tsne_labels.png')
# A fresh checkout may not have results/ yet; creating it up front avoids a
# failure at save time (harmless if plot_scatter already creates the directory).
tsne_png.parent.mkdir(parents=True, exist_ok=True)
plot_scatter(proj, labels, tsne_png, title='LSA + t-SNE by label')
# as_posix() keeps the printed path identical on every platform.
print(f'Saved {tsne_png.as_posix()}')

## Keyword trends (example subset)
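Compute keyword counts over time for a few important tokens (`top_k=6`, `freq='7D'`). The cell below only builds the trend frames and prints their columns; plotting is left to the helper functions in `src.analysis`.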

In [None]:
# Build the keyword-trend frames; `trends` is iterated below as key -> frame pairs.
trends = compute_keyword_trends(
    df,
    domain_col='domain',
    text_col='joined_tokens',
    top_k=6,
    freq='7D',
)

# Show which columns each frame carries, one line per key.
for domain_key, trend_frame in trends.items():
    print(domain_key, trend_frame.columns.tolist())
print('Keyword trend frames computed; plot with plotly or save as PNG using helper functions in src.analysis')