## 1. Setup & Configuration

## 1. Setup & Configuration

In [None]:
from pathlib import Path
import warnings
import time
import gc

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
warnings.filterwarnings('ignore')

# Configure display
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 10)

# Directories
project_root = (Path.cwd() / '..').resolve()
data_dir = project_root / 'data' / 'processed'
models_dir = project_root / 'models'
models_dir.mkdir(exist_ok=True)

print(f'‚úì Project root: {project_root}')
print(f'‚úì Data directory: {data_dir}')
print(f'‚úì Models will be saved to: {models_dir}')

## 2. Load Cleaned Dataset

In [None]:
# Load cleaned jobs dataset
print('Loading cleaned dataset...')
df = pd.read_parquet(data_dir / 'clean_jobs.parquet')

print(f'\n‚úì Dataset loaded: {len(df):,} jobs')
print(f'Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB')
print(f'\nKey columns for vectorization:')
print(f'  - clean_text: {df["clean_text"].notna().sum():,} ({df["clean_text"].notna().mean()*100:.1f}%)')
print(f'  - skills: {df["skills"].notna().sum():,} ({df["skills"].notna().mean()*100:.1f}%)')
print(f'  - industries: {df["industries"].notna().sum():,} ({df["industries"].notna().mean()*100:.1f}%)')

# Sample job
print(f'\nSample job (first row):')
sample_job = df.iloc[0]
print(f'Title: {sample_job["title"]}')
print(f'Skills: {sample_job["skills"][:100]}...' if sample_job["skills"] else 'Skills: N/A')
print(f'Industry: {sample_job["industries"][:50]}...' if sample_job["industries"] else 'Industry: N/A')
print(f'Clean text length: {len(sample_job["clean_text"])} chars')

In [None]:
# Create sample for faster experimentation (10k jobs)
SAMPLE_SIZE = 10000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).copy()
print(f'Working with sample of {len(df_sample):,} jobs for faster iteration')

# Prepare text column
texts = df_sample['clean_text'].fillna('').values
print(f'\nText corpus: {len(texts):,} documents')
print(f'Avg length: {np.mean([len(t) for t in texts]):.0f} chars')

## 3. TF-IDF Baseline

Implement traditional TF-IDF vectorization as baseline.

In [None]:
# TF-IDF Vectorization
print('Fitting TF-IDF vectorizer...')
start_time = time.time()

tfidf = TfidfVectorizer(
    max_features=5000,      # Limit vocabulary size
    ngram_range=(1, 2),     # Unigrams + bigrams
    min_df=5,               # Ignore terms appearing in < 5 docs
    max_df=0.8,             # Ignore terms appearing in > 80% docs
    stop_words='english',   # Remove English stopwords
    lowercase=True,
    dtype=np.float32        # Use float32 for memory efficiency
)

tfidf_matrix = tfidf.fit_transform(texts)
tfidf_time = time.time() - start_time

print(f'\n‚úì TF-IDF completed in {tfidf_time:.2f}s')
print(f'Matrix shape: {tfidf_matrix.shape}')
print(f'Vocabulary size: {len(tfidf.vocabulary_):,} terms')
print(f'Matrix sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%')
print(f'Memory usage: {tfidf_matrix.data.nbytes / 1024**2:.1f} MB')

In [None]:
# Test TF-IDF similarity search
def search_tfidf(query_text, top_k=5):
    """Search for similar jobs using TF-IDF."""
    # Transform query
    query_vec = tfidf.transform([query_text])
    
    # Compute similarities
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Get top-K indices
    top_indices = similarities.argsort()[::-1][:top_k]
    
    # Return results
    results = []
    for idx in top_indices:
        results.append({
            'index': df_sample.index[idx],
            'title': df_sample.iloc[idx]['title'],
            'skills': df_sample.iloc[idx]['skills'],
            'industry': df_sample.iloc[idx]['industries'],
            'similarity': similarities[idx]
        })
    return results

# Test with sample query
query = "Python software engineer with machine learning experience"
print(f'Query: "{query}"\n')

start_time = time.time()
results = search_tfidf(query, top_k=5)
search_time = time.time() - start_time

print(f'Search completed in {search_time*1000:.1f}ms\n')
print('Top 5 Results:')
for i, result in enumerate(results, 1):
    print(f"\n{i}. {result['title']}")
    print(f"   Similarity: {result['similarity']:.3f}")
    print(f"   Skills: {result['skills'][:100]}..." if result['skills'] else "   Skills: N/A")
    print(f"   Industry: {result['industry'][:50]}..." if result['industry'] else "   Industry: N/A")

## 4. Sentence-Transformers (MiniLM)

Test modern embedding model for semantic understanding.

In [None]:
# Install sentence-transformers if needed
try:
    from sentence_transformers import SentenceTransformer
    print('‚úì sentence-transformers already installed')
except ImportError:
    print('Installing sentence-transformers...')
    import sys
    !{sys.executable} -m pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer
    print('‚úì sentence-transformers installed')

In [None]:
# Load MiniLM model
print('Loading sentence-transformers model...')
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
start_time = time.time()

model = SentenceTransformer(model_name)
load_time = time.time() - start_time

print(f'‚úì Model loaded in {load_time:.2f}s')
print(f'Model: {model_name}')
print(f'Embedding dimension: {model.get_sentence_embedding_dimension()}')
print(f'Max sequence length: {model.max_seq_length}')

In [None]:
# Encode texts with MiniLM
print(f'Encoding {len(texts):,} documents with MiniLM...')
start_time = time.time()

# Encode in batches for efficiency
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # L2 normalization for cosine similarity
)

encode_time = time.time() - start_time

print(f'\n‚úì Encoding completed in {encode_time:.2f}s')
print(f'Embeddings shape: {embeddings.shape}')
print(f'Memory usage: {embeddings.nbytes / 1024**2:.1f} MB')
print(f'Avg encoding speed: {len(texts)/encode_time:.0f} docs/sec')

In [None]:
# Test MiniLM similarity search
def search_minilm(query_text, top_k=5):
    """Search for similar jobs using MiniLM embeddings."""
    # Encode query
    query_emb = model.encode([query_text], normalize_embeddings=True)
    
    # Compute similarities (dot product since normalized)
    similarities = np.dot(embeddings, query_emb.T).flatten()
    
    # Get top-K indices
    top_indices = similarities.argsort()[::-1][:top_k]
    
    # Return results
    results = []
    for idx in top_indices:
        results.append({
            'index': df_sample.index[idx],
            'title': df_sample.iloc[idx]['title'],
            'skills': df_sample.iloc[idx]['skills'],
            'industry': df_sample.iloc[idx]['industries'],
            'similarity': similarities[idx]
        })
    return results

# Test with same query
print(f'Query: "{query}"\n')

start_time = time.time()
results_minilm = search_minilm(query, top_k=5)
search_time_minilm = time.time() - start_time

print(f'Search completed in {search_time_minilm*1000:.1f}ms\n')
print('Top 5 Results (MiniLM):')
for i, result in enumerate(results_minilm, 1):
    print(f"\n{i}. {result['title']}")
    print(f"   Similarity: {result['similarity']:.3f}")
    print(f"   Skills: {result['skills'][:100]}..." if result['skills'] else "   Skills: N/A")
    print(f"   Industry: {result['industry'][:50]}..." if result['industry'] else "   Industry: N/A")

## 5. Comparison: TF-IDF vs MiniLM

In [None]:
# Benchmark comparison
comparison = pd.DataFrame([
    {
        'Method': 'TF-IDF',
        'Training Time (s)': tfidf_time,
        'Vector Dim': tfidf_matrix.shape[1],
        'Memory (MB)': tfidf_matrix.data.nbytes / 1024**2,
        'Search Speed (ms)': search_time * 1000,
        'Sparse': 'Yes'
    },
    {
        'Method': 'MiniLM',
        'Training Time (s)': encode_time,
        'Vector Dim': embeddings.shape[1],
        'Memory (MB)': embeddings.nbytes / 1024**2,
        'Search Speed (ms)': search_time_minilm * 1000,
        'Sparse': 'No'
    }
])

print('\n' + '='*70)
print('BENCHMARK COMPARISON: TF-IDF vs MiniLM')
print('='*70)
print(comparison.to_string(index=False))
print('='*70)

In [None]:
# Visualize comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Training time
axes[0].bar(['TF-IDF', 'MiniLM'], [tfidf_time, encode_time], color=['skyblue', 'salmon'])
axes[0].set_ylabel('Time (seconds)', fontsize=10)
axes[0].set_title('Vectorization Time', fontsize=12, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# Memory usage
tfidf_mem = tfidf_matrix.data.nbytes / 1024**2
minilm_mem = embeddings.nbytes / 1024**2
axes[1].bar(['TF-IDF', 'MiniLM'], [tfidf_mem, minilm_mem], color=['skyblue', 'salmon'])
axes[1].set_ylabel('Memory (MB)', fontsize=10)
axes[1].set_title('Memory Usage', fontsize=12, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Vector dimensions
axes[2].bar(['TF-IDF', 'MiniLM'], [tfidf_matrix.shape[1], embeddings.shape[1]], color=['skyblue', 'salmon'])
axes[2].set_ylabel('Dimensions', fontsize=10)
axes[2].set_title('Vector Dimensionality', fontsize=12, fontweight='bold')
axes[2].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(project_root / 'images' / 'model_comparison.png', dpi=150, bbox_inches='tight')
print('‚úì Saved comparison plot to images/model_comparison.png')
plt.show()

## 6. Quality Evaluation

Test with multiple queries to assess recommendation quality.

In [None]:
# Test queries
test_queries = [
    "Python backend developer with API experience",
    "Registered nurse with emergency room experience",
    "Sales manager with B2B software experience",
    "Data scientist machine learning deep learning",
    "Project manager agile scrum certification"
]

print('Testing recommendation quality...\n')

for i, q in enumerate(test_queries, 1):
    print(f'\n{"="*70}')
    print(f'Query {i}: "{q}"')
    print('='*70)
    
    # TF-IDF results
    print('\n[TF-IDF Results]')
    tfidf_results = search_tfidf(q, top_k=3)
    for j, r in enumerate(tfidf_results, 1):
        print(f'{j}. {r["title"]} (sim: {r["similarity"]:.3f})')
    
    # MiniLM results
    print('\n[MiniLM Results]')
    minilm_results = search_minilm(q, top_k=3)
    for j, r in enumerate(minilm_results, 1):
        print(f'{j}. {r["title"]} (sim: {r["similarity"]:.3f})')

## 7. Save Models & Embeddings

In [None]:
import pickle
from scipy.sparse import save_npz

# Save TF-IDF
print('Saving models and embeddings...')

# TF-IDF vectorizer
with open(models_dir / 'tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
print('‚úì Saved TF-IDF vectorizer')

# TF-IDF matrix (sparse)
save_npz(models_dir / 'tfidf_matrix.npz', tfidf_matrix)
print('‚úì Saved TF-IDF matrix')

# MiniLM embeddings (dense)
np.save(models_dir / 'minilm_embeddings.npy', embeddings)
print('‚úì Saved MiniLM embeddings')

# Sample indices for reference
sample_indices = df_sample.index.tolist()
with open(models_dir / 'sample_indices.pkl', 'wb') as f:
    pickle.dump(sample_indices, f)
print('‚úì Saved sample indices')

print(f'\nAll artifacts saved to: {models_dir}')

## 8. FAISS Integration

For faster similarity search on larger datasets.

In [None]:
# Install FAISS if needed
try:
    import faiss
    print('‚úì FAISS already installed')
except ImportError:
    print('Installing FAISS-CPU...')
    import sys
    !{sys.executable} -m pip install -q faiss-cpu
    import faiss
    print('‚úì FAISS installed')

In [None]:
# Build FAISS index for MiniLM embeddings
print('Building FAISS index...')
start_time = time.time()

# Use IndexFlatIP for inner product (cosine similarity with normalized vectors)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings.astype('float32'))

build_time = time.time() - start_time

print(f'‚úì FAISS index built in {build_time:.2f}s')
print(f'Index size: {index.ntotal:,} vectors')
print(f'Index dimension: {index.d}')

In [None]:
# Test FAISS search
def search_faiss(query_text, top_k=5):
    """Search using FAISS index."""
    # Encode query
    query_emb = model.encode([query_text], normalize_embeddings=True).astype('float32')
    
    # Search
    similarities, indices = index.search(query_emb, top_k)
    
    # Return results
    results = []
    for i, idx in enumerate(indices[0]):
        results.append({
            'index': df_sample.index[idx],
            'title': df_sample.iloc[idx]['title'],
            'skills': df_sample.iloc[idx]['skills'],
            'industry': df_sample.iloc[idx]['industries'],
            'similarity': similarities[0][i]
        })
    return results

# Test
print(f'Query: "{query}"\n')
start_time = time.time()
faiss_results = search_faiss(query, top_k=5)
search_time_faiss = time.time() - start_time

print(f'FAISS search completed in {search_time_faiss*1000:.2f}ms\n')
print('Top 5 Results:')
for i, result in enumerate(faiss_results, 1):
    print(f"{i}. {result['title']} (sim: {result['similarity']:.3f})")

In [None]:
# Save FAISS index
faiss.write_index(index, str(models_dir / 'faiss_index.bin'))
print(f'‚úì FAISS index saved to {models_dir / "faiss_index.bin"}')

## 9. Summary & Recommendations

In [None]:
print('\n' + '='*70)
print('DAY 4 SUMMARY: VECTORIZATION EXPERIMENTS')
print('='*70)

print('\nüìä Models Tested:')
print('  1. TF-IDF (Baseline)')
print('  2. MiniLM (sentence-transformers/all-MiniLM-L6-v2)')
print('  3. FAISS (Fast similarity search)')

print('\n‚ö° Performance:')
print(f'  - TF-IDF: {tfidf_time:.1f}s training, {tfidf_mem:.1f} MB')
print(f'  - MiniLM: {encode_time:.1f}s encoding, {minilm_mem:.1f} MB')
print(f'  - FAISS: {build_time:.2f}s indexing, {search_time_faiss*1000:.2f}ms search')

print('\nüí° Recommendations:')
print('  ‚úì Use MiniLM for semantic understanding')
print('  ‚úì Use FAISS for fast search on full dataset')
print('  ‚úì TF-IDF as fallback for keyword matching')
print('  ‚úì Combine both: hybrid ranking (TF-IDF + MiniLM)')

print('\nüìÅ Saved Artifacts:')
print(f'  - {models_dir / "tfidf_vectorizer.pkl"}')
print(f'  - {models_dir / "tfidf_matrix.npz"}')
print(f'  - {models_dir / "minilm_embeddings.npy"}')
print(f'  - {models_dir / "faiss_index.bin"}')
print(f'  - {models_dir / "sample_indices.pkl"}')

print('\nüöÄ Next Steps (Day 5):')
print('  1. Create src/vector_store.py module')
print('  2. Implement get_recommendations() function')
print('  3. Add filtering (location, work type, salary)')
print('  4. Write unit tests')
print('  5. Evaluate Precision@K')

print('\n' + '='*70)
print('‚úÖ Day 4 Complete - Ready for Recommendation Engine')
print('='*70)

# Day 4: Model Experimentation - Vectorization

**Date**: November 24, 2025  
**Goal**: Compare TF-IDF vs MiniLM embeddings for job recommendation

## Objectives
1. Implement TF-IDF baseline vectorization
2. Test sentence-transformers MiniLM model
3. Benchmark quality, speed, and memory usage
4. Create vector store with FAISS
5. Evaluate recommendation quality

In [3]:
from pathlib import Path
import warnings
import time
import gc

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
warnings.filterwarnings('ignore')

# Configure display
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 10)

# Directories
project_root = (Path.cwd() / '..').resolve()
data_dir = project_root / 'data' / 'processed'
models_dir = project_root / 'models'
models_dir.mkdir(exist_ok=True)

print(f'‚úì Project root: {project_root}')
print(f'‚úì Data directory: {data_dir}')
print(f'‚úì Models will be saved to: {models_dir}')

‚úì Project root: /home/sakana/Code/DS-RS
‚úì Data directory: /home/sakana/Code/DS-RS/data/processed
‚úì Models will be saved to: /home/sakana/Code/DS-RS/models


## 2. Load Cleaned Dataset

## 3. TF-IDF Baseline

Implement traditional TF-IDF vectorization as baseline.

## 4. Sentence-Transformers (MiniLM)

Test modern embedding model for semantic understanding.

## 5. Comparison: TF-IDF vs MiniLM

In [4]:
# Load cleaned jobs dataset
print('Loading cleaned dataset...')
df = pd.read_parquet(data_dir / 'clean_jobs.parquet')

print(f'\n‚úì Dataset loaded: {len(df):,} jobs')
print(f'Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB')
print(f'\nKey columns for vectorization:')
print(f'  - clean_text: {df["clean_text"].notna().sum():,} ({df["clean_text"].notna().mean()*100:.1f}%)')
print(f'  - skills: {df["skills"].notna().sum():,} ({df["skills"].notna().mean()*100:.1f}%)')
print(f'  - industries: {df["industries"].notna().sum():,} ({df["industries"].notna().mean()*100:.1f}%)')

# Sample job
print(f'\nSample job (first row):')
sample_job = df.iloc[0]
print(f'Title: {sample_job["title"]}')
print(f'Skills: {sample_job["skills"][:100]}...' if sample_job["skills"] else 'Skills: N/A')
print(f'Industry: {sample_job["industries"][:50]}...' if sample_job["industries"] else 'Industry: N/A')
print(f'Clean text length: {len(sample_job["clean_text"])} chars')

Loading cleaned dataset...

‚úì Dataset loaded: 123,842 jobs

‚úì Dataset loaded: 123,842 jobs
Memory usage: 2.09 GB

Key columns for vectorization:
Memory usage: 2.09 GB

Key columns for vectorization:


KeyError: 'clean_text'

In [None]:
# Create sample for faster experimentation (10k jobs)
SAMPLE_SIZE = 10000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).copy()
print(f'Working with sample of {len(df_sample):,} jobs for faster iteration')

# Prepare text column
texts = df_sample['clean_text'].fillna('').values
print(f'\nText corpus: {len(texts):,} documents')
print(f'Avg length: {np.mean([len(t) for t in texts]):.0f} chars')

In [None]:
# TF-IDF Vectorization
print('Fitting TF-IDF vectorizer...')
start_time = time.time()

tfidf = TfidfVectorizer(
    max_features=5000,      # Limit vocabulary size
    ngram_range=(1, 2),     # Unigrams + bigrams
    min_df=5,               # Ignore terms appearing in < 5 docs
    max_df=0.8,             # Ignore terms appearing in > 80% docs
    stop_words='english',   # Remove English stopwords
    lowercase=True,
    dtype=np.float32        # Use float32 for memory efficiency
)

tfidf_matrix = tfidf.fit_transform(texts)
tfidf_time = time.time() - start_time

print(f'\n‚úì TF-IDF completed in {tfidf_time:.2f}s')
print(f'Matrix shape: {tfidf_matrix.shape}')
print(f'Vocabulary size: {len(tfidf.vocabulary_):,} terms')
print(f'Matrix sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%')
print(f'Memory usage: {tfidf_matrix.data.nbytes / 1024**2:.1f} MB')

In [None]:
# Test TF-IDF similarity search
def search_tfidf(query_text, top_k=5):
    """Search for similar jobs using TF-IDF."""
    # Transform query
    query_vec = tfidf.transform([query_text])
    
    # Compute similarities
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Get top-K indices
    top_indices = similarities.argsort()[::-1][:top_k]
    
    # Return results
    results = []
    for idx in top_indices:
        results.append({
            'index': df_sample.index[idx],
            'title': df_sample.iloc[idx]['title'],
            'skills': df_sample.iloc[idx]['skills'],
            'industry': df_sample.iloc[idx]['industries'],
            'similarity': similarities[idx]
        })
    return results

# Test with sample query
query = "Python software engineer with machine learning experience"
print(f'Query: "{query}"\n')

start_time = time.time()
results = search_tfidf(query, top_k=5)
search_time = time.time() - start_time

print(f'Search completed in {search_time*1000:.1f}ms\n')
print('Top 5 Results:')
for i, result in enumerate(results, 1):
    print(f"\n{i}. {result['title']}")
    print(f"   Similarity: {result['similarity']:.3f}")
    print(f"   Skills: {result['skills'][:100]}..." if result['skills'] else "   Skills: N/A")
    print(f"   Industry: {result['industry'][:50]}..." if result['industry'] else "   Industry: N/A")

In [None]:
# Install sentence-transformers if needed
try:
    from sentence_transformers import SentenceTransformer
    print('‚úì sentence-transformers already installed')
except ImportError:
    print('Installing sentence-transformers...')
    import sys
    !{sys.executable} -m pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer
    print('‚úì sentence-transformers installed')

In [None]:
# Load MiniLM model
print('Loading sentence-transformers model...')
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
start_time = time.time()

model = SentenceTransformer(model_name)
load_time = time.time() - start_time

print(f'‚úì Model loaded in {load_time:.2f}s')
print(f'Model: {model_name}')
print(f'Embedding dimension: {model.get_sentence_embedding_dimension()}')
print(f'Max sequence length: {model.max_seq_length}')

In [None]:
# Encode texts with MiniLM
print(f'Encoding {len(texts):,} documents with MiniLM...')
start_time = time.time()

# Encode in batches for efficiency
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # L2 normalization for cosine similarity
)

encode_time = time.time() - start_time

print(f'\n‚úì Encoding completed in {encode_time:.2f}s')
print(f'Embeddings shape: {embeddings.shape}')
print(f'Memory usage: {embeddings.nbytes / 1024**2:.1f} MB')
print(f'Avg encoding speed: {len(texts)/encode_time:.0f} docs/sec')

In [None]:
# Test MiniLM similarity search
def search_minilm(query_text, top_k=5):
    """Search for similar jobs using MiniLM embeddings."""
    # Encode query
    query_emb = model.encode([query_text], normalize_embeddings=True)
    
    # Compute similarities (dot product since normalized)
    similarities = np.dot(embeddings, query_emb.T).flatten()
    
    # Get top-K indices
    top_indices = similarities.argsort()[::-1][:top_k]
    
    # Return results
    results = []
    for idx in top_indices:
        results.append({
            'index': df_sample.index[idx],
            'title': df_sample.iloc[idx]['title'],
            'skills': df_sample.iloc[idx]['skills'],
            'industry': df_sample.iloc[idx]['industries'],
            'similarity': similarities[idx]
        })
    return results

# Test with same query
print(f'Query: "{query}"\n')

start_time = time.time()
results_minilm = search_minilm(query, top_k=5)
search_time_minilm = time.time() - start_time

print(f'Search completed in {search_time_minilm*1000:.1f}ms\n')
print('Top 5 Results (MiniLM):')
for i, result in enumerate(results_minilm, 1):
    print(f"\n{i}. {result['title']}")
    print(f"   Similarity: {result['similarity']:.3f}")
    print(f"   Skills: {result['skills'][:100]}..." if result['skills'] else "   Skills: N/A")
    print(f"   Industry: {result['industry'][:50]}..." if result['industry'] else "   Industry: N/A")

## 8. FAISS Integration

For faster similarity search on larger datasets.

In [None]:
# Install FAISS if needed
try:
    import faiss
    print('‚úì FAISS already installed')
except ImportError:
    print('Installing FAISS-CPU...')
    import sys
    !{sys.executable} -m pip install -q faiss-cpu
    import faiss
    print('‚úì FAISS installed')

In [None]:
# Build FAISS index for MiniLM embeddings
print('Building FAISS index...')
start_time = time.time()

# Use IndexFlatIP for inner product (cosine similarity with normalized vectors)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings.astype('float32'))

build_time = time.time() - start_time

print(f'‚úì FAISS index built in {build_time:.2f}s')
print(f'Index size: {index.ntotal:,} vectors')
print(f'Index dimension: {index.d}')

In [None]:
# Test FAISS search
def search_faiss(query_text, top_k=5):
    """Search using FAISS index."""
    # Encode query
    query_emb = model.encode([query_text], normalize_embeddings=True).astype('float32')
    
    # Search
    similarities, indices = index.search(query_emb, top_k)
    
    # Return results
    results = []
    for i, idx in enumerate(indices[0]):
        results.append({
            'index': df_sample.index[idx],
            'title': df_sample.iloc[idx]['title'],
            'skills': df_sample.iloc[idx]['skills'],
            'industry': df_sample.iloc[idx]['industries'],
            'similarity': similarities[0][i]
        })
    return results

# Test
print(f'Query: "{query}"\n')
start_time = time.time()
faiss_results = search_faiss(query, top_k=5)
search_time_faiss = time.time() - start_time

print(f'FAISS search completed in {search_time_faiss*1000:.2f}ms\n')
print('Top 5 Results:')
for i, result in enumerate(faiss_results, 1):
    print(f"{i}. {result['title']} (sim: {result['similarity']:.3f})")

In [None]:
# Save FAISS index
faiss.write_index(index, str(models_dir / 'faiss_index.bin'))
print(f'‚úì FAISS index saved to {models_dir / "faiss_index.bin"}')

## 9. Summary & Recommendations

In [None]:
print('\n' + '='*70)
print('DAY 4 SUMMARY: VECTORIZATION EXPERIMENTS')
print('='*70)

print('\nüìä Models Tested:')
print('  1. TF-IDF (Baseline)')
print('  2. MiniLM (sentence-transformers/all-MiniLM-L6-v2)')
print('  3. FAISS (Fast similarity search)')

print('\n‚ö° Performance:')
print(f'  - TF-IDF: {tfidf_time:.1f}s training, {tfidf_mem:.1f} MB')
print(f'  - MiniLM: {encode_time:.1f}s encoding, {minilm_mem:.1f} MB')
print(f'  - FAISS: {build_time:.2f}s indexing, {search_time_faiss*1000:.2f}ms search')

print('\nüí° Recommendations:')
print('  ‚úì Use MiniLM for semantic understanding')
print('  ‚úì Use FAISS for fast search on full dataset')
print('  ‚úì TF-IDF as fallback for keyword matching')
print('  ‚úì Combine both: hybrid ranking (TF-IDF + MiniLM)')

print('\nüìÅ Saved Artifacts:')
print(f'  - {models_dir / "tfidf_vectorizer.pkl"}')
print(f'  - {models_dir / "tfidf_matrix.npz"}')
print(f'  - {models_dir / "minilm_embeddings.npy"}')
print(f'  - {models_dir / "faiss_index.bin"}')
print(f'  - {models_dir / "sample_indices.pkl"}')

print('\nüöÄ Next Steps (Day 5):')
print('  1. Create src/vector_store.py module')
print('  2. Implement get_recommendations() function')
print('  3. Add filtering (location, work type, salary)')
print('  4. Write unit tests')
print('  5. Evaluate Precision@K')

print('\n' + '='*70)
print('‚úÖ Day 4 Complete - Ready for Recommendation Engine')
print('='*70)

In [None]:
# Benchmark comparison
comparison = pd.DataFrame([
    {
        'Method': 'TF-IDF',
        'Training Time (s)': tfidf_time,
        'Vector Dim': tfidf_matrix.shape[1],
        'Memory (MB)': tfidf_matrix.data.nbytes / 1024**2,
        'Search Speed (ms)': search_time * 1000,
        'Sparse': 'Yes'
    },
    {
        'Method': 'MiniLM',
        'Training Time (s)': encode_time,
        'Vector Dim': embeddings.shape[1],
        'Memory (MB)': embeddings.nbytes / 1024**2,
        'Search Speed (ms)': search_time_minilm * 1000,
        'Sparse': 'No'
    }
])

print('\n' + '='*70)
print('BENCHMARK COMPARISON: TF-IDF vs MiniLM')
print('='*70)
print(comparison.to_string(index=False))
print('='*70)

In [None]:
# Visualize comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Training time
axes[0].bar(['TF-IDF', 'MiniLM'], [tfidf_time, encode_time], color=['skyblue', 'salmon'])
axes[0].set_ylabel('Time (seconds)', fontsize=10)
axes[0].set_title('Vectorization Time', fontsize=12, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# Memory usage
tfidf_mem = tfidf_matrix.data.nbytes / 1024**2
minilm_mem = embeddings.nbytes / 1024**2
axes[1].bar(['TF-IDF', 'MiniLM'], [tfidf_mem, minilm_mem], color=['skyblue', 'salmon'])
axes[1].set_ylabel('Memory (MB)', fontsize=10)
axes[1].set_title('Memory Usage', fontsize=12, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Vector dimensions
axes[2].bar(['TF-IDF', 'MiniLM'], [tfidf_matrix.shape[1], embeddings.shape[1]], color=['skyblue', 'salmon'])
axes[2].set_ylabel('Dimensions', fontsize=10)
axes[2].set_title('Vector Dimensionality', fontsize=12, fontweight='bold')
axes[2].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(project_root / 'images' / 'model_comparison.png', dpi=150, bbox_inches='tight')
print('‚úì Saved comparison plot to images/model_comparison.png')
plt.show()

## 6. Quality Evaluation

Test with multiple queries to assess recommendation quality.

In [None]:
# Test queries
test_queries = [
    "Python backend developer with API experience",
    "Registered nurse with emergency room experience",
    "Sales manager with B2B software experience",
    "Data scientist machine learning deep learning",
    "Project manager agile scrum certification"
]

print('Testing recommendation quality...\n')

for i, q in enumerate(test_queries, 1):
    print(f'\n{"="*70}')
    print(f'Query {i}: "{q}"')
    print('='*70)
    
    # TF-IDF results
    print('\n[TF-IDF Results]')
    tfidf_results = search_tfidf(q, top_k=3)
    for j, r in enumerate(tfidf_results, 1):
        print(f'{j}. {r["title"]} (sim: {r["similarity"]:.3f})')
    
    # MiniLM results
    print('\n[MiniLM Results]')
    minilm_results = search_minilm(q, top_k=3)
    for j, r in enumerate(minilm_results, 1):
        print(f'{j}. {r["title"]} (sim: {r["similarity"]:.3f})')

## 7. Save Models & Embeddings

In [None]:
import pickle
from scipy.sparse import save_npz

# Save TF-IDF
print('Saving models and embeddings...')

# TF-IDF vectorizer
with open(models_dir / 'tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
print('‚úì Saved TF-IDF vectorizer')

# TF-IDF matrix (sparse)
save_npz(models_dir / 'tfidf_matrix.npz', tfidf_matrix)
print('‚úì Saved TF-IDF matrix')

# MiniLM embeddings (dense)
np.save(models_dir / 'minilm_embeddings.npy', embeddings)
print('‚úì Saved MiniLM embeddings')

# Sample indices for reference
sample_indices = df_sample.index.tolist()
with open(models_dir / 'sample_indices.pkl', 'wb') as f:
    pickle.dump(sample_indices, f)
print('‚úì Saved sample indices')

print(f'\nAll artifacts saved to: {models_dir}')