In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import time

# Load data
data = "/home/SaiKashyap/ner/translation_data.csv"
df = pd.read_csv(data)

# Fix the text column preparation
# Check if English column contains strings or lists
if df['English'].dtype == 'object' and isinstance(df['English'].iloc[0], str):
    # If already strings, use as is
    df['text'] = df['English']
else:
    # If lists or other structures, join them
    df['text'] = df['English'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

    # Fill NaN values with empty string

# Or drop rows with NaN values
df = df.dropna(subset=['text'])

# Then vectorize
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

# OPTION 1: FASTEST FOR VERY LARGE DATASETS (>1M rows)
start_time = time.time()

# Step 1: Generate term frequencies with hashing (memory efficient)
hasher = HashingVectorizer(
    n_features=2**20,            # 1M features to minimize collisions
    ngram_range=(1, 2),          # Include bigrams for better context
    alternate_sign=False,        # No negative values for better interpretability
    dtype=np.float32             # Use float32 to reduce memory by 50% vs float64
)
X_counts = hasher.transform(df['text'])

# Step 2: Convert counts to TF-IDF
tfidf_transformer = TfidfTransformer(
    sublinear_tf=True,           # Apply 1+log(tf) scaling
    use_idf=True,                # Apply inverse document frequency weighting
    smooth_idf=True              # Add 1 to document frequencies to prevent division by zero
)
X_tfidf = tfidf_transformer.fit_transform(X_counts)

print(f"HashingVectorizer + TfidfTransformer time: {time.time() - start_time:.2f} seconds")
print(f"Sparse matrix shape: {X_tfidf.shape}")
print(f"Sparse matrix memory: {X_tfidf.data.nbytes / 1024 / 1024:.2f} MB")

# OPTION 2: RECOMMENDED FOR DATASETS <1M ROWS (cleaner, with feature names)
start_time = time.time()

# One-step TF-IDF computation
vectorizer = TfidfVectorizer(
    max_features=50_000,         # Limit vocabulary size
    sublinear_tf=True,           # Apply 1+log(tf) scaling
    ngram_range=(1, 2),          # Include unigrams and bigrams
    min_df=2,                    # Ignore terms appearing in less than 2 documents
    max_df=0.95,                 # Ignore terms appearing in more than 95% of documents
    dtype=np.float32             # Use float32 for memory efficiency
)
X_tfidf_direct = vectorizer.fit_transform(df['text'])

print(f"TfidfVectorizer time: {time.time() - start_time:.2f} seconds")

# Example query function
def search_similar_documents(query_text, top_n=5):
    query_vec = vectorizer.transform([query_text])
    # Compute cosine similarity
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(query_vec, X_tfidf_direct).flatten()
    # Get top N indices
    top_indices = similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices][['English', 'text']], similarities[top_indices]

# Example usage:
# similar_docs, scores = search_similar_documents("your query text", top_n=5)
# print(similar_docs)
# print(scores)



HashingVectorizer + TfidfTransformer time: 4.00 seconds
Sparse matrix shape: (141356, 1048576)
Sparse matrix memory: 23.02 MB
TfidfVectorizer time: 9.40 seconds


In [2]:
# 1. Process dataframe in chunks to reduce memory pressure
chunk_size = 10000
X_list = []

for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i+chunk_size]
    X_chunk = vectorizer.transform(chunk['text'])
    X_list.append(X_chunk)

from scipy import sparse
X_tfidf = sparse.vstack(X_list)

# 2. Use multiprocessing for parallel processing
from joblib import Parallel, delayed

def process_chunk(chunk):
    return vectorizer.transform(chunk)

chunks = [df['text'][i:i+chunk_size] for i in range(0, len(df), chunk_size)]
X_list = Parallel(n_jobs=-1)(delayed(process_chunk)(chunk) for chunk in chunks)
X_tfidf = sparse.vstack(X_list)

# 3. Save the sparse matrix efficiently for later use
from scipy.sparse import save_npz
save_npz('tfidf_matrix.npz', X_tfidf)

# 4. Load the matrix when needed
from scipy.sparse import load_npz
X_tfidf = load_npz('tfidf_matrix.npz')


In [3]:
 similar_docs, scores = search_similar_documents("Narada rishi", top_n=5)
print(similar_docs)
print(scores)


                                                  English  \
126227                      The lute of Narada is Mahati.   
136042                              This is a rishi/sage.   
67658   There Sthulaksha, Shavalaksha, Kanwa, Medhatit...   
110566                            Narada wandered in sky.   
133350           Saint Narada visits hermitage of Valmiki   

                                                     text  
126227                      The lute of Narada is Mahati.  
136042                              This is a rishi/sage.  
67658   There Sthulaksha, Shavalaksha, Kanwa, Medhatit...  
110566                            Narada wandered in sky.  
133350           Saint Narada visits hermitage of Valmiki  
[0.42471656 0.3312608  0.32149124 0.30741057 0.28697023]


In [4]:
# Libraries
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For Milvus integration
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

# Load data
data = "/home/SaiKashyap/ner/translation_data.csv"
df = pd.read_csv(data)

# Fix the text column preparation
if df['English'].dtype == 'object' and isinstance(df['English'].iloc[0], str):
    df['text'] = df['English']
else:
    df['text'] = df['English'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Drop rows with NaN values
df = df.dropna(subset=['text'])
print(f"Dataset size: {len(df)} documents")

# Reset index to use as document IDs
df.reset_index(drop=True, inplace=True)

#########################
# PART 1: SCIKIT-LEARN TFIDF IMPLEMENTATION
#########################

print("\n--- SCIKIT-LEARN IMPLEMENTATION ---")

# OPTION 1: HashingVectorizer + TfidfTransformer (memory efficient)
start_time = time.time()

hasher = HashingVectorizer(
    n_features=2**18,            # 262K features (reduced for comparison fairness)
    ngram_range=(1, 2),          
    alternate_sign=False,        
    dtype=np.float32             
)
X_counts = hasher.transform(df['text'])

tfidf_transformer = TfidfTransformer(
    sublinear_tf=True,           
    use_idf=True,                
    smooth_idf=True              
)
X_tfidf = tfidf_transformer.fit_transform(X_counts)

hashing_time = time.time() - start_time
print(f"HashingVectorizer + TfidfTransformer time: {hashing_time:.2f} seconds")
print(f"Sparse matrix shape: {X_tfidf.shape}")
print(f"Sparse matrix memory: {X_tfidf.data.nbytes / 1024 / 1024:.2f} MB")

# OPTION 2: One-step TfidfVectorizer (with feature names)
start_time = time.time()

vectorizer = TfidfVectorizer(
    max_features=50_000,         
    sublinear_tf=True,           
    ngram_range=(1, 2),          
    min_df=2,                    
    max_df=0.95,                 
    dtype=np.float32             
)
X_tfidf_direct = vectorizer.fit_transform(df['text'])

tfidf_time = time.time() - start_time
print(f"TfidfVectorizer time: {tfidf_time:.2f} seconds")

# Test scikit-learn search speed
def sklearn_search(query_text, top_n=5):
    start_time = time.time()
    query_vec = vectorizer.transform([query_text])
    similarities = cosine_similarity(query_vec, X_tfidf_direct).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    search_time = time.time() - start_time
    return df.iloc[top_indices][['English', 'text']], similarities[top_indices], search_time

# Test search with a sample query
test_query = "Narada Maharishi"
results, scores, search_time = sklearn_search(test_query, top_n=5)
print(f"\nScikit-learn search time: {search_time:.4f} seconds")

#########################
# PART 2: MILVUS VECTOR DATABASE IMPLEMENTATION
#########################

print("\n--- MILVUS VECTOR DATABASE IMPLEMENTATION ---")

try:
    # Connect to Milvus (make sure Milvus is running)
    connections.connect("default", host="localhost", port="19530")
    
    # Define collection schema
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=X_tfidf_direct.shape[1])
    ]
    schema = CollectionSchema(fields, "TF-IDF vectors for text search")
    
    # Create or recreate collection
    collection_name = "tfidf_vectors"
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)
    
    collection = Collection(collection_name, schema)
    
    # Insert vectors into Milvus
    start_time = time.time()
    
    # Convert sparse matrix to dense for Milvus
    # Note: For large datasets, process in batches to avoid memory issues
    batch_size = 1000
    total_batches = (len(df) + batch_size - 1) // batch_size
    
    for i in range(total_batches):
        start_idx = i * batch_size
        end_idx = min((i+1) * batch_size, len(df))
        
        # Extract batch of vectors and convert to dense
        batch_vectors = X_tfidf_direct[start_idx:end_idx].toarray()
        
        # Prepare data for insertion
        entities = [
            # Document IDs
            list(range(start_idx, end_idx)),
            # TF-IDF vectors
            batch_vectors.tolist()
        ]
        
        # Insert into collection
        collection.insert(entities)
        
        if (i + 1) % 10 == 0 or (i + 1) == total_batches:
            print(f"Processed batch {i+1}/{total_batches}")
    
    # Create IVF_FLAT index (efficient for medium-sized datasets)
    index_params = {
        "metric_type": "IP",  # Inner product for cosine similarity
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128}  # Number of clusters
    }
    
    print("Building Milvus index...")
    collection.create_index("embedding", index_params)
    
    # Load collection into memory for search
    collection.load()
    
    milvus_build_time = time.time() - start_time
    print(f"Milvus index build time: {milvus_build_time:.2f} seconds")
    
    # Define Milvus search function
    def milvus_search(query_text, top_n=5):
        start_time = time.time()
        
        # Convert query to vector
        query_vec = vectorizer.transform([query_text]).toarray()
        
        # Search parameters
        search_params = {
            "metric_type": "IP",  # Inner product for cosine similarity
            "params": {"nprobe": 16}  # Number of clusters to search
        }
        
        # Perform search
        results = collection.search(
            data=query_vec.tolist(),
            anns_field="embedding",
            param=search_params,
            limit=top_n,
            output_fields=[]
        )
        
        # Extract results
        doc_ids = [hit.id for hit in results[0]]
        scores = [hit.score for hit in results[0]]
        
        search_time = time.time() - start_time
        return df.iloc[doc_ids][['English', 'text']], scores, search_time
    
    # Test Milvus search with the same query
    milvus_results, milvus_scores, milvus_search_time = milvus_search(test_query, top_n=5)
    print(f"Milvus search time: {milvus_search_time:.4f} seconds")
    
    # Comparison of both methods
    print("\n--- PERFORMANCE COMPARISON ---")
    print(f"{'Method':<20} {'Build Time (s)':<15} {'Search Time (s)':<15}")
    print(f"{'-'*50}")
    print(f"{'Scikit-learn TF-IDF':<20} {tfidf_time:<15.2f} {search_time:<15.6f}")
    print(f"{'Milvus Vector DB':<20} {milvus_build_time:<15.2f} {milvus_search_time:<15.6f}")
    print(f"{'Speedup Ratio':<20} {'N/A':<15} {search_time/milvus_search_time if milvus_search_time > 0 else 'N/A':<15.2f}")
    
    # Optional: Show detailed results comparison
    print("\n--- SEARCH RESULTS COMPARISON ---")
    print("\nScikit-learn top 3 results:")
    print(results[['English']].head(3))
    
    print("\nMilvus top 3 results:")
    print(milvus_results[['English']].head(3))
    
    # Close Milvus connection
    collection.release()
    connections.disconnect("default")

except Exception as e:
    print(f"\nError connecting to Milvus: {e}")
    print("To use Milvus, make sure it's properly installed and running. You can install it via:")
    print("  pip install pymilvus")
    print("And start Milvus using Docker with:")
    print("  docker run -d --name milvus -p 19530:19530 -p 19121:19121 milvusdb/milvus:latest")

#########################
# PART 3: SCALING CONSIDERATIONS
#########################

print("\n--- SCALING CONSIDERATIONS ---")
print("1. For datasets < 1M documents: TfidfVectorizer + Scikit-learn is sufficient")
print("2. For 1M-10M documents: Milvus with IVF_FLAT provides better search latency")
print("3. For >10M documents: Consider these optimizations:")
print("   - Use HashingVectorizer to reduce memory usage")
print("   - Process in batches of 10K-100K documents")
print("   - Use HNSW index in Milvus for better search performance")
print("   - Consider distributed deployment with Milvus shards")

# Batch processing example for very large datasets
def process_large_dataset(file_path, batch_size=10000):
    # Initialize vectorizer
    vectorizer = HashingVectorizer(
        n_features=2**20, 
        ngram_range=(1, 2), 
        alternate_sign=False
    )
    
    # Process in batches
    for chunk in pd.read_csv(file_path, chunksize=batch_size):
        # Process chunk
        X_chunk = vectorizer.transform(chunk['text_column'].fillna(''))
        
        # Here you would insert into Milvus or other storage
        # collection.insert([chunk.index.tolist(), X_chunk.toarray().tolist()])
        
    print(f"Processed dataset in batches of {batch_size}")

# Note: Uncomment to process a very large file in batches
# process_large_dataset("/path/to/large/file.csv", batch_size=50000)


Dataset size: 141356 documents

--- SCIKIT-LEARN IMPLEMENTATION ---
HashingVectorizer + TfidfTransformer time: 4.05 seconds
Sparse matrix shape: (141356, 262144)
Sparse matrix memory: 23.02 MB


2025-03-20 16:01:24,089 [ERROR][handler]: RPC error: [__internal_register], <MilvusException: (code=1, message=Incorrect port or sdk is incompatible with server, please check your port or downgrade your sdk or upgrade your server)>, <Time:{'RPC start': '2025-03-20 16:01:24.088293', 'RPC error': '2025-03-20 16:01:24.089565'}> (decorators.py:140)


TfidfVectorizer time: 9.37 seconds

Scikit-learn search time: 0.1321 seconds

--- MILVUS VECTOR DATABASE IMPLEMENTATION ---

Error connecting to Milvus: <MilvusException: (code=1, message=Incorrect port or sdk is incompatible with server, please check your port or downgrade your sdk or upgrade your server)>
To use Milvus, make sure it's properly installed and running. You can install it via:
  pip install pymilvus
And start Milvus using Docker with:
  docker run -d --name milvus -p 19530:19530 -p 19121:19121 milvusdb/milvus:latest

--- SCALING CONSIDERATIONS ---
1. For datasets < 1M documents: TfidfVectorizer + Scikit-learn is sufficient
2. For 1M-10M documents: Milvus with IVF_FLAT provides better search latency
3. For >10M documents: Consider these optimizations:
   - Use HashingVectorizer to reduce memory usage
   - Process in batches of 10K-100K documents
   - Use HNSW index in Milvus for better search performance
   - Consider distributed deployment with Milvus shards


In [1]:
# Libraries
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# GPU libraries
import cudf
import cupy as cp
from cuml.feature_extraction.text import HashingVectorizer as cuHashingVectorizer
from cuml.feature_extraction.text import TfidfTransformer as cuTfidfTransformer
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidfVectorizer

# Load data
data = "/home/SaiKashyap/ner/translation_data.csv"

# --------- CPU VERSION (ORIGINAL) ---------
print("=== CPU VERSION (scikit-learn) ===")
start_time = time.time()

# Load data
df = pd.read_csv(data)

# Fix the text column preparation
if df['English'].dtype == 'object' and isinstance(df['English'].iloc[0], str):
    df['text'] = df['English']
else:
    df['text'] = df['English'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Drop rows with NaN values
df = df.dropna(subset=['text'])
cpu_load_time = time.time() - start_time
print(f"CPU data loading time: {cpu_load_time:.4f} seconds")

# OPTION 1: HashingVectorizer + TfidfTransformer
start_time = time.time()
hasher = HashingVectorizer(
    n_features=2**18,
    ngram_range=(1, 2),
    alternate_sign=False,
    dtype=np.float32
)
X_counts = hasher.transform(df['text'])

tfidf_transformer = TfidfTransformer(
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True
)
X_tfidf = tfidf_transformer.fit_transform(X_counts)
cpu_hashing_time = time.time() - start_time
print(f"CPU HashingVectorizer + TfidfTransformer time: {cpu_hashing_time:.4f} seconds")
print(f"Sparse matrix shape: {X_tfidf.shape}")
print(f"Sparse matrix memory: {X_tfidf.data.nbytes / 1024 / 1024:.2f} MB")

# OPTION 2: One-step TfidfVectorizer
start_time = time.time()
vectorizer = TfidfVectorizer(
    max_features=50_000,
    sublinear_tf=True,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    dtype=np.float32
)
X_tfidf_direct = vectorizer.fit_transform(df['text'])
cpu_tfidf_time = time.time() - start_time
print(f"CPU TfidfVectorizer time: {cpu_tfidf_time:.4f} seconds")

# Test search time
def cpu_search(query_text, top_n=5):
    start_time = time.time()
    query_vec = vectorizer.transform([query_text])
    similarities = cosine_similarity(query_vec, X_tfidf_direct).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    search_time = time.time() - start_time
    return df.iloc[top_indices][['English', 'text']], similarities[top_indices], search_time

test_query = "machine learning algorithms"
cpu_results, cpu_scores, cpu_search_time = cpu_search(test_query, top_n=5)
print(f"CPU search time: {cpu_search_time:.6f} seconds")

# --------- GPU VERSION (cuDF/cuML) ---------
print("\n=== GPU VERSION (cuDF/cuML) ===")
start_time = time.time()

# Load data with cuDF
gpu_df = cudf.read_csv(data)

# Fix the text column preparation for GPU
if gpu_df['English'].dtype == 'object':
    gpu_df['text'] = gpu_df['English']
else:
    # Convert to pandas for complex operations, then back to cuDF
    temp_df = gpu_df.to_pandas()
    temp_df['text'] = temp_df['English'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
    gpu_df = cudf.DataFrame.from_pandas(temp_df)

# Drop rows with NaN values
gpu_df = gpu_df.dropna(subset=['text'])
gpu_load_time = time.time() - start_time
print(f"GPU data loading time: {gpu_load_time:.4f} seconds")

# OPTION 1: cuML HashingVectorizer + TfidfTransformer
start_time = time.time()
cu_hasher = cuHashingVectorizer(
    n_features=2**18,
    ngram_range=(1, 2),
    alternate_sign=False
)
X_cu_counts = cu_hasher.transform(gpu_df['text'])

cu_tfidf_transformer = cuTfidfTransformer(
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True
)
X_cu_tfidf = cu_tfidf_transformer.fit_transform(X_cu_counts)
gpu_hashing_time = time.time() - start_time
print(f"GPU HashingVectorizer + TfidfTransformer time: {gpu_hashing_time:.4f} seconds")
print(f"GPU matrix shape: {X_cu_tfidf.shape}")
print(f"GPU matrix memory: {X_cu_tfidf.nbytes / 1024 / 1024 if hasattr(X_cu_tfidf, 'nbytes') else 'N/A'} MB")

# OPTION 2: One-step cuML TfidfVectorizer
start_time = time.time()
cu_vectorizer = cuTfidfVectorizer(
    max_features=50_000,
    sublinear_tf=True,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)
X_cu_tfidf_direct = cu_vectorizer.fit_transform(gpu_df['text'])
gpu_tfidf_time = time.time() - start_time
print(f"GPU TfidfVectorizer time: {gpu_tfidf_time:.4f} seconds")

# Modified GPU Search Implementation with CuPy Compatibility


def gpu_search(query_text, top_n=5):
    start_time = time.time()
    
    try:
        # --------------------------
        # Step 1: Check CuPy Installation
        # --------------------------
        try:
            import cupy as cp
            from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix
        except ImportError:
            raise RuntimeError("CuPy not installed. Install with: conda install -c conda-forge cupy")

        # --------------------------
        # Step 2: Query Vectorization
        # --------------------------
        query_series = cudf.Series([query_text])
        query_vec = cu_vectorizer.transform(query_series).astype(cp.float32)
        
        # --------------------------
        # Step 3: Sparse Matrix Conversion
        # --------------------------
        # Convert to CSR format for cuSPARSE compatibility
        query_csr = query_vec.tocsr()
        corpus_csr = X_cu_tfidf_direct.tocsr()
        
        # Convert to CuPy CSR matrices
        query_gpu = cp_csr_matrix(query_csr)
        corpus_gpu = cp_csr_matrix(corpus_csr)
        
        # --------------------------
        # Step 4: Sparse Matrix Multiplication
        # --------------------------
        similarities = (query_gpu * corpus_gpu.T).todense().ravel()
        
        # --------------------------
        # Step 5: Results Processing
        # --------------------------
        top_indices = cp.argsort(-similarities)[:top_n].get()
        
        search_time = time.time() - start_time
        return (
            gpu_df.iloc[top_indices][['English', 'text']].to_pandas(),
            similarities[top_indices].get(),
            search_time
        )
    
    except Exception as e:
        print(f"GPU search error: {str(e)}")
        print("Falling back to CPU implementation")
        return cpu_search(query_text, top_n)

    
from cuml.neighbors import NearestNeighbors
from cuml.preprocessing import normalize

def gpu_search(query_text, top_n=5):
    start_time = time.time()
    
    try:
        # Normalize vectors during preprocessing
        X_normalized = normalize(X_cu_tfidf_direct, norm='l2')
        
        # Create GPU-optimized index
        nn_model = NearestNeighbors(n_neighbors=top_n, metric='cosine')
        nn_model.fit(X_normalized)
        
        # Vectorize query
        query_vec = normalize(cu_vectorizer.transform(cudf.Series([query_text])), norm='l2')
        
        # GPU-accelerated similarity search
        distances, indices = nn_model.kneighbors(query_vec)
        
        search_time = time.time() - start_time
        return (
            gpu_df.iloc[indices[0].get()][['English', 'text']].to_pandas(),
            1 - distances[0].get(),  # Convert cosine distance to similarity
            search_time
        )
    
    except Exception as e:
        print(f"GPU search error: {str(e)}")
        return cpu_search(query_text, top_n)

    
# Optimized CPU Fallback Implementation
def cpu_search(query_text, top_n=5):
    start_time = time.time()
    
    # Use sparse matrix operations
    query_vec = vectorizer.transform([query_text])
    similarities = (query_vec * X_tfidf_direct.T).toarray().flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    search_time = time.time() - start_time
    return (
        df.iloc[top_indices][['English', 'text']],
        similarities[top_indices],
        search_time
    )

# Add this before the performance comparison section
# Execute GPU search
gpu_results, gpu_scores, gpu_search_time = gpu_search(test_query, top_n=5)
print(f"\nGPU search time: {gpu_search_time:.6f} seconds")

# Then keep the existing performance comparison block
print("\n=== PERFORMANCE COMPARISON ===")
print(f"{'Operation':<25} {'CPU Time (s)':<15} {'GPU Time (s)':<15} {'Speedup':<10}")
print(f"{'-'*65}")
print(f"{'Data Loading':<25} {cpu_load_time:<15.4f} {gpu_load_time:<15.4f} {cpu_load_time/gpu_load_time if gpu_load_time > 0 else 'N/A':<10.2f}x")
print(f"{'Hashing + TF-IDF':<25} {cpu_hashing_time:<15.4f} {gpu_hashing_time:<15.4f} {cpu_hashing_time/gpu_hashing_time if gpu_hashing_time > 0 else 'N/A':<10.2f}x")
print(f"{'TfidfVectorizer':<25} {cpu_tfidf_time:<15.4f} {gpu_tfidf_time:<15.4f} {cpu_tfidf_time/gpu_tfidf_time if gpu_tfidf_time > 0 else 'N/A':<10.2f}x")
print(f"{'Search':<25} {cpu_search_time:<15.6f} {gpu_search_time:<15.6f} {cpu_search_time/gpu_search_time if gpu_search_time > 0 else 'N/A':<10.2f}x")


--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy, cupy-cuda11x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



=== CPU VERSION (scikit-learn) ===
CPU data loading time: 0.7754 seconds
CPU HashingVectorizer + TfidfTransformer time: 3.3145 seconds
Sparse matrix shape: (141356, 262144)
Sparse matrix memory: 23.02 MB
CPU TfidfVectorizer time: 7.2097 seconds
CPU search time: 0.110851 seconds

=== GPU VERSION (cuDF/cuML) ===
GPU data loading time: 4.4059 seconds
GPU HashingVectorizer + TfidfTransformer time: 0.5421 seconds
GPU matrix shape: (141356, 262144)
GPU matrix memory: N/A MB
GPU TfidfVectorizer time: 0.2742 seconds

GPU search time: 0.315697 seconds

=== PERFORMANCE COMPARISON ===
Operation                 CPU Time (s)    GPU Time (s)    Speedup   
-----------------------------------------------------------------
Data Loading              0.7754          4.4059          0.18      x
Hashing + TF-IDF          3.3145          0.5421          6.11      x
TfidfVectorizer           7.2097          0.2742          26.29     x
Search                    0.110851        0.315697        0.35      x




In [2]:
pip install pymilvus==2.0.2


Collecting pymilvus==2.0.2
  Downloading pymilvus-2.0.2-py3-none-any.whl.metadata (3.9 kB)
Collecting grpcio==1.37.1 (from pymilvus==2.0.2)
  Downloading grpcio-1.37.1.tar.gz (21.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting grpcio-tools==1.37.1 (from pymilvus==2.0.2)
  Downloading grpcio-tools-1.37.1.tar.gz (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ujson<=5.1.0,>=2.0.0 (from pymilvus==2.0.2)
  Downloading ujson-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting mmh3<=3.0.0,>=2.0 (from pymilvus==2.0.2)
  Downloading mmh3-3.0.0.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading pymilvu