In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/embeddingstrials/embeddings_output (2).csv


In [9]:
import faiss
import numpy as np
import pandas as pd

def create_optimized_index(embeddings, embedding_dimension=768, n_clusters=200, seed=None):
    """Build, train and populate an inner-product FAISS IVF-Flat index.

    The vectors are copied, cast to float32 and L2-normalised, so inner-product
    search on the returned index ranks by cosine similarity. The caller's array
    is never modified.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, embedding_dimension).
        embedding_dimension: Expected width of each vector.
        n_clusters: Currently ignored. The number of IVF lists is always derived
            from the dataset size (see below); the parameter is kept so existing
            calls remain valid.
        seed: Optional integer seed for the training subsample. ``None`` (the
            default) draws from NumPy's global random state, as before.

    Returns:
        A trained FAISS index (moved to GPU 0 when FAISS reports a GPU) that
        already contains every normalised vector and has ``nprobe`` set.

    Raises:
        ValueError: If ``embeddings`` is empty, is not 2-D, or its width does
            not match ``embedding_dimension``.
    """
    # np.array always copies, so the in-place normalisation below cannot leak
    # into the caller's data; order='C' gives FAISS the contiguous layout it needs.
    embeddings = np.array(embeddings, dtype='float32', order='C')
    if embeddings.ndim != 2 or len(embeddings) == 0:
        raise ValueError("embeddings must be a non-empty 2-D array of shape (n_vectors, dim)")
    if embeddings.shape[1] != embedding_dimension:
        raise ValueError(
            f"embeddings have dimension {embeddings.shape[1]}, "
            f"expected {embedding_dimension}"
        )
    faiss.normalize_L2(embeddings)

    n_vectors = len(embeddings)
    # Heuristic: ~6*sqrt(N) lists, capped at 4096. Additionally never ask for
    # more lists than vectors: k-means training rejects k > number of points,
    # which the bare heuristic triggers for N < 36.
    n_lists = max(1, min(int(np.sqrt(n_vectors) * 6), 4096, n_vectors))

    # The index is always built on the CPU first; it is only moved afterwards.
    quantizer = faiss.IndexFlatIP(embedding_dimension)
    index = faiss.IndexIVFFlat(quantizer, embedding_dimension, n_lists, faiss.METRIC_INNER_PRODUCT)
    if faiss.get_num_gpus() > 0:
        print("Using GPU for FAISS operations")
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    else:
        print("Using CPU for FAISS operations")

    # Number of lists visited per query; cannot exceed the number of lists.
    index.nprobe = min(200, n_lists)

    # Train on a random subset of at most 150k vectors, then add everything.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample = rng.choice(n_vectors, min(150000, n_vectors), replace=False)
    index.train(embeddings[sample])
    index.add(embeddings)
    return index

def find_similar_trials(target_nct, embeddings_df, k=100, index=None):
    """Return up to ``k`` trials most similar to ``target_nct``.

    Args:
        target_nct: Value to look up in the ``'NCT Number'`` column.
        embeddings_df: DataFrame with an ``'NCT Number'`` column; the embedding
            components are read positionally from columns 1 through 768.
        k: Maximum number of neighbours to return.
        index: Optional prebuilt index exposing ``search(queries, k)``. It must
            have been built from ``embeddings_df`` in the same row order. When
            omitted, a new index is built (and trained) on every call.

    Returns:
        DataFrame with columns ``NCT_Number`` and ``Similarity``, in the order
        returned by the index search, excluding the target itself. It may hold
        fewer than ``k`` rows if the index returns fewer hits.

    Raises:
        ValueError: If ``target_nct`` does not occur in ``embeddings_df``.
    """
    # Locate target NCT
    target_row = embeddings_df[embeddings_df['NCT Number'] == target_nct]
    if target_row.empty:
        raise ValueError(f"NCT number {target_nct} not found in dataset")

    nct_ids = embeddings_df['NCT Number'].values

    if index is None:
        # FAISS needs a C-contiguous float32 matrix; a frame slice may not be.
        embeddings = np.ascontiguousarray(
            embeddings_df.iloc[:, 1:769].to_numpy(dtype='float32')
        )
        # Pass the actual width so frames with other embedding sizes work too.
        # Normalisation happens inside create_optimized_index.
        index = create_optimized_index(embeddings, embedding_dimension=embeddings.shape[1])

    # Query with the first matching row only; duplicates of the target would
    # just repeat the same search.
    target_embedding = np.ascontiguousarray(
        target_row.iloc[[0], 1:769].to_numpy(dtype='float32')
    )
    faiss.normalize_L2(target_embedding)

    # Ask for extra hits to make up for the target's own row(s), which are
    # filtered out below.
    similarities, indices = index.search(target_embedding, k + len(target_row))

    # FAISS pads missing results with label -1; indexing nct_ids with -1 would
    # silently report the last trial, so negative labels are skipped.
    results = [
        {'NCT_Number': nct_ids[idx], 'Similarity': score}
        for idx, score in zip(indices[0], similarities[0])
        if idx >= 0 and nct_ids[idx] != target_nct
    ]

    # Explicit columns keep the schema stable even when there are no hits.
    return pd.DataFrame(results[:k], columns=['NCT_Number', 'Similarity'])

# Example usage
if __name__ == "__main__":
    # Load the precomputed trial embeddings from the Kaggle input dataset.
    embeddings_df = pd.read_csv('/kaggle/input/embeddingstrials/embeddings_output (2).csv')

    # Look up the 100 nearest neighbours of the reference trial.
    similar_trials_df = find_similar_trials("NCT03518073", embeddings_df, k=100)

    # Write the ranking to the working directory, without the row index.
    similar_trials_df.to_csv("top_100_similar_trials3.csv", index=False)
    print("Results saved successfully")

Using GPU for FAISS operations
Results saved successfully
