# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# GPU Libraries
import cudf
import cupy as cp
from cuml.feature_extraction.text import HashingVectorizer as cuHashingVectorizer
from cuml.feature_extraction.text import TfidfTransformer as cuTfidfTransformer
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidfVectorizer


--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy, cupy-cuda11x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



# Loading the Dataset

In [2]:
data = "/home/SaiKashyap/ner/translation_data.csv"
df = pd.read_csv(data)
gpu_df = cudf.read_csv(data)
print("=== CPU VERSION (scikit-learn) ===")
start_time = time.time()

=== CPU VERSION (scikit-learn) ===


# Preprocessing Text Data

In [3]:
if df['English'].dtype == 'object' and isinstance(df['English'].iloc[0], str):
    df['text'] = df['English']
else:
    df['text'] = df['English'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

df = df.dropna(subset=['text'])


In [4]:
gpu_df['text'] = gpu_df['English'].astype(str)
gpu_df = gpu_df.dropna(subset=['text'])

In [5]:
cpu_load_time = time.time() - start_time
print(f"CPU data loading time: {cpu_load_time:.4f} seconds")

CPU data loading time: 0.1783 seconds


# CPU Version
HashingVectorizer: Converts text into high-dimensional vectors (bag-of-words)

TfidfTransformer: Converts these counts into TF-IDF scores.

In [6]:
hasher = HashingVectorizer(n_features=2**18, ngram_range=(1, 2), alternate_sign=False, dtype=np.float32)
X_counts = hasher.transform(df['text'])

tfidf_transformer = TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)
X_tfidf = tfidf_transformer.fit_transform(X_counts)


In [7]:
vectorizer = TfidfVectorizer(max_features=50_000, sublinear_tf=True, ngram_range=(1, 2), min_df=2, max_df=0.95, dtype=np.float32)
X_tfidf_direct = vectorizer.fit_transform(df['text'])


# GPU Version


In [8]:
cu_hasher = cuHashingVectorizer(n_features=2**18, ngram_range=(1, 2), alternate_sign=False)
X_cu_counts = cu_hasher.transform(gpu_df['text'])

cu_tfidf_transformer = cuTfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)
X_cu_tfidf = cu_tfidf_transformer.fit_transform(X_cu_counts)


In [10]:
cu_vectorizer = cuTfidfVectorizer(max_features=50_000, sublinear_tf=True, ngram_range=(1, 2), min_df=2, max_df=0.95)
X_cu_tfidf_direct = cu_vectorizer.fit_transform(gpu_df['text'])


MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/SaiKashyap/.conda/envs/llama_factory/include/rmm/mr/device/cuda_memory_resource.hpp

# CPU Search

In [11]:
def cpu_search(query_text, top_n=5):
    query_vec = vectorizer.transform([query_text])
    similarities = cosine_similarity(query_vec, X_tfidf_direct).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices][['English', 'text']], similarities[top_indices]


# GPU Search (CuPy + cuML)

In [12]:
def gpu_search(query_text, top_n=5):
    query_series = cudf.Series([query_text])
    query_vec = cu_vectorizer.transform(query_series).astype(cp.float32)
    
    from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix
    query_csr = query_vec.tocsr()
    corpus_csr = X_cu_tfidf_direct.tocsr()
    
    query_gpu = cp_csr_matrix(query_csr)
    corpus_gpu = cp_csr_matrix(corpus_csr)

    similarities = (query_gpu * corpus_gpu.T).todense().ravel()
    top_indices = cp.argsort(-similarities)[:top_n].get()

    return gpu_df.iloc[top_indices][['English', 'text']].to_pandas(), similarities[top_indices].get()


In [13]:
from cuml.neighbors import NearestNeighbors
from cuml.preprocessing import normalize

def gpu_search(query_text, top_n=5):
    X_normalized = normalize(X_cu_tfidf_direct, norm='l2')
    
    nn_model = NearestNeighbors(n_neighbors=top_n, metric='cosine')
    nn_model.fit(X_normalized)
    
    query_vec = normalize(cu_vectorizer.transform(cudf.Series([query_text])), norm='l2')
    
    distances, indices = nn_model.kneighbors(query_vec)
    
    return (
        gpu_df.iloc[indices[0].get()][['English', 'text']].to_pandas(),
        1 - distances[0].get(),  # Convert cosine distance to similarity
    )


# Performance Comparison

In [14]:
print("\n=== PERFORMANCE COMPARISON ===")
print(f"{'Operation':<25} {'CPU Time (s)':<15} {'GPU Time (s)':<15} {'Speedup':<10}")
print(f"{'-'*65}")
print(f"{'Data Loading':<25} {cpu_load_time:<15.4f} {gpu_load_time:<15.4f} {cpu_load_time/gpu_load_time:.2f}x")
print(f"{'TfidfVectorizer':<25} {cpu_tfidf_time:<15.4f} {gpu_tfidf_time:<15.4f} {cpu_tfidf_time/gpu_tfidf_time:.2f}x")
print(f"{'Search':<25} {cpu_search_time:<15.6f} {gpu_search_time:<15.6f} {cpu_search_time/gpu_search_time:.2f}x")



=== PERFORMANCE COMPARISON ===
Operation                 CPU Time (s)    GPU Time (s)    Speedup   
-----------------------------------------------------------------


NameError: name 'gpu_load_time' is not defined