package com.redis.vl.notebooks;

import static org.assertj.core.api.Assertions.*;

import com.redis.vl.BaseIntegrationTest;
import com.redis.vl.index.SearchIndex;
import com.redis.vl.query.VectorQuery;
import com.redis.vl.schema.IndexSchema;
import com.redis.vl.schema.VectorField;
import com.redis.vl.utils.vectorize.BaseVectorizer;
import com.redis.vl.utils.vectorize.LangChain4JVectorizer;
import com.redis.vl.utils.vectorize.SentenceTransformersVectorizer;
import dev.langchain4j.model.cohere.CohereEmbeddingModel;
import dev.langchain4j.model.openai.OpenAiEmbeddingModel;
import dev.langchain4j.model.voyageai.VoyageAiEmbeddingModel;
import java.util.*;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import redis.clients.jedis.search.schemafields.VectorField.VectorAlgorithm;

/**
 * Integration test reproducing the 04_vectorizers.ipynb notebook.
 *
 * <p>Ported from:
 * /Users/brian.sam-bodden/Code/redis/py/redis-vl-python/docs/user_guide/04_vectorizers.ipynb
 *
 * <p>Uses same models and data as Python version:
 *
 * <ul>
 *   <li>Test sentences: "That is a happy dog", "That is a happy person", "Today is a sunny day"
 *   <li>OpenAI: text-embedding-ada-002
 *   <li>HuggingFace: sentence-transformers/all-mpnet-base-v2
 *   <li>Cohere: embed-english-v3.0
 *   <li>VoyageAI: voyage-law-2
 * </ul>
 */
@Tag("integration")
public class VectorizersNotebookIntegrationTest extends BaseIntegrationTest {

  // Same test sentences as Python notebook
  private List<String> sentences;

  @BeforeEach
  public void setUp() {
    sentences =
        Arrays.asList("That is a happy dog", "That is a happy person", "Today is a sunny day");
  }

  @Test
  public void testOpenAIVectorizer() {
    String apiKey = System.getenv("OPENAI_API_KEY");
    if (apiKey == null) {
      System.out.println("Skipping OpenAI test - OPENAI_API_KEY not set");
      return;
    }

    // Create a vectorizer using OpenAI's text-embedding-ada-002 model (same as Python)
    var openaiModel =
        OpenAiEmbeddingModel.builder()
            .apiKey(apiKey)
            .modelName("text-embedding-ada-002")
            .build();
    var oai = new LangChain4JVectorizer("text-embedding-ada-002", openaiModel);

    // Embed a single sentence
    float[] test = oai.embed("This is a test sentence.");
    System.out.println("OpenAI Vector dimensions: " + test.length);
    assertThat(test.length).isEqualTo(1536); // text-embedding-ada-002 produces 1536 dims

    // Print first 10 dimensions (like Python notebook)
    System.out.println("First 10 dimensions: " + Arrays.toString(Arrays.copyOfRange(test, 0, 10)));

    // Create many embeddings at once
    List<float[]> embeddings = oai.embedBatch(sentences);
    assertThat(embeddings).hasSize(3);
    System.out.println("Number of embeddings: " + embeddings.size());
    System.out.println(
        "First embedding (first 10): "
            + Arrays.toString(Arrays.copyOfRange(embeddings.get(0), 0, 10)));
  }

  @Test
  public void testHuggingFaceVectorizer() {
    // Create a vectorizer using HuggingFace Sentence Transformers (same as Python)
    var hf = new SentenceTransformersVectorizer("sentence-transformers/all-mpnet-base-v2");

    // Embed a sentence
    float[] hfTest = hf.embed("This is a test sentence.");
    System.out.println("HF Vector dimensions: " + hfTest.length);
    assertThat(hfTest.length).isEqualTo(768); // all-mpnet-base-v2 produces 768 dims
    System.out.println(
        "First 10 dimensions: " + Arrays.toString(Arrays.copyOfRange(hfTest, 0, 10)));

    // Create many embeddings at once
    List<float[]> hfEmbeddings = hf.embedBatch(sentences);
    assertThat(hfEmbeddings).hasSize(3);
    System.out.println("Created " + hfEmbeddings.size() + " embeddings");
  }

  @Test
  public void testCohereVectorizer() {
    String apiKey = System.getenv("COHERE_API_KEY");
    if (apiKey == null) {
      System.out.println("Skipping Cohere test - COHERE_API_KEY not set");
      return;
    }

    // Create a vectorizer using Cohere (same model as Python)
    var cohereModel =
        CohereEmbeddingModel.builder().apiKey(apiKey).modelName("embed-english-v3.0").build();
    var co = new LangChain4JVectorizer("embed-english-v3.0", cohereModel);

    // Embed a search query
    float[] queryEmbed = co.embed("This is a test sentence.");
    System.out.println("Cohere Query vector dimensions: " + queryEmbed.length);
    assertThat(queryEmbed.length).isEqualTo(1024); // embed-english-v3.0 produces 1024 dims
    System.out.println(
        "First 10 dimensions: " + Arrays.toString(Arrays.copyOfRange(queryEmbed, 0, 10)));

    // Note: LangChain4j Cohere doesn't expose input_type in the same way Python does
    // The model handles query vs document distinction internally
  }

  @Test
  public void testVoyageAIVectorizer() {
    String apiKey = System.getenv("VOYAGE_API_KEY");
    if (apiKey == null) {
      System.out.println("Skipping VoyageAI test - VOYAGE_API_KEY not set");
      return;
    }

    // Create a vectorizer using VoyageAI (same model as Python)
    var voyageModel =
        VoyageAiEmbeddingModel.builder().apiKey(apiKey).modelName("voyage-law-2").build();
    var vo = new LangChain4JVectorizer("voyage-law-2", voyageModel);

    // Embed a search query
    float[] voyageQuery = vo.embed("This is a test sentence.");
    System.out.println("VoyageAI vector dimensions: " + voyageQuery.length);
    assertThat(voyageQuery.length).isEqualTo(1024); // voyage-law-2 produces 1024 dims
    System.out.println(
        "First 10 dimensions: " + Arrays.toString(Arrays.copyOfRange(voyageQuery, 0, 10)));
  }

  @Test
  public void testCustomVectorizer() {
    // Create a simple custom vectorizer (same as Python notebook)
    class CustomVectorizer extends BaseVectorizer {
      public CustomVectorizer() {
        super("custom-model", 768, "float32");
      }

      @Override
      protected float[] generateEmbedding(String text) {
        float[] embedding = new float[768];
        Arrays.fill(embedding, 0.101f);
        return embedding;
      }

      @Override
      protected List<float[]> generateEmbeddingsBatch(List<String> texts, int batchSize) {
        return texts.stream().map(this::generateEmbedding).toList();
      }
    }

    var customVectorizer = new CustomVectorizer();
    float[] customEmbed = customVectorizer.embed("This is a test sentence.");
    assertThat(customEmbed.length).isEqualTo(768);
    assertThat(customEmbed[0]).isEqualTo(0.101f);
    System.out.println(
        "Custom vectorizer: " + Arrays.toString(Arrays.copyOfRange(customEmbed, 0, 10)));
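    // Also exercise the batch path. Hedged check: this assumes BaseVectorizer#embedBatch
    // delegates to the overridden generateEmbeddingsBatch, mirroring how embed() delegates to
    // generateEmbedding() above (embedBatch is the same call used by the provider tests).
    List<float[]> customBatch = customVectorizer.embedBatch(sentences);
    assertThat(customBatch).hasSize(3);
    assertThat(customBatch.get(0).length).isEqualTo(768);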
  }

  @Test
  public void testSearchWithProviderEmbeddings() {
    // Use HuggingFace vectorizer (same as Python notebook)
    var hf = new SentenceTransformersVectorizer("sentence-transformers/all-mpnet-base-v2");

    // Create the schema (same as Python notebook YAML)
    var schema =
        IndexSchema.builder()
            .name("vectorizers")
            .prefix("doc")
            .storageType(IndexSchema.StorageType.HASH)
            .addTextField("sentence", textField -> {})
            .addVectorField(
                "embedding",
                768,
                vectorField ->
                    vectorField
                        .algorithm(VectorAlgorithm.FLAT)
                        .distanceMetric(VectorField.DistanceMetric.COSINE)
                        .dataType(VectorField.VectorDataType.FLOAT32))
            .build();

    // Create the index
    var index = new SearchIndex(schema, unifiedJedis);
    index.create(true); // overwrite if exists
    System.out.println("Index created: " + index.getName());

    try {
      // Create embeddings for our sentences (same sentences as Python)
      List<float[]> sentenceEmbeddings = hf.embedBatch(sentences);

      // Prepare data for loading
      List<Map<String, Object>> data = new ArrayList<>();
      for (int i = 0; i < sentences.size(); i++) {
        Map<String, Object> doc = new HashMap<>();
        doc.put("sentence", sentences.get(i));
        doc.put("embedding", sentenceEmbeddings.get(i));
        data.add(doc);
      }

      // Load data into the index
      index.load(data);
      System.out.println("Loaded " + data.size() + " documents");

      // Use the HuggingFace vectorizer to create a query embedding
      // Query: "That is a happy cat" (same as Python notebook)
      float[] queryEmbedding = hf.embed("That is a happy cat");

      // Create and execute a vector query
      var query =
          VectorQuery.builder()
              .vector(queryEmbedding)
              .field("embedding")
              .returnFields(List.of("sentence"))
              .numResults(3)
              .build();

      List<Map<String, Object>> results = index.query(query);
      assertThat(results).hasSize(3);

      System.out.println("\nSearch results for: 'That is a happy cat'");
      for (var doc : results) {
        System.out.println(doc.get("sentence") + " - Distance: " + doc.get("vector_distance"));
      }

      // Verify first result is about a happy dog (most similar to happy cat)
      String firstResult = (String) results.get(0).get("sentence");
      assertThat(firstResult).isEqualTo("That is a happy dog");

    } finally {
      // Cleanup
      index.delete(true);
      System.out.println("Index deleted");
    }
  }

  @Test
  public void testDataTypeSelection() {
    // Test different data types (same as Python notebook)

    // Create vectorizer with default FLOAT32
    var vectorizer32 =
        new SentenceTransformersVectorizer("sentence-transformers/all-mpnet-base-v2");

    float[] float32Embed = vectorizer32.embed("test sentence");
    assertThat(float32Embed.length).isEqualTo(768);

    // Note: Python supports float16 and float64, but Java ONNX runtime may have limitations
    // For now, we verify that FLOAT32 works correctly
    System.out.println("FLOAT32 embedding created successfully");
  }
}