In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import pandas as pd
import re
from langchain.schema import Document
import numpy as np
import ast

from dotenv import load_dotenv
load_dotenv()


True

In [49]:
df = pd.read_csv('articles_with_text_0214.csv')
df["author_list"] = df["author_list"].apply(ast.literal_eval)



In [51]:
df['month'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['month'].fillna(0, inplace=True)


In [53]:
df['month'] = df['month'].astype(int)

In [None]:
def validate_month(value):
    try:
        value = int(value)  # Attempt to convert to integer
        return value if 1 <= value <= 12 else np.nan  # Check range
    except ValueError:
        return np.nan  # Return NaN for non-numeric values

# Apply the function to the column
df['month'] = df['month'].apply(validate_month)
df['month'] = df['month'].fillna(0)
df['month'] = df['month'].astype(int)

In [55]:
metadata_columns = ['author', 'year', 'month', 'title', 'journal', 'doi', 'id', 'url', 'abstract', 'keywords','pages', 'volume', 'chapter', 'issn']

In [56]:


documents = []
for _, row in df.iterrows():
    # Extract metadata as a dictionary
    metadata = {col: row[col] for col in metadata_columns}
    # Create a Document object
    doc = Document(page_content=row['text'], metadata=metadata)
    documents.append(doc)
    dd

In [57]:
chunk_size = 1024
chunk_overlap = 128

In [58]:

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""]
)

In [59]:

splits = r_splitter.split_documents(documents)

In [2]:
from langchain_google_vertexai import VertexAIEmbeddings
embeddings = VertexAIEmbeddings(model="text-embedding-005")

In [3]:
from langchain_milvus import Milvus
URI = "./milvus_vector_database_0215.db"
embeddings = VertexAIEmbeddings(model="text-embedding-005")

vector_store = Milvus(embeddings, connection_args={"uri": URI})

In [6]:

results = vector_store.similarity_search(
    "contributions to technology entrepreneurship",
    k=5,
    expr="((( year >= 2013 ) and ( year <= 2017 )) and ( author LIKE '%Stoyan%' ))"
)
results

[Document(metadata={'abstract': 'This article summarizes the insights from a systematic study of the research literature focusing on the innovation aspects of born-global firms – ventures that were launched to exploit a global niche from the earliest days of their operations. The authors provide a snapshot of opinions on the different aspects of innovation in the way they were conceptualized in the academic literature. The insights are based on a selection of 32 peer-reviewed journal articles addressing the different challenges associated with early internationalization and innovation in such ventures. The article emphasizes that the early internationalization of new ventures should be considered as an innovation process in itself and that innovation and internationalization have a positive effect on each other. In addition, it points out the role of knowledge acquisition and networking capabilities as key innovation enablers and refers to the emergence of the lean startup perspective 

In [7]:
for result in results:
    print(f"title: {result.metadata['title']}")
    print(f"author: {result.metadata['author']}")
    print(f"year: {result.metadata['year']}")

title: Conceptualizing Innovation in Born-Global Firms
author: Erik Zijdemans and Stoyan Tanev
year: 2014
title: The Emergence of the Lean Global Startup as a New Type of Firm
author: Erik Stavnsager Rasmussen and Stoyan Tanev
year: 2015
title: The Emergence of the Lean Global Startup as a New Type of Firm
author: Erik Stavnsager Rasmussen and Stoyan Tanev
year: 2015
title: The Emergence of the Lean Global Startup as a New Type of Firm
author: Erik Stavnsager Rasmussen and Stoyan Tanev
year: 2015
title: Is There a Lean Future for Global Startups?
author: Stoyan Tanev
year: 2017


In [135]:
from langchain.chains.query_constructor.ir import (
    Comparator,
    Comparison,
    Operation,
    Operator,
)
from langchain.retrievers.self_query.milvus import MilvusTranslator

# Step 1: Define the Comparisons
year_comparison_gte = Comparison(
    comparator=Comparator.GTE,
    attribute="year",
    value=2013
)

year_comparison_lte = Comparison(
    comparator=Comparator.LTE,
    attribute="year",
    value=2017
)

author_comparison = Comparison(
    comparator=Comparator.LIKE,
    attribute="author",
    value="%Mika Westerlund%"
)

year_comparison_qe = Comparison(
    comparator=Comparator.EQ,
    attribute="year",
    value=2019
)

abstract_comparison = Comparison(
    comparator=Comparator.IN,
    attribute="abstract",
    value="this"
)

# Step 2: Combine Comparisons with Logical Operators
year_range_operation = Operation(
    operator=Operator.AND,
    arguments=[year_comparison_gte, year_comparison_lte]
)

combined_operation = Operation(
    operator=Operator.AND,
    arguments=[year_range_operation, author_comparison]
)

# Step 3: Translate to Chroma Filter Format
translator_milvus = MilvusTranslator()
filter = translator_milvus.visit_operation(combined_operation)
#filter = translator_milvus.visit_comparison(author_comparison_like)

print(filter)  

((( year >= 2013 ) and ( year <= 2017 )) and ( author like "%Mika Westerlund%%" ))


In [136]:
results = vector_store.similarity_search(
    "What is innovative entrepreneurship?",
    k=5,
    expr=filter
)
results

[Document(metadata={'abstract': 'This article investigates how entrepreneurial marketing can encourage resellers to adopt smart micro-grid technology. An online survey based on the literature on user adoption and entrepreneurial marketing was used to gather data from 99 power systems resellers. The data were analyzed using the partial least squares method to validate a model of the relationships between reseller\\textquoterights antecedents and intention to adopt smart micro-grid technology, and the role of vendor\\textquoterights entrepreneurial marketing in the adoption. The results suggest that user adoption models can only partially be applied to the reseller context, and future research should develop models that can further explain reseller\\textquoterights decision making with regards to becoming involved in an emerging technology. As to the implications for practice, vendors need to demonstrate proactive entrepreneurial marketing, particularly entrepreneurial orientation, to in

In [137]:
for result in results:
    print(f"year: {result.metadata['year']}")
    print(f"author: {result.metadata['author']}")
    print('=====================')

year: 2015
author: Hamidreza Kavandi and Mika Westerlund
year: 2015
author: Hamidreza Kavandi and Mika Westerlund
year: 2015
author: Hamidreza Kavandi and Mika Westerlund
year: 2015
author: Hamidreza Kavandi and Mika Westerlund
year: 2015
author: Hamidreza Kavandi and Mika Westerlund
