## Importing Dependencies

In [1]:
import pandas as pd
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from haystack.pipelines import DocumentSearchPipeline
from haystack import Document
from haystack.utils import print_documents

  from .autonotebook import tqdm as notebook_tqdm


## Loading Dataset

In [2]:
# Read the website content sheet into a DataFrame. The path is relative to the
# notebook's working directory.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a saved index
# column — confirm whether it is needed.
website_df = pd.read_csv(
    filepath_or_buffer='../data/plaksha website - Sheet2m.csv',
)
# Preview the first rows to check the columns loaded as expected.
website_df.head()

Unnamed: 0,Crisp,Detailed
0,"Plaksha University, founded in 2019, emerged a...",Plaksha University is the culmination of a vis...
1,Plaksha University's framework rests upon thre...,Plaksha University's mission is underpinned by...
2,Plaksha University's founders represent a dive...,The driving force behind Plaksha University co...
3,"Back in 2017, Plaksha University formed an Aca...","In 2017, Plaksha University took a significant..."
4,Plaksha University has forged partnerships wit...,Plaksha University's commitment to fostering t...


## Creating an In-Memory Document Store

In [None]:
# In-memory store that will hold the documents for retrieval.
# BM25 is explicitly disabled; this notebook retrieves with a TF-IDF retriever.
# NOTE(review): use_gpu=True presumably only matters for embedding similarity,
# which is not used here — confirm whether it is needed.
document_store_instore = InMemoryDocumentStore(
    use_bm25=False,
    use_gpu=True,
)

### Casting data into Document objects

The structure of the `Document` class is:

```python
class Document:
    content: Union[str, pd.DataFrame]
    content_type: Literal["text", "table", "image"]
    id: str
    meta: Dict[str, Any]
    score: Optional[float] = None
    embedding: Optional[np.ndarray] = None
    id_hash_keys: Optional[List[str]] = None
```

In [None]:
# Wrap every "Crisp" summary in a haystack Document so it can be written to the
# document store.
# Empty CSV cells are read by pandas as float NaN, which is not text content, so
# they are dropped before wrapping instead of being turned into documents.
document_list = [
    Document(content=crisp_text, content_type='text')
    for crisp_text in website_df["Crisp"].dropna()
]

In [None]:
# Index the prepared Document objects in the in-memory store.
document_store_instore.write_documents(documents=document_list)

In [None]:
# Sanity check: display the text content of the first Document.
document_list[0].content

In [None]:
# Display the embedding of the first Document. No embedding was passed when the
# Document was created, so this is presumably None — TODO confirm.
document_list[0].embedding

## Initializing the Retriever (TF-IDF)

TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:

- Documents that have more lexical overlap with the query are more likely to be relevant.
- Words that occur in fewer documents are more significant than words that occur in many documents.


In [None]:
# TF-IDF retriever over the in-memory store; returns the 3 best matches by default.
retriever_tfidf = TfidfRetriever(
    document_store=document_store_instore,
    top_k=3,
)

In [None]:
# Wrap the retriever in a document-search pipeline so it can be queried with run().
search_pipeline = DocumentSearchPipeline(retriever=retriever_tfidf)

In [None]:
# Run a sample query through the pipeline.
# The per-node params ask the "Retriever" node for its top 3 documents.
result = search_pipeline.run(
    params={"Retriever": {"top_k": 3}},
    query="btech degrees fee",
)

In [None]:
# Pretty-print the documents returned by the search pipeline.
print_documents(results=result)