## Read Data

In [1]:
import pandas as pd
data = pd.read_csv('Mental_Health_FAQ.csv')

In [2]:
data

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."
...,...,...,...
93,4373204,How do I know if I'm drinking too much?,Sorting out if you are drinking too much can b...
94,7807643,"If cannabis is dangerous, why are we legalizin...","Cannabis smoke, for example, contains cancer-c..."
95,4352464,How can I convince my kids not to use drugs?,You can't. But you can influence their capacit...
96,6521784,What is the legal status (and evidence) of CBD...,Cannabidiol or CBD is a naturally occurring co...


## Generate Embeddings

In [3]:
!pip install -U sentence-transformers



In [4]:
from sentence_transformers import SentenceTransformer
question_emb_model = SentenceTransformer('thenlper/gte-base')

data['question_emb'] = data['Questions'].apply(lambda x: question_emb_model.encode(x, normalize_embeddings=True))

In [5]:
answer_emb_model = SentenceTransformer('BAAI/bge-large-en-v1.5')

data['answer_emb'] = data['Answers'].apply(lambda x: answer_emb_model.encode(x, normalize_embeddings=True))

## Index documents

In [6]:
!pip install elasticsearch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
from elasticsearch import Elasticsearch

from ssl import create_default_context

context = create_default_context(cafile=r"/Users/satishsilveri/Desktop/certs/http_ca.crt")
es = Elasticsearch('https://localhost:9200',
    http_auth=('elastic', 'UorBHSXM5aoBabDwbi0P'),
    ssl_context=context,
)

  es = Elasticsearch('https://localhost:9200',


In [9]:
es

<Elasticsearch(['https://localhost:9200'])>

In [8]:
index_name="faq-index"
def generate_docs():
    for index, row in data.iterrows():
        doc = {
                "_index": index_name,
                "_source": {
                    "faq_id":row['Question_ID'],
                    "question":row['Questions'],
                    "answer":row['Answers'],
                    "question_emb": row['question_emb'],
                    "answer_emb": row['answer_emb']
                },
            }

        yield doc

In [10]:
es.delete_by_query(index=index_name, query={"match_all":{}})

ObjectApiResponse({'took': 367, 'timed_out': False, 'total': 882, 'deleted': 882, 'batches': 1, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []})

In [11]:
import tqdm
from elasticsearch.helpers import streaming_bulk
number_of_docs=len(data)
progress = tqdm.tqdm(unit="docs", total=number_of_docs)
successes = 0
for ok, action in streaming_bulk(client=es, index=index_name, actions=generate_docs()):
    progress.update(1)
    successes += ok

print("Indexed %d/%d documents" % (successes, number_of_docs))

  1%|▍                                         | 1/98 [00:00<00:54,  1.77docs/s]

Indexed 98/98 documents


In [12]:
def faq_search(query=""):
    
    if query is not None and len(query) == 0:
        print('Query cannot be empty')
        return None
    else:
        query_question_emb = question_emb_model.encode(query, normalize_embeddings=True)

        instruction="Represent this sentence for searching relevant passages: "

        query_answer_emb = answer_emb_model.encode(instruction + query, normalize_embeddings=True)

        payload = {
          "query": {
            "match": {
              "title": {
                "query": query,
                "boost": 0.3
              }
            }
          },
          "knn": [ {
            "field": "question_emb",
            "query_vector": query_question_emb,
            "k": 5,
            "num_candidates": 50,
            "boost": 0.3
          },
          {
            "field": "answer_emb",
            "query_vector": query_answer_emb,
            "k": 10,
            "num_candidates": 10,
            "boost": 0.5
          }],
          "size": 10,
          "_source":["faq_id","question", "answer"]
        }

        response = es.search(index=index_name, body=payload)['hits']['hits']

        return response

In [13]:
faq_search(query="how to check if I have mental health issues?")

  response = es.search(index=index_name, body=payload)['hits']['hits']


[{'_index': 'faq-index',
  '_id': '9dfqTosB4gDzD-Xi4p8W',
  '_score': 0.70398307,
  '_source': {'faq_id': 4283807,
   'question': 'What causes mental health problems?',
   'answer': 'Challenges or problems with your mental health can arise from psychological, biological, and social, issues, as well as life events.'}},
 {'_index': 'faq-index',
  '_id': 'CdfqTosB4gDzD-Xi4qAW',
  '_score': 0.42625695,
  '_ignored': ['answer.keyword'],
  '_source': {'faq_id': 1833460,
   'question': 'Where can older adults find help for mental health concerns?',
   'answer': "Mental health concerns are a serious concern at any age, and everyone deserves help and support. If you’re concerned about your mental health, you can: \n Talk to your family doctor or go to a walk-in clinic \n Call the Mental Health Support Line at 310-6789 (no area code) for information about services in your area \n Find your local mental health centre or program at www.gov.bc.ca/mentalhealth (you may need a doctor’s referral to ac

## Evaluate the system

In [14]:
!pip install google

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
!pip install protobuf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-small-finetuned-quora-for-paraphrasing")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-small-finetuned-quora-for-paraphrasing")

def paraphrase(question, number_of_questions=3, max_length=128):
    input_ids = tokenizer.encode(question, return_tensors="pt", add_special_tokens=True)

    generated_ids = model.generate(input_ids=input_ids, num_return_sequences=number_of_questions, num_beams=5, max_length=max_length, no_repeat_ngram_size=2, repetition_penalty=3.5, length_penalty=1.0, early_stopping=True)

    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

    return preds
  
preds = paraphrase("paraphrase: Where can older adults find help for mental health concerns?")

for pred in preds:
    print(pred)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transforme

How can I help older adults with mental health issues?
Where can I find help for mental health issues?
Where can I find mental health help for older adults?


In [17]:
temp_data = data[['Question_ID','Questions']]

In [18]:
# import torch
# from transformers import T5ForConditionalGeneration,T5Tokenizer


# def set_seed(seed):
#     torch.manual_seed(seed)
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed_all(seed)

# set_seed(42)

# model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
# tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_paraphraser')

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)


# def paraphrase(query, number_of_questions=4, max_len=256):
#     encoding = tokenizer.encode_plus(query,pad_to_max_length=True, return_tensors="pt")
#     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


#     # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
#     beam_outputs = model.generate(
#         input_ids=input_ids, attention_mask=attention_masks,
#         do_sample=True,
#         max_length=256,
#         top_k=120,
#         top_p=0.98,
#         early_stopping=True,
#         num_return_sequences=number_of_questions
#     )

#     final_outputs =[]
#     for beam_output in beam_outputs:
#         sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
#         if sent.lower() != query.lower() and sent not in final_outputs:
#             final_outputs.append(sent)

#     return final_outputs


In [21]:
eval_data = []

for index, row in temp_data.iterrows():
    preds = paraphrase(question = row['Questions'])
    
    for pred in preds:
        temp={}
        temp['Question'] = pred
        temp['FAQ_ID'] = row['Question_ID']
        eval_data.append(temp)
    
eval_data = pd.DataFrame(eval_data)

In [22]:
eval_data

Unnamed: 0,Question,FAQ_ID
0,What does it mean to have a mental illness?,1590140
1,What does it mean to have mental illness?,1590140
2,What is it like to have a mental illness?,1590140
3,How does mental illness affect?,2110618
4,How does mental illness affect you?,2110618
...,...,...
289,What is legal status of CBD oil?,6521784
290,What are the legal status of CBD oil?,6521784
291,What is the evidence of vaping?,3221856
292,What is the evidence for vaping?,3221856


In [23]:
def get_faq_id_s1(query="", k=5, num_candidates=10):
    
    if query is not None and len(query) == 0:
        print('Query cannot be empty')
        return None
    else:
        instruction="Represent this sentence for searching relevant passages: "

        query_answer_emb = answer_emb_model.encode(instruction + query, normalize_embeddings=True)

        payload = {
          "knn": [
          {
            "field": "answer_emb",
            "query_vector": query_answer_emb,
            "k": k,
            "num_candidates": num_candidates,
          }],
          "size": 1,
          "_source":["faq_id"]
        }

        response = es.search(index=index_name, body=payload)['hits']['hits']

        return response[0]['_source']['faq_id']

In [24]:
get_faq_id_s1(query="legal status of CBD oil?")

  response = es.search(index=index_name, body=payload)['hits']['hits']


6521784

In [25]:
def get_faq_id_s2(query="", k=5, num_candidates=10):
    
    if query is not None and len(query) == 0:
        print('Query cannot be empty')
        return None
    else:
        query_question_emb = question_emb_model.encode(query, normalize_embeddings=True)

        instruction="Represent this sentence for searching relevant passages: "

        query_answer_emb = answer_emb_model.encode(instruction + query, normalize_embeddings=True)
        
        payload = {
          "query": {
            "match": {
              "title": {
                "query": query,
                "boost": 0.2
              }
            }
          },
          "knn": [ {
            "field": "question_emb",
            "query_vector": query_question_emb,
            "k": k,
            "num_candidates": num_candidates,
            "boost": 0.3
          },
          {
            "field": "answer_emb",
            "query_vector": query_answer_emb,
            "k": k,
            "num_candidates": num_candidates,
            "boost": 0.5
          }],
          "size": 1,
          "_source":["faq_id"]
        }

        response = es.search(index=index_name, body=payload)['hits']['hits']

        return response[0]['_source']['faq_id']

In [26]:
get_faq_id_s2(query="legal status of CBD oil", k=10, num_candidates=10)

  response = es.search(index=index_name, body=payload)['hits']['hits']


6521784

In [None]:
#eval_data=eval_data.sample(frac=1).reset_index(drop=True)

In [27]:
eval_data['PRED_FAQ_ID_S1'] = eval_data['Question'].apply(lambda x: get_faq_id_s1(query=x, k=10, num_candidates=10))

  response = es.search(index=index_name, body=payload)['hits']['hits']


In [28]:
eval_data

Unnamed: 0,Question,FAQ_ID,PRED_FAQ_ID_S1
0,What does it mean to have a mental illness?,1590140,7995219
1,What does it mean to have mental illness?,1590140,7995219
2,What is it like to have a mental illness?,1590140,1590140
3,How does mental illness affect?,2110618,2110618
4,How does mental illness affect you?,2110618,1590140
...,...,...,...
289,What is legal status of CBD oil?,6521784,6521784
290,What are the legal status of CBD oil?,6521784,6521784
291,What is the evidence of vaping?,3221856,3221856
292,What is the evidence for vaping?,3221856,3221856


In [29]:
eval_data['PRED_FAQ_ID_S2'] = eval_data['Question'].apply(lambda x: get_faq_id_s2(query=x, k=10, num_candidates=10))

  response = es.search(index=index_name, body=payload)['hits']['hits']


In [30]:
eval_data

Unnamed: 0,Question,FAQ_ID,PRED_FAQ_ID_S1,PRED_FAQ_ID_S2
0,What does it mean to have a mental illness?,1590140,7995219,1590140
1,What does it mean to have mental illness?,1590140,7995219,1590140
2,What is it like to have a mental illness?,1590140,1590140,1590140
3,How does mental illness affect?,2110618,2110618,2110618
4,How does mental illness affect you?,2110618,1590140,2110618
...,...,...,...,...
289,What is legal status of CBD oil?,6521784,6521784,6521784
290,What are the legal status of CBD oil?,6521784,6521784,6521784
291,What is the evidence of vaping?,3221856,3221856,3221856
292,What is the evidence for vaping?,3221856,3221856,3221856


### System 1 (using KNN search only)

In [31]:
from sklearn.metrics import accuracy_score

ground_truth = eval_data["FAQ_ID"].values
predictions_s1 = eval_data["PRED_FAQ_ID_S1"].values

accuracy = accuracy_score(ground_truth, predictions_s1)

In [32]:
accuracy

0.7210884353741497

### System 2(using Query, Question KNN and Answer KNN)

In [33]:
predictions_s2 = eval_data["PRED_FAQ_ID_S2"].values

accuracy = accuracy_score(ground_truth, predictions_s2)

In [34]:
accuracy

0.8571428571428571