In [1]:
import pandas as pd
import numpy as np
import matplotlib_inline as plt
import seaborn as sns
import datasets
import nltk

from tqdm import tqdm
from presidio_analyzer import AnalyzerEngine
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
from nltk.corpus import stopwords

from quantity_maxim import extract_named_entities, count_words, count_elements, calculate_overlap_metrics, calculate_information_density, anonymized_text

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonamazzarino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonamazzarino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonamazzarino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonamazzarino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
orca = load_dataset("Open-Orca/OpenOrca")
databricks = load_dataset("databricks/databricks-dolly-15k")

In [4]:
df = databricks["train"].to_pandas()
df

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa
...,...,...,...,...
15006,How do i accept the change,,Embrace the change and see the difference,brainstorming
15007,What is a laser and who created it?,A laser is a device that emits light through a...,A laser is a device that emits light from an e...,summarization
15008,What is the difference between a road bike and...,,Road bikes are built to be ridden on asphalt a...,open_qa
15009,How does GIS help in the real estate investmen...,,"Real estate investors depend on precise, accur...",general_qa


In [5]:
openq = df[df['category']=='open_qa']
openq = openq[['instruction', 'response']]
openq.reset_index(inplace=True)
openq.drop(columns=['index'], inplace=True)
openq

Unnamed: 0,instruction,response
0,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice
2,Who gave the UN the land in NY to build their HQ,John D Rockerfeller
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na..."
...,...,...
3737,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...
3738,Who won the World Chess Championship in 2021?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...
3739,Why is it a good idea to walk every day?,Walking is a good exercise for burning calorie...
3740,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...


In [2]:
openq = pd.read_csv("databricks_dolly_open_questions.csv")
openq

Unnamed: 0,instruction,response
0,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice
2,Who gave the UN the land in NY to build their HQ,John D Rockerfeller
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na..."
...,...,...
3737,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...
3738,Who won the World Chess Championship in 2021?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...
3739,Why is it a good idea to walk every day?,Walking is a good exercise for burning calorie...
3740,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...


## Named Entity Extraction from instruction and response

In [7]:
for column in tqdm(openq.columns):
    new_column_name = f'{column}_entities'
    openq[new_column_name] = openq[column].apply(extract_named_entities)


100%|██████████| 2/2 [01:27<00:00, 43.59s/it]


## Word count for instruction and response

In [8]:
openq['instruction_words'] = openq['instruction'].apply(count_words)
openq['response_words'] = openq['response'].apply(count_words)

## Named Entities count for each values in instruction and response columns

In [9]:
for column in openq.filter(like='_entities').columns:
    count_column_name = f'{column}_count'
    openq[count_column_name] = openq[column].apply(count_elements)

## Calculate the ratio between number of words and entities in instruction and number of entities in response
Ratio > 0: question has more words/entities than response \
Ratio = 0: same number of words/entities in question and response \
Ratio < 0: question has less words/entities than response \
Ratio = Nan/Error: there aren't any words/entities in response


In [10]:
openq['ratio_entities_words_instruction'] = openq['instruction_entities_count']/openq['instruction_words']
openq['ratio_entities_words_response'] = openq['response_entities_count']/openq['response_words']

## If instruction contains WHO, than check if PERSON entity is found in response

Coherence WHO = 1 if instruction contains WHO and PERSON entity is found in response \
Coherence WHO = -1 if instruction contains WHO and PERSON entity is NOT found in response \
Coherence WHO = 0 if instruction doesn't contain WHO and PERSON entity is NOT found in response \
Coherence WHO = 0 if instruction doesn't contain WHO and PERSON entity is found in response \

In [11]:
openq['contains_who'] = openq['instruction'].str.contains(r'\bWho\b', case=False, na=False)
openq['has_person_entity'] = openq.apply(
    lambda row: any('PERSON' in entities for entities in row['response_entities']) if row['contains_who'] else False,
    axis=1
)

openq['cohere_who'] = openq['contains_who']

for i in range(len(openq)):
        if (openq['contains_who'][i] == True and openq['has_person_entity'][i] == True):
            openq['cohere_who'][i] = 1
        elif (openq['contains_who'][i] == True and openq['has_person_entity'][i] == False):
            openq['cohere_who'][i] = -1
        elif (openq['contains_who'][i] == False and openq['has_person_entity'][i] == False):
            openq['cohere_who'][i] = 0
        elif (openq['contains_who'][i] == False and openq['has_person_entity'][i] == True):
            openq['cohere_who'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  openq['cohere_who'][i] = 0


## If instruction contains WHERE, than check if LOCATION entity is found in response

Coherence WHERE = 1 if instruction contains WHERE and LOCATION entity is found in response \
Coherence WHERE = -1 if instruction contains WHERE and LOCATION entity is NOT found in response \
Coherence WHERE = 0 if instruction doesn't contain WHERE and LOCATION entity is NOT found in response \
Coherence WHERE = 0 if instruction doesn't contain WHERE and LOCATION entity is found in response \

In [12]:
openq['contains_where'] = openq['instruction'].str.contains(r'\bWhere\b', case=False, na=False)
openq['has_location_entity'] = openq.apply(
    lambda row: any('LOCATION' in entities for entities in row['response_entities']) if row['contains_where'] else False,
    axis=1
)

openq['cohere_where'] = openq['contains_where']

for i in range(len(openq)):
        if (openq['contains_where'][i] == True and openq['has_location_entity'][i] == True):
            openq['cohere_where'][i] = 1
        elif (openq['contains_where'][i] == True and openq['has_location_entity'][i] == False):
            openq['cohere_where'][i] = -1
        elif (openq['contains_where'][i] == False and openq['has_location_entity'][i] == False):
            openq['cohere_where'][i] = 0
        elif (openq['contains_where'][i] == False and openq['has_location_entity'][i] == True):
            openq['cohere_where'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  openq['cohere_where'][i] = 0


In [13]:
openq['contains_when'] = openq['instruction'].str.contains(r'\bWhen\b', case=False, na=False)
openq['has_datetime_entity'] = openq.apply(
    lambda row: any('DATE_TIME' in entities for entities in row['response_entities']) if row['contains_when'] else False,
    axis=1
)

openq['cohere_when'] = openq['contains_when']

for i in range(len(openq)):
        if (openq['contains_when'][i] == True and openq['has_datetime_entity'][i] == True):
            openq['cohere_when'][i] = 1
        elif (openq['contains_when'][i] == True and openq['has_datetime_entity'][i] == False):
            openq['cohere_when'][i] = -1
        elif (openq['contains_where'][i] == False and openq['has_datetime_entity'][i] == False):
            openq['cohere_when'][i] = 0
        elif (openq['contains_when'][i] == False and openq['has_datetime_entity'][i] == True):
            openq['cohere_when'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  openq['cohere_when'][i] = 0


## Overlap Metrics
Count the number of overlapping words or n-grams (sequences of n words) between the question and the response. A higher overlap may suggest that the response is more directly related to the question. It considers only content words.

## Information Density
Assess the information density by considering the ratio of informative content to the total length of the response. This could involve counting content-bearing words (excluding common words like "is," "the," etc.) relative to the total word count.

### Delta Information Density
Calculate the difference between question information density and response information density.
Delta > 0: question more dense than response
Delta = 0: question as dense as response
Delta < 0: question less dense than response

In [14]:
openq['overlap_words'] = openq.apply(lambda row: calculate_overlap_metrics(row['instruction'], row['response']), axis=1)
openq['information_density_question'] = openq['instruction'].apply(calculate_information_density)
openq['information_density_response'] = openq['response'].apply(calculate_information_density)
openq['delta_information_density'] = openq['information_density_question'] - openq['information_density_response']
openq


Unnamed: 0,instruction,response,instruction_entities,response_entities,instruction_words,response_words,instruction_entities_count,response_entities_count,ratio_entities_words_instruction,ratio_entities_words_response,...,contains_where,has_location_entity,cohere_where,contains_when,has_datetime_entity,cohere_when,overlap_words,information_density_question,information_density_response,delta_information_density
0,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...,[],[],9,21,0,0,0.000000,0.000000,...,False,False,0,False,False,0,2,0.555556,0.523810,0.031746
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice,"[PERSON, PERSON, PERSON]",[PERSON],22,8,3,1,0.136364,0.125000,...,False,False,0,False,False,0,4,0.409091,0.500000,-0.090909
2,Who gave the UN the land in NY to build their HQ,John D Rockerfeller,[LOCATION],[PERSON],12,3,1,1,0.083333,0.333333,...,False,False,0,False,False,0,0,0.500000,0.666667,-0.166667
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...,[],[],5,82,0,0,0.000000,0.000000,...,False,False,0,False,False,0,1,0.200000,0.426829,-0.226829
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na...",[PERSON],[DATE_TIME],14,23,1,1,0.071429,0.043478,...,False,False,0,False,False,0,3,0.571429,0.434783,0.136646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...,[PERSON],"[PERSON, PERSON]",9,65,1,2,0.111111,0.030769,...,False,False,0,False,False,0,4,0.444444,0.492308,-0.047863
3738,Who won the World Chess Championship in 2021?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...,[DATE_TIME],"[PERSON, PERSON, DATE_TIME]",9,16,1,3,0.111111,0.187500,...,False,False,0,False,False,0,3,0.444444,0.625000,-0.180556
3739,Why is it a good idea to walk every day?,Walking is a good exercise for burning calorie...,[DATE_TIME],[],11,40,1,0,0.090909,0.000000,...,False,False,0,False,False,0,2,0.454545,0.500000,-0.045455
3740,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...,[],"[PERSON, DATE_TIME]",7,20,0,2,0.000000,0.100000,...,False,False,0,False,False,0,1,0.285714,0.500000,-0.214286


In [15]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)


# Apply the function to the 'instruction' column
openq['instruction_no_stops'] = openq['instruction'].apply(remove_stopwords)
openq['response_no_stops'] = openq['response'].apply(remove_stopwords)

openq['len_instruction_no_stops'] = openq['instruction_no_stops'].apply(count_words)
openq['len_response_no_stops'] = openq['response_no_stops'].apply(count_words)

openq

Unnamed: 0,instruction,response,instruction_entities,response_entities,instruction_words,response_words,instruction_entities_count,response_entities_count,ratio_entities_words_instruction,ratio_entities_words_response,...,has_datetime_entity,cohere_when,overlap_words,information_density_question,information_density_response,delta_information_density,instruction_no_stops,response_no_stops,len_instruction_no_stops,len_response_no_stops
0,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...,[],[],9,21,0,0,0.000000,0.000000,...,False,0,2,0.555556,0.523810,0.031746,camels survive long without water ?,Camels use fat humps keep filled energy hydrat...,6,12
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice,"[PERSON, PERSON, PERSON]",[PERSON],22,8,3,1,0.136364,0.125000,...,False,0,4,0.409091,0.500000,-0.090909,"Alice 's parents three daughters : Amy , Jessy...",name third daughter Alice,15,4
2,Who gave the UN the land in NY to build their HQ,John D Rockerfeller,[LOCATION],[PERSON],12,3,1,1,0.083333,0.333333,...,False,0,0,0.500000,0.666667,-0.166667,gave UN land NY build HQ,John Rockerfeller,6,2
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...,[],[],5,82,0,0,0.000000,0.000000,...,False,0,1,0.200000,0.426829,-0.226829,polygon ?,polygon form Geometry . single dimensional pla...,2,45
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na...",[PERSON],[DATE_TIME],14,23,1,1,0.071429,0.043478,...,False,0,3,0.571429,0.434783,0.136646,episodes season four Game Thrones Michelle Mac...,directed `` Oathkeeper '' `` First Name '' fou...,9,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...,[PERSON],"[PERSON, PERSON]",9,65,1,2,0.111111,0.030769,...,False,0,4,0.444444,0.492308,-0.047863,"troll respawn , Jeremy mean ?",Trolling respawn refers video games player han...,6,36
3738,Who won the World Chess Championship in 2021?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...,[DATE_TIME],"[PERSON, PERSON, DATE_TIME]",9,16,1,3,0.111111,0.187500,...,False,0,3,0.444444,0.625000,-0.180556,World Chess Championship 2021 ?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...,5,14
3739,Why is it a good idea to walk every day?,Walking is a good exercise for burning calorie...,[DATE_TIME],[],11,40,1,0,0.090909,0.000000,...,False,0,2,0.454545,0.500000,-0.045455,good idea walk every day ?,Walking good exercise burning calories without...,6,22
3740,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...,[],"[PERSON, DATE_TIME]",7,20,0,2,0.000000,0.100000,...,False,0,1,0.285714,0.500000,-0.214286,creator Python ?,Guido van Rossum father Python . 1st version P...,3,12


In [5]:
openq['instruction_anonymized'] = openq['instruction'].apply(anonymized_text)
openq['response_anonymized'] = openq['response'].apply(anonymized_text)

In [6]:
openq

Unnamed: 0,instruction,response,instruction_anonymized,response_anonymized
0,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice,"PERSON's parents have three daughters: PERSON,...",The name of the third daughter is PERSON
2,Who gave the UN the land in NY to build their HQ,John D Rockerfeller,Who gave the UN the land in LOCATION to build ...,PERSON
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...,What is a polygon?,A polygon is a form in Geometry. It is a sing...
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na...",Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na..."
...,...,...,...,...
3737,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...,"What does troll the respawn, PERSON mean?",Trolling the respawn refers to when in video g...
3738,Who won the World Chess Championship in 2021?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...,Who won the World Chess Championship in DATE_T...,PERSON defeated PERSON 7.5 - 3.5 to become the...
3739,Why is it a good idea to walk every day?,Walking is a good exercise for burning calorie...,Why is it a good idea to walk DATE_TIME?,Walking is a good exercise for burning calorie...
3740,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...,Who is the creator of Python?,PERSON is the father of Python. And the 1st ve...


In [16]:
def boost_calculation(n_entities, n_content_words, ):
    boost = 1 - (n_entities / n_content_words)


In [17]:
words = [a, b, c, d, e, f]
stops = [g, h, i]
NER = [a, f]

boost1 = 1 - (len(NER) / len(words))

real_w = 0
stop_w = len(stops)

for word in words:
    if word in NER:
        real_w += 1 + boost1
    else:
        real_w += 1

real_w / (real_w + stop_w)

NameError: name 'a' is not defined

In [None]:
def calculate_information_density(response):
    response_tokens = nltk.word_tokenize(response.lower())
    stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in response_tokens if token.isalnum() and token not in stop_words]

    information_density = len(content_tokens) / len(response_tokens) if len(response_tokens) > 0 else 0

    return information_density

In [7]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig, RecognizerResult


# Analyzer output
analyzer_results = [
    RecognizerResult(entity_type="PERSON", start=11, end=15, score=0.8),
    RecognizerResult(entity_type="PERSON", start=17, end=27, score=0.8),
]

text_to_anonymize = "My name is Bond, James Bond"

anonymizer = AnonymizerEngine()

# Define anonymization operators
operators = {
    "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
    "PHONE_NUMBER": OperatorConfig(
        "mask",
        {
            "type": "mask",
            "masking_char": "*",
            "chars_to_mask": 12,
            "from_end": True,
        },
    ),
    "TITLE": OperatorConfig("redact", {}),
}

anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize, analyzer_results=analyzer_results, operators=operators
)

print(f"text: {anonymized_results.text}")
print("detailed result:")

text: My name is <ANONYMIZED>, <ANONYMIZED>
detailed result:
