<a href="https://colab.research.google.com/github/prasadsawant7/KeyBERT_API/blob/main/keyword_extractor_with_keybert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install keybert pandas

In [119]:
from keybert import KeyBERT
import pandas as pd
import re
import json

In [120]:
# Read the emotion corpus and the keyword-labelled CSV used for evaluation.
df = pd.read_csv("./drive/MyDrive/Datasets/Emotion_final.csv")
df_test = pd.read_csv("./drive/MyDrive/Datasets/chat_emotion_keyword_labelled.csv")

# Rebuild the corpus frame as Text / Keywords / Emotion, with "Keywords"
# filled with None for every row.
d = {"Text": df["Text"], "Keywords": None, "Emotion": df["Emotion"]}
df = pd.DataFrame(d)

# Show the number of missing values in each column.
null_counts = df.isnull().sum()
print(null_counts)

Text            0
Keywords    21459
Emotion         0
dtype: int64


In [121]:
# Module-level KeyBERT instance used by extract_keywords_from_text below;
# it is constructed with the "distilbert-base-nli-mean-tokens" model.
model = KeyBERT(model="distilbert-base-nli-mean-tokens")

def extract_keywords_from_text(text: str, labeled_keywords: list):
  """Extract keywords from `text` and score them against `labeled_keywords`.

  A fixed set of filler words (Im/im/I'm, did/didn't/didnt, do,
  was/wasn't/wasnt) is removed from the text first. The module-level
  KeyBERT `model` is then asked for up to 10 keywords with English stop
  words excluded, and only keywords whose score is >= 0 are kept.

  Args:
    text: Sentence to extract keywords from.
    labeled_keywords: Reference keywords passed on to `calculate_metrics`.

  Returns:
    A dict with two entries: "keywords", mapping each kept keyword to the
    score KeyBERT returned for it, and "evaluation_metrics", the value
    returned by `calculate_metrics`.
  """
  cleaned_text = re.sub(
    r"\b(Im|im|I'm|Im|didn't|didnt|did|do|wasn't|wasnt|was)\b", "", text
  )
  candidates = model.extract_keywords(cleaned_text, top_n=10, stop_words="english")

  # Single pass: collect both the keyword -> score mapping and the plain
  # keyword list, keeping only non-negative scores.
  scored_keywords = {}
  extracted_keywords = []
  for keyword, score in candidates:
    if score >= 0:
      scored_keywords[keyword] = score
      extracted_keywords.append(keyword)

  return {
    "keywords": scored_keywords,
    "evaluation_metrics": calculate_metrics(extracted_keywords, labeled_keywords),
  }

def calculate_metrics(extracted_keywords: list, labeled_keywords: list):
  """Compare extracted keywords with the labelled reference keywords.

  The reported "accuracy" is the percentage of distinct labelled keywords
  that also occur (exact string match) among the extracted keywords.

  Args:
    extracted_keywords: Keywords produced by the extractor.
    labeled_keywords: Reference keywords for the same text.

  Returns:
    A dict with a single "accuracy" entry holding the percentage formatted
    as a string with two decimals (e.g. "83.33"). When there are no
    labelled keywords the accuracy is reported as "0.00".
  """
  # Deduplicate the labels so the denominator counts the same distinct
  # keywords the set-based match count does; otherwise a repeated label
  # would make a perfect score unreachable.
  expected = set(labeled_keywords)

  if not expected:
    # Nothing to match against: report zero instead of dividing by zero.
    return {"accuracy": format(0.0, ".2f")}

  matches = len(expected.intersection(extracted_keywords))
  accuracy = (matches / len(expected)) * 100

  return {"accuracy": format(accuracy, ".2f")}

In [135]:
# Per-row accuracy percentages, one entry per labelled row.
accuracies = []

# NOTE(review): the text is read from `df` while the labels come from
# `df_test`, matched by index label — this assumes both frames list the same
# sentences in the same order; confirm against the datasets.
for i, keyword_cell in df_test["Keywords"].items():
  text = df["Text"][i]
  # Labels are stored as one ", "-separated string per row.
  labeled_keywords = keyword_cell.split(", ")
  result = extract_keywords_from_text(text, labeled_keywords)
  # The metric is returned as a formatted string; convert it back for averaging.
  accuracies.append(float(result["evaluation_metrics"]["accuracy"]))
  json_data = json.dumps(result, indent=4)
  print(json_data)

{
    "keywords": {
        "humiliated": 0.9278,
        "feel": 0.5876
    },
    "evaluation_metrics": {
        "accuracy": "100.00"
    }
}
{
    "keywords": {
        "hopeless": 0.45,
        "hopeful": 0.4371,
        "cares": 0.4187,
        "feeling": 0.3729,
        "awake": 0.369,
        "damned": 0.357,
        "just": 0.2487
    },
    "evaluation_metrics": {
        "accuracy": "71.43"
    }
}
{
    "keywords": {
        "greedy": 0.7477,
        "wrong": 0.5326,
        "grabbing": 0.4714,
        "minute": 0.4501,
        "post": 0.3772,
        "feel": 0.3513
    },
    "evaluation_metrics": {
        "accuracy": "100.00"
    }
}
{
    "keywords": {
        "fireplace": 0.4318,
        "nostalgic": 0.3366,
        "property": 0.2878,
        "know": 0.2258,
        "feeling": 0.2022
    },
    "evaluation_metrics": {
        "accuracy": "83.33"
    }
}
{
    "keywords": {
        "grouchy": 0.872,
        "feeling": 0.6839
    },
    "evaluation_metrics": {
        "

In [137]:
# Average the per-row accuracy percentages and report the mean.
accuracy_total = sum(accuracies)
overall_accuracy = accuracy_total / len(accuracies)
print(f"Overall Accuracy: {overall_accuracy:.2f}%")

Overall Accuracy: 88.72%
