In [1]:
%pip install -q selenium==4.31.0 webdriver-manager==4.0.2 numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import concurrent.futures
import threading
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from webdriver_manager.chrome import ChromeDriverManager

# **Scraping predicted labels**

In [3]:
# --- Scraper configuration -------------------------------------------------
# Page that is opened for every query, and the timeout handed to WebDriverWait.
url = "https://intent-classification-in-banking-cs221.streamlit.app/"
timeout = 20

# Locators used inside the page. `iframe_title` is matched against the
# iframe's @title attribute; the other three are XPath expressions.
iframe_title = "streamlitApp"
textarea = "//textarea[@aria-label='Please enter your request here:']"
classify_button = "//button[.//p[text()='Classify Intent']]"
# Matches a <code> element that has a text node containing no space character.
label = "//code[text() and not(contains(text(), ' '))]"

In [4]:
class Get_label:
    """Fetch one predicted label from the web app through a headless browser.

    Constructing an instance starts a headless Chrome session, loads ``url``
    and switches into the iframe whose title is ``iframe_title``. The instance
    must be released with :meth:`close`; :meth:`process_single_query` does the
    whole open / query / read / close cycle for a single query.
    """

    # Number of queries completed so far, shared by every caller.
    count = 0
    # Protects `count`: process_single_query is run from a thread pool, and an
    # unguarded `count += 1` is a read-modify-write that can lose updates.
    _count_lock = threading.Lock()

    def __init__(self):
        """Copy the module-level scraper settings and open the browser session."""
        self.iframe_title = iframe_title
        self.text_area = textarea
        self.classify_button = classify_button
        self.label = label
        self.timeout = timeout
        self.url = url
        self.driver, self.wait = self._initialize_driver()

    def _initialize_driver(self):
        """Start headless Chrome, load the page and enter the app iframe.

        Returns:
            tuple: ``(driver, wait)`` where ``wait`` is a ``WebDriverWait``
            bound to ``driver`` with ``self.timeout``.

        Raises:
            Exception: whatever Selenium raises while loading the page or
            waiting for the iframe. The browser is shut down before the
            exception propagates, so a failed start does not leave a Chrome
            process behind.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)

            wait = WebDriverWait(driver, self.timeout)
            iframe = wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, f'//iframe[@title="{self.iframe_title}"]')
                )
            )
            driver.switch_to.frame(iframe)
        except Exception:
            # The caller never receives `driver` on this path, so it is the
            # only place that can release the browser.
            driver.quit()
            raise

        return driver, wait

    def enter_query(self, query):
        """Type ``query`` into the request textarea and click the classify button."""
        placeholder = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self.text_area))
        )
        placeholder.send_keys(query)

        classify_button = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self.classify_button))
        )
        classify_button.click()

    def get_predicted_label(self):
        """Wait for an element matching the label XPath and return its stripped text."""
        code_element = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self.label))
        )
        return code_element.text.strip()

    @staticmethod
    def process_single_query(query):
        """Open a fresh session, classify ``query`` and return ``(query, label)``.

        The session is always closed, including when entering the query or
        reading the label raises; the exception then propagates to the caller.
        Every successful call increments ``Get_label.count`` and a progress
        line is printed on each 10th completion.
        """
        instance = Get_label()
        try:
            instance.enter_query(query)
            label = instance.get_predicted_label()
        finally:
            instance.close()

        with Get_label._count_lock:
            Get_label.count += 1
            completed = Get_label.count

        if completed % 10 == 0:
            print(f"Get {completed} predicted label")

        return (query, label)

    def close(self):
        """Quit the browser session; calling it again is a no-op."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            # Drop the reference first so a second close() cannot quit twice.
            self.driver = None
            driver.quit()

In [5]:
# Read the dataset and pull the `text` column out as a plain list, one entry
# per row, in row order.
df = pd.read_csv("1925_rows_dataset.csv")
query_list = list(df["text"])

In [6]:
# Classify every query through the web app, four browser sessions at a time.
results = []

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # executor.map yields the (query, label) pairs in the same order as
    # query_list, so position i in `results` belongs to row i of the dataset.
    # Appending one at a time keeps the pairs gathered so far if a query fails.
    for query_result in executor.map(Get_label.process_single_query, query_list):
        results.append(query_result)

# Keep only the label of each pair. A comprehension is used instead of
# `zip(*results)` because unpacking the latter raises ValueError when
# `results` is empty, and it also bound a global `_`.
final_label_list = [predicted for _query, predicted in results]

Get 10 predicted label
Get 20 predicted label
Get 30 predicted label
Get 40 predicted label
Get 50 predicted label
Get 60 predicted label
Get 70 predicted label
Get 80 predicted label
Get 90 predicted label
Get 100 predicted label
Get 110 predicted label
Get 120 predicted label
Get 130 predicted label
Get 140 predicted label
Get 150 predicted label
Get 160 predicted label
Get 170 predicted label
Get 180 predicted label
Get 190 predicted label
Get 200 predicted label
Get 210 predicted label
Get 220 predicted label
Get 230 predicted label
Get 240 predicted label
Get 250 predicted label
Get 260 predicted label
Get 270 predicted label
Get 280 predicted label
Get 290 predicted label
Get 300 predicted label
Get 310 predicted label
Get 320 predicted label
Get 330 predicted label
Get 340 predicted label
Get 350 predicted label
Get 360 predicted label
Get 370 predicted label
Get 380 predicted label
Get 390 predicted label
Get 400 predicted label
Get 410 predicted label
Get 420 predicted label
G

In [7]:
# Store the scraped labels next to their rows, then persist the frame
# (without the index column) for the evaluation step.
df['Predicted_label'] = list(final_label_list)
df.to_csv("Predicted_labels.csv", index=False)

# **Evaluation module**

In [8]:
# Load the frame saved as "Predicted_labels.csv"; from here on `df` refers to
# the file's contents rather than the in-memory frame built while scraping.
df = pd.read_csv('Predicted_labels.csv')

In [9]:
# Reference labels and scraped predictions, taken column-wise from `df`.
y_true, y_pred = df['label_text'], df['Predicted_label']

# Sorted distinct reference labels; these define the classes that are scored.
labels = np.unique(y_true)
print(f"Class numbers: {len(labels)}")

Class numbers: 76


In [10]:
# Per-class scores, one entry per value in `labels` and in that order.
# `labels` already holds np.unique(y_true), so it is passed as-is; a class with
# an undefined score gets 0 (zero_division=0).
per_class_scores = precision_recall_fscore_support(
    y_true, y_pred, labels=labels, average=None, zero_division=0
)
precision, recall, f1_score, support = per_class_scores

# Fraction of rows whose predicted label equals the reference label.
overall_accuracy = accuracy_score(y_true, y_pred)

In [11]:
# One row per class with its precision / recall / F1 / support.
per_class_df = pd.DataFrame(
    {
        'Label': labels,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1_score,
        'Support': support,
    }
)

# Footer row: the overall accuracy is stored in the F1-Score column, precision
# and recall are left empty, and Support is the total number of rows scored.
summary_row = pd.DataFrame(
    {
        'Label': ['Overall Accuracy'],
        'Precision': [np.nan],
        'Recall': [np.nan],
        'F1-Score': [overall_accuracy],
        'Support': [len(y_true)],
    }
)

metrics_df = pd.concat([per_class_df, summary_row], ignore_index=True)

In [12]:
# File the metrics table is written to.
evaluate__result = 'Evaluating Results on New Test set.csv'
# Write the table without the index column; floats are formatted with four
# decimal places.
metrics_df.to_csv(evaluate__result, index=False, float_format='%.4f')