In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchWindowException, ElementNotInteractableException, JavascriptException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import io
import json
import random
import unicodedata

In [14]:
################################################################
def process_text(text: str) -> str:
    """
    Remove leading and trailing spaces
    Args:
        text (str): Input text string
    Returns:
        processed_text: Processed text
    """
    text = text.strip()
    processed_text = ' '.join(text.split()) 
    return processed_text

###############################################################

def preprocess_text_for_file(text: str) -> str:
    """
    Processing for keeping special characters intact
    Args:
        text (str): Input text string
    Returns:
        encoded_text: Processed text with special characters
    """
    # Normalize the text to NFKD form to handle special characters
    normalized_text = unicodedata.normalize('NFKD', text)
    # Encode the text in UTF-8 and then decode it back to string, ignoring errors
    encoded_text = normalized_text.encode('utf-8', 'ignore').decode('utf-8')
    return encoded_text

################################################################

# def add_qa_pair(qa_list, question, answer):
#     qa_dict = {'question': question, 'answer': answer}
#     qa_list.append(qa_dict)

################################################################

def extract_text_from_page(driver):
    """
    Extract text from a single JIO FAQ page
    Args:
        driver : Selenium driver for page
    Returns:
        text: Text of the page
    """
    # Initialize an empty list to hold the elements
    elements = []

    # Start the index at 1 (or the appropriate starting index for your elements)
    i = 1
    text = []
    while True:
        # Construct the XPath for the current index
        xpath = f'/html/body/div[3]/section[2]/div/div[3]/div[2]/section/div/div/div/div/div/div[{i}]'
        try:
            # Attempt to find the element
            element = driver.find_element(By.XPATH, xpath)
            # Append the found element to the list
            elements.append(element)
            ans = element.find_element(By.CLASS_NAME, 'sp--base')

            # Print the element or any attribute to verify
            ans_text = ans.get_attribute("innerText")
            ans_text_processed = process_text(ans_text)
            faq_text = element.text + " " + ans_text_processed
            text.append(faq_text)
            #add_qa_pair(text, element.text, ans_text_processed)
            # print(f"Found Question {i}: {element.text}")
            # print(f"Found Answer {i}: {ans_text_processed}")
        except:
            # Break the loop if the element is not found
            print(f"No more elements found at index {i}")
            break
        
        # Increment the index for the next iteration
        i += 1

    return text

################################################################

def click_next_option(driver, curr_page, curr_option):
    """
    Click next option in a section of JIO FAQ
    Args:
        driver : Selenium driver for page
        curr_page : Current FAQ Section
        curr_option : Current Option In FAQ Section
    Returns:
        0
    """
    xpath = f'//*[@id="SecondList"]/div[{curr_page}]/div/div/ul/li[{curr_option+1}]/a'
    next_option = driver.find_element(By.XPATH, xpath)
    actions = ActionChains(driver)
    actions.move_to_element(next_option).pause(1)
    actions.click()
    actions.perform()
    return 0

################################################################

def click_next_page(driver, curr_page):
    """
    Click next section on JIO FAQ
    Args:
        driver : Selenium driver for page
        curr_page : Current FAQ Section
    Returns:
        0
    """
    xpath = f'//*[@id="SecondList"]/div[{curr_page+1}]'
    next_page = driver.find_element(By.XPATH, xpath)
    actions = ActionChains(driver)
    actions.move_to_element(next_page).pause(1)
    actions.click()
    actions.perform()
    return 0

################################################################

def click_view_more(driver):
    """
    Click view more option if it exists on the page
    Args:
        driver : Selenium driver for page
    Returns:
        0
    """
    xpath = '//*[@id="loadMoreCustom"]'
    try:
        # Attempt to find the element
        button = driver.find_element(By.XPATH, xpath)  # Replace with your element's ID
        actions = ActionChains(driver)
        actions.move_to_element(button).pause(1)
        actions.click()
        actions.perform()
        print("Succesful Click")
        return 1
    except NoSuchWindowException:
        return 0
    except ElementNotInteractableException:
        return 0
    except JavascriptException:
        return 0
    except NoSuchElementException:
        return 0
################################################################

def find_no_of_pages(driver):
    """
    Find number of sections and options in each section
    Args:
        driver : Selenium driver for page
    Returns:
        item_count : Number of sections
        tab_order_list : List of options in each section
    """
    try:
        # Find the SecondList div by its ID
        second_list_div = driver.find_element(By.ID, "SecondList")

        # Find elements with the specified class names inside the SecondList div
        items = second_list_div.find_elements(By.CSS_SELECTOR, '.j-accordion-panel.faqs-item-list, .j-accordion-panel.faqs-item-list.active')

        # Get the count of these elements
        item_count = len(items)

        # Output the count
        print('Number of Pages:', item_count)

        # Initialize count for tab-order-list elements
        tab_order_list = []

        # Iterate through the items to find div elements containing tab-order-list
        for i, item in enumerate(items):
            tab_order_lists = item.find_elements(By.CSS_SELECTOR, '.tab-item')
            # print(f"For page {i} we have {len(tab_order_lists)} dropdown pages")
            tab_order_list.append(len(tab_order_lists))
    except Exception as e:
        print('An error occurred:', str(e))
    return item_count, tab_order_list

################################################################

file_path = 'data.txt'

In [15]:
# Set up the WebDriver
PATH = "C:/Users/poops/Desktop/Coding Shenanigans/PokemonLegends Bot/chromedriver-win64/chromedriver-win64/chromedriver.exe"

cService = webdriver.ChromeService(executable_path=PATH)
options = Options()
options.page_load_strategy = 'normal'  # Waits for the full page load
options.add_argument("--blink-settings=imagesEnabled=false")  # Disable images for faster loading

driver = webdriver.Chrome(service=cService, options=options)
driver.set_page_load_timeout(60)  # Set page load timeout
driver.set_script_timeout(60)    # Set script timeout

# URL of the website
page_urls = [
    "https://www.jio.com/help/faq/mobile/jio-true-5g/about-5g#/",

]

driver.get(page_urls[0])

# Print number of sections and options in each section
current_page = 0
current_option = 0
page_count, dropdown_list = find_no_of_pages(driver=driver)
print(dropdown_list)
time.sleep(5)


Number of Pages: 10
[3, 6, 5, 2, 9, 11, 179, 10, 2, 13]


In [16]:
################### CODE TO EXTRACT TEXT DATA FROM JIO FAQ ############################

current_option = 0
current_page = 0
for curr_page in range(page_count):
    current_page_page = curr_page
    time.sleep(1)
    curr_page += 1
    click_next_option(driver=driver, curr_page=curr_page, curr_option=0)
    for curr_option in range(dropdown_list[curr_page-1]):
        current_option = curr_option
        if curr_option % 10:
            driver.refresh()
        curr_option += 1
        time.sleep(2)
        print("Processing :", curr_page, curr_option)
        view_more = 1
        no_clicked = 0 
        while view_more == 1:
            no_clicked += 1
            if no_clicked > 10:
                break
            view_more = click_view_more(driver=driver)
        extracted_texts = extract_text_from_page(driver=driver)
        with open(file_path, 'a', encoding='utf-8') as data:
            for text in extracted_texts:
                json_string = json.dumps(text, indent=4, ensure_ascii=False) + ',\n'
                correct_text = preprocess_text_for_file(json_string)
                data.write(correct_text)
        current_page = curr_page
        current_option = curr_option
        if curr_option == dropdown_list[curr_page-1]:
            break
        time.sleep(3)
        click_next_option(driver=driver, curr_page=curr_page, curr_option=curr_option)
        time.sleep(3)
    time.sleep(3)
    click_next_page(driver=driver, curr_page=curr_page)
    time.sleep(3)

####################################################################################

Processing : 1 1
No more elements found at index 18
Processing : 1 2
No more elements found at index 13
Processing : 1 3
No more elements found at index 5
Processing : 2 1
No more elements found at index 20
Processing : 2 2
No more elements found at index 11
Processing : 2 3
No more elements found at index 7
Processing : 2 4
No more elements found at index 8
Processing : 2 5
No more elements found at index 5
Processing : 2 6
No more elements found at index 14
Processing : 3 1
No more elements found at index 9
Processing : 3 2
No more elements found at index 21
Processing : 3 3
Succesful Click
Succesful Click
Succesful Click
Succesful Click
Succesful Click
No more elements found at index 59
Processing : 3 4
No more elements found at index 11
Processing : 3 5
No more elements found at index 4
Processing : 4 1
No more elements found at index 13
Processing : 4 2
No more elements found at index 17
Processing : 5 1
Succesful Click
Succesful Click
No more elements found at index 30
Processing

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="SecondList"]/div[7]/div/div/ul/li[37]/a"}
  (Session info: chrome=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF618F322C2+60002]
	(No symbol) [0x00007FF618EACA59]
	(No symbol) [0x00007FF618D67EDA]
	(No symbol) [0x00007FF618DB76E6]
	(No symbol) [0x00007FF618DB77AC]
	(No symbol) [0x00007FF618DFE9D7]
	(No symbol) [0x00007FF618DDC2CF]
	(No symbol) [0x00007FF618DFBCC7]
	(No symbol) [0x00007FF618DDC033]
	(No symbol) [0x00007FF618DA9657]
	(No symbol) [0x00007FF618DAA251]
	GetHandleVerifier [0x00007FF619243E2D+3278285]
	GetHandleVerifier [0x00007FF619290190+3590448]
	GetHandleVerifier [0x00007FF6192861D0+3549552]
	GetHandleVerifier [0x00007FF618FE1DE6+779654]
	(No symbol) [0x00007FF618EB7ACF]
	(No symbol) [0x00007FF618EB2EE4]
	(No symbol) [0x00007FF618EB3072]
	(No symbol) [0x00007FF618EA2C4F]
	BaseThreadInitThunk [0x00007FFD910D257D+29]
	RtlUserThreadStart [0x00007FFD925EAF28+40]


In [11]:
################# CONTINUE IF TIMEOUT ERROR ####################

# Set up the WebDriver
PATH = "C:/Users/poops/Desktop/Coding Shenanigans/PokemonLegends Bot/chromedriver-win64/chromedriver-win64/chromedriver.exe"

cService = webdriver.ChromeService(executable_path=PATH)
options = Options()
options.page_load_strategy = 'normal'  # Waits for the full page load
options.add_argument("--blink-settings=imagesEnabled=false")  # Disable images for faster loading

driver = webdriver.Chrome(service=cService, options=options)
driver.set_page_load_timeout(60)  # Set page load timeout
driver.set_script_timeout(60)    # Set script timeout

# URL of the website with the login form
page_urls = [
    "https://www.jio.com/help/faq/mobile/jio-true-5g/about-5g#/",

]

driver.get(page_urls[0])

time.sleep(5)
click_next_page(driver=driver, curr_page=current_page-1)
for curr_page in range(current_page-1,10):
    time.sleep(1)
    curr_page += 1
    click_next_option(driver=driver, curr_page=curr_page, curr_option=current_option-1)
    if curr_page == current_page:
        start = current_option-1
        end = dropdown_list[curr_page-1]+1
    else:
        start = 1
        end = dropdown_list[curr_page-1]+1
    
    for curr_option in range(start, end):
        time.sleep(2)
        print("Processing :", curr_page, curr_option)
        view_more = 1
        while view_more == 1:
            view_more = click_view_more(driver=driver)
        extracted_texts = extract_text_from_page(driver=driver)
        with open(file_path, 'a', encoding='utf-8') as data:
            for text in extracted_texts:
                #json_string = json.dumps(text, indent=4, ensure_ascii=False) + ',\n'
                correct_text = preprocess_text_for_file(text)
                data.write(correct_text)
        current_page = curr_page
        current_option = curr_option
        if curr_option == dropdown_list[curr_page-1]:
            break
        time.sleep(3)
        click_next_option(driver=driver, curr_page=curr_page, curr_option=curr_option)
        time.sleep(3)
    time.sleep(3)
    click_next_page(driver=driver, curr_page=curr_page)
    time.sleep(3)

##############################################################

Processing : 7 29
No more elements found at index 9
Processing : 7 30
No more elements found at index 9
Processing : 7 31
No more elements found at index 9
Processing : 7 32
No more elements found at index 9
Processing : 7 33
No more elements found at index 1


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="SecondList"]/div[7]/div/div/ul/li[34]/a"}
  (Session info: chrome=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF618F322C2+60002]
	(No symbol) [0x00007FF618EACA59]
	(No symbol) [0x00007FF618D67EDA]
	(No symbol) [0x00007FF618DB76E6]
	(No symbol) [0x00007FF618DB77AC]
	(No symbol) [0x00007FF618DFE9D7]
	(No symbol) [0x00007FF618DDC2CF]
	(No symbol) [0x00007FF618DFBCC7]
	(No symbol) [0x00007FF618DDC033]
	(No symbol) [0x00007FF618DA9657]
	(No symbol) [0x00007FF618DAA251]
	GetHandleVerifier [0x00007FF619243E2D+3278285]
	GetHandleVerifier [0x00007FF619290190+3590448]
	GetHandleVerifier [0x00007FF6192861D0+3549552]
	GetHandleVerifier [0x00007FF618FE1DE6+779654]
	(No symbol) [0x00007FF618EB7ACF]
	(No symbol) [0x00007FF618EB2EE4]
	(No symbol) [0x00007FF618EB3072]
	(No symbol) [0x00007FF618EA2C4F]
	BaseThreadInitThunk [0x00007FFD910D257D+29]
	RtlUserThreadStart [0x00007FFD925EAF28+40]
