# Importing libraries

In [None]:
import requests
import time
import re
import numpy as np
import pandas as pd
import espncricinfo as ci
import ipywidgets as widgets
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from IPython.display import display, HTML

# Setting configurations

In [None]:
# Lift pandas' display limits so DataFrames are shown in full
# (None removes the cap on rows and on columns).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Directory for commentary extracts. abspath() resolves the relative path
# against the current working directory and normalises the '..' segments
# and the trailing slash.
comm_filepath = os.path.abspath("../../../data/CommExtract/")

# Initializing parameters

In [None]:
# Archive year; appended to the Cricbuzz scorecard-archive URL.
year = "2023"

# Series slug; matched against series links and used in the series URL.
series = "indian-premier-league-2023"

# Match id (kept as a string because it is compared with, and concatenated
# into, URL path segments).
matchid = "66173"

# Number of seconds; presumably how long the commentary scroll loop waits
# for new content before giving up - confirm against that loop.
scroll_pause_time = 7

# Web Scraping - Cricbuzz

### Finding all the series from the yearly calendar

In [None]:
# Look up the Cricbuzz series id for `series` on the archive page for `year`.
base_url = 'https://www.cricbuzz.com/cricket-scorecard-archives/'
url = base_url + year

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
bs = BeautifulSoup(page.content, 'lxml')

# Candidate series links: every anchor whose href contains 'matches'.
links = bs.find_all('a', href=lambda href: href and 'matches' in href)

# One entry per link, holding path segments 2 and 3 of the href; for a series
# link these are assumed to be [series_id, series_slug].
# (Replacing '-' with ' ' in the slug would make it more readable, but the
# slug is compared verbatim with `series` below, so it is left as-is.)
series_list = []
for link in links:
    href = link['href']
    columns = href.split('/')[2:4]
    series_list.append(columns)

# Entries with fewer than two segments are skipped so a short href cannot
# raise IndexError during the lookup.
series_id = next(
    (row[0] for row in series_list if len(row) > 1 and row[1] == series),
    None,
)
if series_id is None:
    # Fail with an explicit message rather than a TypeError from the string
    # concatenation in the final print.
    raise ValueError(f'Series {series!r} was not found in the {year} archive at {url}')

print(str(len(series_list)) + ' series found in the year ' + year)
print('Current series selected is :- ' + series)
print('Series ID : ' + series_id)
#series

### Finding all the matches from the given series

In [None]:
# List every match of the selected series and resolve the slug for `matchid`.
base_url = 'https://www.cricbuzz.com/cricket-series/'
url = base_url + series_id + '/' + series + '/matches'

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
bs = BeautifulSoup(page.content, 'lxml')

# Only links inside the first div.page container are considered.
page_div = bs.select_one('div.page')
if page_div is None:
    raise ValueError(f'No div.page container found at {url}')

# Keep scorecard links, plus live-score links that belong to this series.
links = page_div.find_all('a', href=lambda href: href and (
    '/cricket-scores/' in href or (series in href and '/live-cricket-scores/' in href)))

# Path segments 2 and 3 of each href are taken as (match_id, match_slug).
# A set removes duplicate links; hrefs that do not yield both segments are
# dropped so the int() sort key and the lookup below cannot fail on them.
matches = {tuple(link['href'].split('/')[2:4]) for link in links}
matches = sorted(
    (row for row in matches if len(row) == 2),
    key=lambda x: int(x[0]),
)

match = next((row[1] for row in matches if row[0] == matchid), None)
if match is None:
    # Fail with an explicit message rather than a TypeError from the string
    # concatenation in the prints below.
    raise ValueError(f'Match id {matchid!r} was not found in series {series!r}')

print(str(len(matches)) + ' matches found in the series - ' + series.replace('-', ' ') + ' for the year ' + year)
print('Current match selected is :- ' + match)
print('Match ID : ' + matchid)

# Bare expression: the notebook displays the match list as the cell output.
matches

### Get full commentary using a dynamic scroll with a timer

In [None]:
# Load the match page in headless Chrome, click "Load More Commentary", then
# keep scrolling to the bottom until the page height stops growing for
# `scroll_pause_time` seconds. The final DOM is parsed into `bs`.
base_url = 'https://www.cricbuzz.com/cricket-scores/'
url = base_url + matchid + '/' + match

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)   # Set up the driver
try:
    driver.get(url)

    wait = WebDriverWait(driver, 10)   # Explicit waits time out after 10 seconds

    # Find the "load more" control by its visible text
    element_text = "Load More Commentary"
    element_xpath = f"//*[contains(text(), '{element_text}')]"
    element = wait.until(EC.visibility_of_element_located((By.XPATH, element_xpath)))

    driver.execute_script("arguments[0].scrollIntoView();", element)   # Scroll to the element

    driver.execute_script("arguments[0].click();", element)   # Perform JavaScript click

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    print("last_height - " + str(last_height))

    # Time of the most recent height change; the loop ends once this is more
    # than `scroll_pause_time` seconds in the past.
    start_time = time.time()

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")   # Scroll down to bottom
        time.sleep(0.3)   # Wait to load page

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        print("new_height - " + str(new_height))

        if new_height != last_height:
            start_time = time.time()
            print("New content found. Resetting start_time...")

        elapsed_time = time.time() - start_time
        # Uses the module-level `scroll_pause_time`; the previous spelling
        # SCROLL_PAUSE_TIME was never defined and raised NameError here.
        if elapsed_time > scroll_pause_time:
            print("No new content found. Breaking now...")
            break

        last_height = new_height

    # Get the updated page source
    updated_page_source = driver.page_source
finally:
    # Always shut the browser down, even when a wait times out or a script
    # call fails, so no headless Chrome process is left running.
    driver.quit()

# Parse the fully expanded page for the extraction step
bs = BeautifulSoup(updated_page_source, 'lxml')

# Extract clean text parts

In [None]:
# Text nodes starting with any of these prefixes are template marker strings
# rather than commentary, and are dropped.
_MARKER_PREFIXES = (
    'ngIf:',
    'end ngIf:',
    'ngRepeat:',
    'ngInclude:',
    'end ngRepeat',
)

# Commentary lives in <div> elements whose ng-include attribute is 'commentary'.
div_elements = bs.find_all('div', {'ng-include': "'commentary'"})

# Collect every non-empty, whitespace-stripped text node that is not a marker.
text_parts = []
for div in div_elements:
    for node in div.find_all(text=True):
        stripped = node.strip()
        if stripped and not stripped.startswith(_MARKER_PREFIXES):
            text_parts.append(stripped)

#print(text_parts)

# Save the file

In [None]:
# Write the extracted commentary to <comm_filepath>/comm_<matchid>.txt,
# one text part per line.

# os.path.join picks the right separator for the current OS; the previous
# hard-coded '\\' only produced a correct path on Windows.
path = os.path.join(comm_filepath, 'comm_' + matchid + '.txt')

# Create the target directory if it is missing so open() cannot fail on it.
os.makedirs(comm_filepath, exist_ok=True)

# Mode 'w' truncates an existing file, so no separate delete step is needed.
# UTF-8 is set explicitly so non-ASCII commentary text does not depend on the
# platform's default encoding.
with open(path, 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in text_parts)