In [3]:
# Load the Turkish and English IMDb data (the two frames are merged in the next cell)
import pandas as pd

# Read the two IMDb lookup CSVs (paths are relative to the working directory).
turkish_imdb, english_imdb = map(
    pd.read_csv,
    ('turkish_drama_data_with_imdb.csv', 'turkish_drama_data_with_imdb_eng.csv'),
)

In [8]:
# Left-join onto the Turkish frame, keyed on 'URL', so every Turkish row is kept
# and gains the 'IMDb URL from English' / 'IMDb ID from English' columns
# (left empty where the English frame has no matching URL).
merged = turkish_imdb.merge(
    english_imdb.loc[:, ['URL', 'IMDb URL from English', 'IMDb ID from English']],
    on='URL',
    how='left',
)
merged.head()

Unnamed: 0,URL,Original Title,English Title,Genre,Episodes,Broadcast Network,Broadcast Start Date,Broadcast End Date,Production Company,Director,Screen Writer,Synopsis,IMDb ID from Turkish,IMDb URL from Turkish,IMDb ID From Turkish,IMDb URL from English,IMDb ID from English
0,https://www.turkishdrama.com/the-tailor-terzi-...,Terzi,The Tailor,"Drama, Romance",23.0,Netflix,2-May-23,3-Nov-23,OGM Pictures,Cem Karci,,Peyami Dokumaci (Cagatay Ulusoy) is a young an...,tt13317230,https://www.imdb.com/title/tt13317230/,tt13317230,https://www.imdb.com/title/tt15473010/,tt15473010
1,https://www.turkishdrama.com/sapphire-safir-tv...,Safir,Sapphire,"Romance, Drama",17.0,Atv,4-Sep-23,Present,NTC Medya,Semih Bagci,,"Gulsoy family is a well-known, wealthy family ...",tt0821803,https://www.imdb.com/title/tt0821803/,tt0821803,https://www.imdb.com/title/tt17048670/,tt17048670
2,https://www.turkishdrama.com/omer-tv-series.html,Ömer,Omer,"Drama, Family",34.0,Star TV,9-Jan-23,present,OGM Pictures,Cem Karci,"Gulizar Irmak, Deniz Madanoglu, Sedef Bayburtl...",Omer (Selahattin Pasali) is a young guy in his...,tt22719788,https://www.imdb.com/title/tt22719788/,tt22719788,Not Found,
3,https://www.turkishdrama.com/forevermore-verme...,Vermem Seni Ellere,Forevermore,"Romance, Drama",9.0,Atv,18-Jun-23,13-Aug-23,AKN Film,Ali Balci,"Sehrazat Tunus Tasci, Damla Gucer, Samed Aslan...",Mehmet (Emre Bey) is a young guy who comes fro...,tt27739128,https://www.imdb.com/title/tt27739128/,tt27739128,https://www.imdb.com/title/tt4183480/,tt4183480
4,https://www.turkishdrama.com/queen-kralice-tv-...,Kralice,Queen,"Drama, Romance",11.0,Kanal D,6-Apr-23,7-Jun-23,Mednova,"Cevdet Mercan, Serhan Sahin","Serdar Soydan, Kerem Bozok, Ekin Akcay, Nil Gu...",Deniz (Burcu Ozberk) and Ates (Gokhan Alkan) a...,,,,https://www.imdb.com/title/tt11393148/,tt11393148


In [None]:
import requests
from bs4 import BeautifulSoup

def is_country_of_origin_turkey(imdb_url, timeout=10):
    """
    Return the IMDb title URL if its page lists Turkey as a country of origin.

    Parameters:
        imdb_url: Candidate IMDb title URL. Any value that is not a string
            starting with 'https://www.imdb.com/title/' (e.g. NaN or
            'Not Found') is rejected without making a network request.
        timeout (float): Seconds to wait for IMDb before giving up on this URL.

    Returns:
        str or None: `imdb_url` unchanged when Turkey is among the countries in
        the page's country-of-origin section; None when the value is not a
        valid title URL, the request fails, the section is missing, or Turkey
        is not listed.
    """
    # Reject anything that is not an IMDb title URL before touching the network.
    if not isinstance(imdb_url, str) or not imdb_url.startswith('https://www.imdb.com/title/'):
        return None

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps one stalled connection from hanging the whole .apply().
        response = requests.get(imdb_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Report and skip this URL rather than aborting the scan of every row.
        print(f"An error occurred while requesting {imdb_url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the section containing country information
    country_section = soup.find('li', attrs={'data-testid': 'title-details-origin'})
    if not country_section:
        return None

    country_list = [link.get_text(strip=True) for link in country_section.find_all('a')]

    # Accept the official 'Türkiye' spelling too, in case the page uses it.
    turkey_names = {'Turkey', 'Türkiye'}
    if turkey_names.intersection(country_list):
        return imdb_url
    return None

# Check every IMDb URL (English lookup first, then Turkish) and store the result
# in a matching '... cleaned' column: the URL when Turkey is a country of origin,
# otherwise None.
for lookup_language in ('English', 'Turkish'):
    merged[f'IMDb URL from {lookup_language} cleaned'] = (
        merged[f'IMDb URL from {lookup_language}'].apply(is_country_of_origin_turkey)
    )




In [None]:
# After cleaning (keeping only the dramas confirmed as Turkish), explore the release countries and release dates of each drama.

def get_countries_and_release_dates(imdb_url, driver=None):
    """
    Scrapes country and release date information from an IMDb release info page.

    Parameters:
        imdb_url (str): The IMDb release info URL.
        driver (webdriver.Chrome): An existing Chrome WebDriver to reuse. When
            None, a headless Chrome driver is created for this call and quit
            before returning.

    Returns:
        dict: {'URL': imdb_url} plus, for the n-th <li> element whose id starts
        with 'rel_', the keys 'id{n}', 'country{n}' and 'date{n}'. If
        processing the page fails, the error is printed and the data collected
        so far is returned.
    """
    import time

    # Imported locally so the function works on a fresh kernel without relying
    # on another cell having imported these Selenium names.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # If no driver is provided, initialize it
    driver_created = False
    if driver is None:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options

        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("user-agent=Mozilla/5.0")

        # NOTE: placeholder path; it must point at a real chromedriver binary.
        service = Service('path/to/chromedriver')  # Update this path
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver_created = True

    data = {'URL': imdb_url}

    try:
        driver.get(imdb_url)
        wait = WebDriverWait(driver, 20)

        # Handle the overlay if present
        overlay_locator = (By.CSS_SELECTOR, '.sc-kDvujY.kUNdqF')
        try:
            wait.until(EC.invisibility_of_element_located(overlay_locator))
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still interrupts.
            # Remove overlay via JavaScript if still present
            driver.execute_script("""
                var overlay = document.querySelector('.sc-kDvujY.kUNdqF');
                if (overlay) {
                    overlay.parentNode.removeChild(overlay);
                }
            """)

        # Locate and click the "See More" button if present
        try:
            button_locator = (By.CLASS_NAME, 'ipc-see-more__button')
            button = wait.until(EC.element_to_be_clickable(button_locator))
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
            time.sleep(1)

            if button.is_displayed() and button.is_enabled():
                try:
                    button.click()
                except Exception:
                    # Fall back to a JavaScript click when the native click fails.
                    driver.execute_script("arguments[0].click();", button)
            else:
                print("Button is not visible or enabled after scrolling.")
        except Exception:
            pass  # No "See More" button found, proceed without clicking

        # Wait for the new content to load
        time.sleep(2)  # Adjust based on actual loading time

        # Locate all <li> elements with IDs starting with 'rel_'
        li_elements = driver.find_elements(By.XPATH, "//li[starts-with(@id, 'rel_')]")

        # Extract data from each <li> element
        for index, li in enumerate(li_elements, start=1):
            try:
                # Extract the ID
                li_id = li.get_attribute('id')

                # Extract the <a> element
                link = li.find_element(By.TAG_NAME, 'a')

                # Extract the country from the 'aria-label' attribute
                country = link.get_attribute('aria-label')

                # Extract the date from the specified <span> element
                date_element = li.find_element(By.CSS_SELECTOR, 'span.ipc-metadata-list-item__list-content-item')
                date_text = date_element.text

                # Add the data to the dictionary
                data[f'id{index}'] = li_id
                data[f'country{index}'] = country
                data[f'date{index}'] = date_text
            except Exception as e:
                print(f"Error processing li element: {e}")
                continue  # Skip this li element and continue with the next one

    except Exception as e:
        print(f"An error occurred while processing {imdb_url}: {e}")

    finally:
        # Only quit the driver if it was created in this function
        if driver_created:
            driver.quit()

    return data
