# Scraping Work

## Converting JSON to CSV

In [None]:
import pandas as pd

# Destination for the extracted names
csv_file = 'names.csv'

# Parse the scraped JSON file into a DataFrame
with open('celeb_heights.com.json', encoding='utf-8') as json_handle:
    data = pd.read_json(json_handle)

# Keep only the "name" column, as a plain Python list
names = data['name'].tolist()

# Rebuild a single-column frame headed "Name" and write it out without the index
data = pd.Series(names, name='Name').to_frame()
data.to_csv(csv_file, index=False)

print('Names saved to', csv_file)


## Scraping From Wikipedia Lists

In [3]:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_American_television_actresses"

# Bound the request time and stop on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Select every element carrying the "div-col" class; each one is written to its own CSV.
div = soup.select(".div-col")
filename = None  # stays None when the page yields no matching section
for idx, d in enumerate(div):
    # Take the text of the first link in each list item.
    # Items without any link are skipped (find() returns None for them).
    names = []
    for item in d.find_all("li"):
        link = item.find("a")
        if link is not None:
            names.append(link.get_text(strip=True))

    # NOTE(review): the "indianactress" prefix does not match the American
    # actresses page requested above — confirm the intended file naming.
    filename = f"indianactress{idx}.csv"

    # One header row followed by one name per row.
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Name"])
        writer.writerows([name] for name in names)

# Report the last file written, or say so when nothing matched.
if filename is None:
    print("No '.div-col' sections found; nothing was saved.")
else:
    print(f"Names extracted successfully and saved to '{filename}'.")


Names extracted successfully and saved to 'indianactress23.csv'.


# MechanicalSoup

## Scraping using IMDB

In [None]:
import mechanicalsoup,random
import json
# Request headers: one user-agent string is chosen at random each time the cell runs.
headers = {
    "user-agent": random.choice([
        "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/8.0 Mobile/11A465 Safari/9537.53",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 15_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/98.0.300355889 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/270.0.542728515 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 it.fanpage.rn.ios/3.10.9",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MobileIron/2.18.1 Version/16.5.1 Safari/605.1.15",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MobileIron/2.18.1 Version/16.1.2 Safari/605.1.15",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/81.0.4044.113 Mobile/16G201 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_5_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Mobile/15E148 Safari/604.1 RDDocuments/7.5.4.747",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.163 Mobile/15E148 Safari/604.1",
    ]),
    "Accept-Language": "en-US,en;q=0.5",
}

# Open the biography page and grab its parsed document.
browser = mechanicalsoup.StatefulBrowser()
url = 'https://www.imdb.com/name/nm0000002/bio/?ref_=nm_ov_bio_sm'
browser.open(url, headers=headers)
soup = browser.get_current_page()

# The name is read from the <h2 data-testid="subtitle"> element.
h2_tag = soup.select_one('h2[data-testid="subtitle"]')
if h2_tag:
    subtitle = h2_tag.text
    print("Name:", subtitle)
else:
    print("Subtitle not found.")

# The height is read from the inner content div of <li id="height">.
# Either element may be absent, so both lookups are checked before use.
height_li = soup.find('li', id='height')
height_div = None
if height_li is not None:
    height_div = height_li.find('div', class_='ipc-html-content-inner-div')

if height_div is not None:
    height_text = height_div.get_text(strip=True)
    print("Height:", height_text)
else:
    print("Height not found.")

# Selenium

## Scraping thefamouspeople.com

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Listing page to scrape
url = "https://www.thefamouspeople.com/actress.php"

# Start a Chrome session (the matching browser driver must be installed) and open the page
driver = webdriver.Chrome()
driver.get(url)

# Scroll to the bottom of the page
def scroll_to_bottom():
    """Scroll the browser window down to the full height of the current document."""
    scroll_script = "window.scrollTo(0, document.documentElement.scrollHeight);"
    driver.execute_script(scroll_script)

# Find and click the element with class "loading-bar"
def click_loading_bar():
    """Find the first ``div.loading-bar`` element on the page and click it."""
    driver.find_element(By.CSS_SELECTOR, "div.loading-bar").click()

# Wait for the page to load
def wait_for_page_to_load():
    """Wait up to 15 seconds for a ``div.ptitle-internal a.tileLink`` element to be present."""
    name_link_locator = (By.CSS_SELECTOR, "div.ptitle-internal a.tileLink")
    link_is_present = EC.presence_of_element_located(name_link_locator)
    WebDriverWait(driver, 15).until(link_is_present)

# Load the listing page by page and collect every distinct name.
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    TimeoutException,
)

count = 0
actress_names = []   # distinct names, in the order they were first seen
seen_names = set()   # same names, kept for O(1) duplicate checks

try:
    more_to_load = True
    while more_to_load:
        # Each pass scrolls down, clicks the loading bar and waits for links;
        # the first pass doubles as the initial load.
        try:
            scroll_to_bottom()
            click_loading_bar()
            wait_for_page_to_load()
        except (NoSuchElementException, ElementNotInteractableException, TimeoutException):
            # The loading bar is missing or unusable, or no link appeared in time:
            # harvest what is already on the page one last time, then stop.
            more_to_load = False

        # The selector matches every link currently on the page, so earlier
        # names are seen again on each pass and filtered out below.
        actor_elements = driver.find_elements(By.CSS_SELECTOR, "div.ptitle-internal a.tileLink")
        for actor_element in actor_elements:
            name = actor_element.text.strip()
            if name not in seen_names:
                seen_names.add(name)
                actress_names.append(name)
                count += 1
                print(f"{count} elements are sucessfully pushed to the fresh array")
            print(name)

        # Nothing on the page at all: there is nothing to paginate.
        if len(actor_elements) == 0:
            break
finally:
    # Always release the browser, including when an unexpected error escapes.
    driver.quit()


The version of chrome cannot be detected. Trying with latest driver version


WebDriverException: Message: unknown error: cannot find Chrome binary
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00C3A813+48355]
	(No symbol) [0x00BCC4B1]
	(No symbol) [0x00AD5358]
	(No symbol) [0x00AF1A9E]
	(No symbol) [0x00AF0579]
	(No symbol) [0x00B20C55]
	(No symbol) [0x00B2093C]
	(No symbol) [0x00B1A536]
	(No symbol) [0x00AF82DC]
	(No symbol) [0x00AF93DD]
	GetHandleVerifier [0x00E9AABD+2539405]
	GetHandleVerifier [0x00EDA78F+2800735]
	GetHandleVerifier [0x00ED456C+2775612]
	GetHandleVerifier [0x00CC51E0+616112]
	(No symbol) [0x00BD5F8C]
	(No symbol) [0x00BD2328]
	(No symbol) [0x00BD240B]
	(No symbol) [0x00BC4FF7]
	BaseThreadInitThunk [0x764B7BA9+25]
	RtlInitializeExceptionChain [0x776BBD2B+107]
	RtlClearBits [0x776BBCAF+191]


In [2]:
import pandas as pd

# Load the prepared dataset and pull the "Name" column out as a Python list.
df = pd.read_csv("./formatting/final_data_ready.csv")
names = df.Name.values
l_names = list(names)

# Everything from position 1000 onward; `test` is the input of the threaded lookup cell.
test = l_names[1000:]

# Show a short summary instead of echoing nothing.
print(f"{len(l_names)} names loaded, {len(test)} selected from index 1000 onward")

FileNotFoundError: [Errno 2] No such file or directory: './formatting/final_data_ready.csv'

In [20]:
import random, requests, json

# Request headers: one user-agent string is chosen at random each time the cell runs.
headers = {
    "user-agent": random.choice([
        "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/8.0 Mobile/11A465 Safari/9537.53",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 15_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/98.0.300355889 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/270.0.542728515 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 it.fanpage.rn.ios/3.10.9",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MobileIron/2.18.1 Version/16.5.1 Safari/605.1.15",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MobileIron/2.18.1 Version/16.1.2 Safari/605.1.15",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/81.0.4044.113 Mobile/16G201 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_5_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Mobile/15E148 Safari/604.1 RDDocuments/7.5.4.747",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.163 Mobile/15E148 Safari/604.1",
    ]),
    "Accept-Language": "en-US,en;q=0.5",
}

# Try the autocomplete endpoint on the first name only.
for c_name in l_names[:1]:
    # Passing the term via `params` lets requests percent-encode spaces, '&', '#', etc.
    web_res = requests.get(
        "https://www.famousbirthdays.com/api/autocomplete",
        params={"term": c_name},
        headers=headers,
        timeout=10,
    )
    web_res.raise_for_status()
    response_data = web_res.json()

    # An unknown name can come back with no suggestions; report it instead of failing on [0].
    suggestions = response_data.get("suggestions") or []
    if not suggestions:
        print("No suggestions returned for", c_name)
        continue

    print(suggestions[0]['data'])
    print(suggestions[0]['value'])


2 Chainz (rapper)
2 Chainz (rapper)


In [24]:
import json
import requests
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import Timeout

# Shared accumulator: fetch_data() appends one {"data", "value"} dict per
# successful lookup, and the list is dumped to categorical.json at the end of the cell.
responses = []

def fetch_data(name):
    """Query the famousbirthdays autocomplete API for ``name`` and record the top hit.

    On success, a dict with the first suggestion's ``data`` and ``value`` fields
    is appended to the module-level ``responses`` list and printed. Failures
    (timeout, other request errors, non-OK status, unparsable or empty results)
    are reported with ``print`` and otherwise ignored, so one bad name does not
    abort the worker.

    Args:
        name: Search term sent as the ``term`` query parameter.

    Returns:
        None.
    """
    try:
        # `params` lets requests percent-encode the term (spaces, '&', '#', ...).
        response = requests.get(
            'https://www.famousbirthdays.com/api/autocomplete',
            params={'term': name},
            timeout=10,
        )
    except Timeout:
        print("Connection timeout occurred for", name)
        return
    except requests.RequestException as error:
        # Connection resets, DNS failures, etc. would otherwise vanish inside the pool.
        print("Request failed for", name, "->", error)
        return

    if response.status_code != requests.codes.ok:
        print("Error:", response.status_code, response.text)
        return

    try:
        response_data = response.json()
    except ValueError:
        print("Invalid JSON returned for", name)
        return

    # Guard against a missing or empty suggestion list before indexing [0].
    suggestions = response_data.get("suggestions") if isinstance(response_data, dict) else None
    if not suggestions:
        print("No suggestions found for", name)
        return

    newDict = {
        "data": suggestions[0]['data'],
        "value": suggestions[0]['value'],
    }
    responses.append(newDict)
    print(newDict)

# Run the lookups on a pool of up to 32 worker threads.
with ThreadPoolExecutor(max_workers=32) as executor:
    pending = [(name, executor.submit(fetch_data, name)) for name in test]

# Leaving the `with` block waits for every task. Report any exception a worker
# raised; without this check such errors are silently discarded.
for name, future in pending:
    error = future.exception()
    if error is not None:
        print("Lookup failed for", name, "->", repr(error))

# Save the responses to a JSON file
with open('categorical.json', 'w', encoding='utf-8') as file:
    json.dump(responses, file)


{'data': 'Angus Young (guitarist)', 'value': 'Angus Young (guitarist)'}
{'data': 'Ani DiFranco (folk singer)', 'value': 'Ani DiFranco (folk singer)'}
{'data': 'Angus Sampson (tv actor)', 'value': 'Angus Sampson (tv actor)'}
{'data': 'Anil Kapoor (tv actor)', 'value': 'Anil Kapoor (tv actor)'}
{'data': 'Angus T. Jones (tv actor)', 'value': 'Angus T. Jones (tv actor)'}
{'data': 'Anika Noni Rose (movie actress)', 'value': 'Anika Noni Rose (movie actress)'}
{'data': 'Anita Ekberg (movie actress)', 'value': 'Anita Ekberg (movie actress)'}
{'data': 'Anja Rubik (model)', 'value': 'Anja Rubik (model)'}
{'data': 'Anissa Jones (tv actress)', 'value': 'Anissa Jones (tv actress)'}
{'data': 'Anita Rani (journalist)', 'value': 'Anita Rani (journalist)'}
{'data': 'Angus Scrimm (movie actor)', 'value': 'Angus Scrimm (movie actor)'}
{'data': 'Anjana Sukhani (movie actress)', 'value': 'Anjana Sukhani (movie actress)'}
{'data': 'Anitta (pop singer)', 'value': 'Anitta (pop singer)'}
{'data': 'Anita Dobson