In [1]:
import re
import os
import csv
import json
import time
import httpx
import asyncio
import requests
from urllib import parse
from bs4 import BeautifulSoup
from library.output_data import exportData

In [2]:
# getting all user input
# reading files from local directory
# file_path: threads_data/json/interactive_trading.json,
# file_path: threads_data/json/commercial_content.json,
# file_path: threads_data/json/trading_journals.json,
# file_path: threads_data/json/platform_tech.json,
# file_path: threads_data/json/trading_systems.json,
# file_path: threads_data/json/rookie_talk.json,
# file_path: threads_data/json/broker_discussion.json,
# file_path: threads_data/json/trading_discussion.json


file_dir = input('Enter file directory: ')
file_name = input('Enter the file_name: ')

In [3]:
# header list
header_list = [
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/114.0'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/100.0.0.0'},
    {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
]

active_header = header_list[0]

In [4]:
# get url crawler fast function
async def get_url(thread_url, header):
    res = httpx.get(thread_url, headers=header)
    parsed_url = parse.urlparse(thread_url)

    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'lxml')
        page_num_tag = soup.select_one('div.forumdisplay__footer li.visible-mv a.last')
        page_num = page_num_tag.string
        # url_path = page_num_tag['href'] # url path is not needed
        query_list = [f"page={page}" for page in range(1, int(page_num) + 1)]
        url_list = []
        # generating all the urls

        for custom_query in query_list:
            post_url = parsed_url._replace(query=custom_query).geturl()
            # print(post_url)
            url_list.append(post_url)

        return url_list
    
    else:
        print(f"Bad Response: {res.status_code}")
        print(f"Got error from page {res.url}")

In [5]:
start_time = time.perf_counter()
new_html_list = asyncio.run(get_url("https://www.forexfactory.com/thread/1228091-from-learning-through-losses-to-achievement", active_header)) # running the crawler
stop_time = time.perf_counter()

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# get html function
async def get_html(client, url, header):
        res = await client.get(url, headers=header)
        
        if res.status_code == 200:
            return {"result": res.status_code, "url": res.url, "html_string": res.text}
        else:
            return {"result": res.status_code, "url": res.url, "html_string": None}

In [None]:
# running the main spider in concurrent mode
async def fast_spider(header, thread_url):
    async with httpx.AsyncClient(timeout=None) as client:
        tasks = []

        # generating the url_list
        url_list = await get_url(thread_url, header)

        for url in url_list:
            tasks.append(asyncio.ensure_future(get_html(client, url, header)))

        # printing the number of requests
        print(f"total number of request send: {len(url_list)}")

        # waiting to collect all the response
        async_response = await asyncio.gather(*tasks)

        # getting the list data
        html_list = []
        error_response_list = []

        for response in async_response:
            if response['result'] == 200:
                html_list.append(response['html_string'])
            else:
                error_response_list.append(response)

        
        print(f"total html_string collected: {len(html_list)}")
        return html_list

In [None]:
# getting the complete url list
with open(file_dir, mode='r') as json_file:
    json_data = json_file.read()
    url_list = json.loads(json_data)
    print(f"total number of threads: {len(url_list)}")

In [None]:
start_time = time.perf_counter()
new_html_list = asyncio.run(fast_spider(active_header, "https://www.forexfactory.com/thread/1156787-price-action-intradayscalps-only")) # running the crawler
stop_time = time.perf_counter()

print(f"total time took: {stop_time - start_time}")
print(f"post found: {len(new_html_list)}")

In [None]:
# old html_list
main_html_list = []


for url in url_list:
    start_time = time.perf_counter()
    new_html_list = asyncio.run(fast_spider(active_header, url)) # running the crawler
    stop_time = time.perf_counter()

    # collection all html and taking them in single list
    main_html_list.extend(new_html_list)

    print(f"total time took: {stop_time - start_time}")
    print(f"post found: {len(new_html_list)}")

### solving the error

In [2]:
def get_data(html, data_list):
    soup = BeautifulSoup(html, 'lxml')
    page_content = soup.select_one('section.content')

    # setting the base url
    base_url = "https://www.forexfactory.com"

    category = page_content.select_one('div.head span[itemprop="name"]').string
    topic_name = page_content.select_one('div.showthread__title h1').get_text(strip=True)

    posts = page_content.select('div#posts div.showthread')

    for post in posts:

        # fixing the time scrape selector
        date_tag = soup.select_one('div.threadpost-header__controls li.threadpost-header__controllink--nolink')
        # print(date_tag)
        date_text = date_tag.get_text(strip=True)
        pattern = r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\,\s20\d{2}"

        match = re.search(pattern, date_text)

        if match:
            date_string = match.group()
        else:
            print('date find error!')
            date_string = None

        has_files = post.select_one('div.threadpost-content div.threadpost-content__attachments')
        
        if has_files:
            post_details_tag = post.select_one('div.threadpost-header div.threadpost-header__controls a[title="Post Permalink"]')
            post_url = parse.urljoin(base_url, post_details_tag['href'])
            post_count = post_details_tag['data-postnum']
            
            try:
                info_text = has_files.select_one('span.info').get_text(strip=True)
                download_text = re.search(r"\b\|[0-9\,]+\sdownloads\b", info_text).group()
                downloads = download_text.replace('|', '').replace('downloads', '').replace(',', '').strip()
            except:
                download_text = "Not Provided"
                downloads = "Not Provided"
            
            attach_files = has_files.select('div.attachinfo')

            for file in attach_files:
                file_url = file.a['href']
                file_name = file.a.string

                data_dict = {
                    "file_name": file_name,
                    "download_url": file_url,
                    "downloads": downloads,
                    "date": date_string,
                    "topic_name": topic_name,
                    "category": category,
                    "post_url": post_url,
                    "post#": post_count
                }

                data_list.append(data_dict)
                # print(data_dict)     
        else:
            continue

In [3]:
url = "https://www.forexfactory.com/thread/post/8981136"

In [4]:
res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/114.0'})

if res.status_code == 200:
    print("yes")
    html = res.text

yes


In [5]:
file_data_list = []

get_data(html, file_data_list)

In [None]:
len(file_data_list)