In [1]:
import re
import os
import csv
import json
import time
import httpx
import asyncio
import requests
from urllib import parse
from bs4 import BeautifulSoup
from library.output_data import exportData

In [2]:
# Pool of request headers: one dict per desktop browser User-Agent string.
header_list = [
    {'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/114.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/100.0.0.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    )
]

# Header sent with every request in this notebook: the last (Linux / Chrome) entry.
active_header = header_list[-1]

### This is the phase-2 program, which collects data from the post pages.

In [3]:
# function to get the html

async def get_html(client, url, header):
    """Fetch ``url`` with ``client`` and summarise the response as a dict.

    Args:
        client: object exposing an awaitable ``get(url, headers=...)``
            (an ``httpx.AsyncClient`` in this notebook).
        url: address to request.
        header: headers passed through to ``client.get``.

    Returns:
        ``{"result": <status code>, "url": <response url>, "html_string": <body>}``
        where ``html_string`` is the response text for a 200 status and
        ``None`` for any other status.
    """
    response = await client.get(url, headers=header)
    status = response.status_code

    # the body is only read for successful responses
    body = response.text if status == 200 else None

    return {"result": status, "url": response.url, "html_string": body}

In [4]:
# function for getting html from all the post page

async def get_html_list(header, url_list):
    """Fetch every URL in ``url_list`` concurrently and split the outcomes.

    Args:
        header: request headers forwarded to ``get_html`` for every URL.
        url_list: iterable of URLs to request (must support ``len``).

    Returns:
        A tuple ``(html_response_list, error_response_list)``:

        * ``html_response_list`` holds ``{"url", "html_string"}`` dicts for
          responses whose status was 200.
        * ``error_response_list`` holds the full ``get_html`` dict for any
          other status, plus one
          ``{"result": None, "url", "html_string": None, "error"}`` dict for
          each request that raised instead of returning a response.
    """
    html_response_list = []
    error_response_list = []

    async with httpx.AsyncClient(timeout=None) as client:
        # one coroutine per URL; gather() schedules them all concurrently
        pending_requests = [get_html(client, url, header) for url in url_list]

        # printing the number of requests
        print(f"total number of request send: {len(url_list)}")

        # return_exceptions=True keeps one failed request (connection reset,
        # DNS failure, ...) from aborting the batch and discarding every page
        # that was fetched successfully.
        outcomes = await asyncio.gather(*pending_requests, return_exceptions=True)

    # gather() returns results in the order the awaitables were passed, so
    # each outcome can be paired with the URL that produced it.
    for url, outcome in zip(url_list, outcomes):
        if isinstance(outcome, BaseException):
            error_response_list.append({
                "result": None,
                "url": url,
                "html_string": None,
                "error": repr(outcome),
            })
        elif outcome["result"] == 200:
            html_response_list.append({
                "url": outcome["url"],
                "html_string": outcome["html_string"],
            })
        else:
            error_response_list.append(outcome)

    return (html_response_list, error_response_list)

In [5]:
# dir_path = "post_data/"
# dir_iter = os.scandir(dir_path)

# for file in dir_iter:
#     print(file.name)

trading_journals_post_urls.json
interactive_trading_post_urls.json
rookie_talk_post_urls.json
broker_discussion_post_urls.json
trading_system_post_urls.json
commercial_content_post_urls.json
trading_discussion_post_urls.json


In [None]:
# URL files that can be entered below:
#   trading_journals_post_urls.json
#   interactive_trading_post_urls.json
#   rookie_talk_post_urls.json
#   broker_discussion_post_urls.json
#   trading_system_post_urls.json
#   commercial_content_post_urls.json
#   trading_discussion_post_urls.json

# ask the user for the directory and the file name (prompted in that order)
file_dir, file_name = (
    input('Enter file directory: '),
    input('Enter the file_name: '),
)

In [None]:
# getting the complete url list
# Build the path from the directory / file name entered in the input cell.
# A blank answer falls back to the previously hard-coded default, so pressing
# Enter twice still loads post_data/rookie_talk_post_urls.json.
url_file_path = os.path.join(
    file_dir.strip() or "post_data",
    file_name.strip() or "rookie_talk_post_urls.json",
)

with open(url_file_path, mode='r', encoding='utf-8') as json_file:
    json_data = json_file.read()

url_list = json.loads(json_data)

print(f"total number of threads: {len(url_list)}")
if url_list:
    # show the last entry as a quick sanity check; skipped for an empty list
    print(url_list[-1])
print(type(url_list))

In [None]:
# # running the concurrent
start_time = time.perf_counter()
html_response_list, error_response_list = await get_html_list(active_header, url_list) # running the crawler
stop_time = time.perf_counter()

print(len(html_response_list))
print(len(error_response_list))

In [None]:
# parser function to find the files from the post pages

def get_data(html, data_list, error_list):
    """Collect attachment records from one thread page.

    For every post on the page that has an attachments section, one dict per
    attached file is appended to ``data_list`` with the keys ``file_name``,
    ``download_url``, ``downloads``, ``date``, ``topic_name``, ``category``,
    ``post_url`` and ``post#``.

    Args:
        html: HTML source of a thread page.
        data_list: list that receives the attachment dicts (mutated in place).
        error_list: list that receives the post tag whenever its download
            count cannot be parsed or an attachment entry has no link
            (mutated in place).

    Returns:
        None. Results are delivered through ``data_list`` and ``error_list``.

    Raises:
        AttributeError: if the page has no ``section.content`` element or
            lacks the category / title elements.
    """
    soup = BeautifulSoup(html, 'lxml')
    page_content = soup.select_one('section.content')

    # setting the base url
    base_url = "https://www.forexfactory.com"

    category = page_content.select_one('div.head span[itemprop="name"]').string
    topic_name = page_content.select_one('div.showthread__title h1').get_text(strip=True)

    for post in page_content.select('div#posts div.showthread'):
        has_files = post.select_one('div.threadpost-content div.threadpost-content__attachments')
        if not has_files:
            # posts without attachments contribute nothing
            continue

        # Look the date up inside THIS post. Searching the whole document
        # returned the first post's date for every post on the page.
        date_tag = post.select_one('div.threadpost-header__controls li.threadpost-header__controllink--nolink')
        date_text = date_tag.get_text(strip=True) if date_tag else "Not Provided"

        post_details_tag = post.select_one('div.threadpost-header div.threadpost-header__controls a[title="Post Permalink"]')
        post_url = parse.urljoin(base_url, post_details_tag['href'])
        post_count = post_details_tag['data-postnum']

        try:
            info_text = has_files.select_one('span.info').get_text(strip=True)
            download_text = re.search(r"\b\|[0-9\,]+\sdownloads\b", info_text).group()
            downloads = download_text.replace('|', '').replace('downloads', '').replace(',', '').strip()
        except AttributeError:
            # select_one() / re.search() returned None: no info span or no
            # "|N downloads" fragment in its text
            downloads = "Not Provided"
            error_list.append(post)

        for attachment in has_files.select('div.attachinfo'):
            link = attachment.a
            if link is None:
                # attachment entry without an anchor: nothing to download
                error_list.append(post)
                continue

            # NOTE(review): download_url is stored exactly as found in the
            # page; confirm whether it should be joined with base_url like
            # post_url is.
            data_list.append({
                "file_name": link.string,
                "download_url": link['href'],
                "downloads": downloads,
                "date": date_text,
                "topic_name": topic_name,
                "category": category,
                "post_url": post_url,
                "post#": post_count
            })

In [None]:
# running the parser and collecting the data

file_data_list = []
file_error_list = []

for response in html_response_list:
    html = response["html_string"]

    # running the parser; get_data takes the error list as its third
    # argument (omitting it raised TypeError on the first page)
    get_data(html, file_data_list, file_error_list)

print(len(file_data_list))
print(len(file_error_list))