**Environment Setup**

In [None]:
# Install dependenies
!pip -r ./InvestHK/requirements.txt

# You also need to install pytorch with GPU support to accelerate Deep learning, 
# See https://pytorch.org/get-started/locally/
# Example:

!conda install pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch -c nvidia

# To use crawler, you need to install Chrome browser & Corresponding Chrome Webdriver
# See https://chromedriver.chromium.org/downloads
# Example: 

!wget https://chromedriver.storage.googleapis.com/108.0.5359.71/chromedriver_linux64.zip



In [1]:
# Unzip the downloaded .zip file, then register the extracted executable in the config.
# Example
import InvestHK

# Set the log file path
InvestHK.CFG.CRAWLER_LOG_PATH = 'crawler.log'

# Set the path of the chromedriver executable
InvestHK.CFG.CHROME_DRIVER_PATH = r'D:/User/下載/chromedriver.exe'

  from .autonotebook import tqdm as notebook_tqdm


**InvestHK: Crawler - How to use**

In [2]:
# Check InvestHK supported websites
# The bare expression below is displayed as this cell's output: a list of site
# names (e.g. 'SouthCN'), which later cells pass as the `website=` argument.
InvestHK.CFG.SUPPORTED_WEBSITES

['bloomberg',
 'cnbc',
 'financialtimes',
 'xinhuanet',
 'reuters',
 'xinhuanet_chinese',
 'renminribao',
 'chinadaily',
 'GBAChinese',
 'GBAEnglish',
 'SouthCN',
 'AVCJ',
 'YiCai',
 'jiemian']

In [3]:
# 1. Create the webdriver instance used by the following cells
#    (a browser window pops up when this cell runs)

driver = InvestHK.setup_webdriver(
    headless=False,
    pictures=True,
    scripts=True,
)

In [4]:
# 2. Fetch article URLs from a news website

from datetime import datetime, timedelta

# From yesterday to today (include both).
# The clock is read once and the lower bound is derived from it, so the two
# bounds are exactly one day apart.
to_date = datetime.today()
from_date = to_date - timedelta(days=1)

# An empty query string is passed; each returned entry is shown in the output
# below as a dict with 'title', 'url' and 'datetime' keys.
fetch_results = InvestHK.fetch_url(
    driver,
    website='SouthCN',
    query='',
    from_date=from_date,
    to_date=to_date,
)
fetch_results

[{'title': '天下一家｜坚持人与自然和谐共生',
  'url': 'https://www.southcn.com/node_b5769d65fb/c5b528dc00.shtml',
  'datetime': datetime.datetime(2023, 3, 29, 14, 28)},
 {'title': '“习近平高质量发展新论断”｜推进农业现代化',
  'url': 'https://www.southcn.com/node_b5769d65fb/725802359e.shtml',
  'datetime': datetime.datetime(2023, 3, 29, 11, 42)},
 {'title': '中国特色社会主义是实现中华民族伟大复兴的必由之路',
  'url': 'https://www.southcn.com/node_b5769d65fb/1663504de7.shtml',
  'datetime': datetime.datetime(2023, 3, 29, 9, 28)},
 {'title': '写在博鳌亚洲论坛2023年年会召开之际：携手同行创未来',
  'url': 'https://news.southcn.com/node_179d29f1ce/0292d56f27.shtml',
  'datetime': datetime.datetime(2023, 3, 29, 7, 54)},
 {'title': '解读2023年中央预算公开：钱花得更明白、更有效',
  'url': 'https://news.southcn.com/node_179d29f1ce/3daad05967.shtml',
  'datetime': datetime.datetime(2023, 3, 29, 7, 46)},
 {'title': '我国完成首单液化天然气跨境人民币结算交易',
  'url': 'https://news.southcn.com/node_179d29f1ce/cd5b54380c.shtml',
  'datetime': datetime.datetime(2023, 3, 29, 14, 23)},
 {'title': '我国绿色制造体系已基本构建 打造新型工业化坚实

In [5]:
# 3.Take one result and use analyze_html to extract titles & contents

# NOTE(review): `requests` is not referenced by any cell visible here — confirm it is still needed
import requests
# Use the first fetched entry; entries are dicts that carry a 'url' key
result = fetch_results[0]
url = result['url']

# Open the article in the browser controlled by `driver`; the next cell reads
# the loaded HTML from driver.page_source
driver.get(url)

In [7]:
# Parse the HTML of the page currently loaded in `driver`, using the same
# website key as the fetch step. The recorded output is a tuple of
# (list of titles, article text).
InvestHK.analyze_html(driver.page_source, website='SouthCN')

(['天下一家｜坚持人与自然和谐共生'],
 '“中华民族历来讲求‘天下一家’，主张民胞物与、协和万邦、天下大同，憧憬‘大道之行，天下为公’的美好世界。”千百年来，“世界大同、天下一家”始终是中华民族的理想追求。习近平主席汲取中华优秀传统文化思想精华，多次提及“天下一家”观点，为世界人民求和平、谋发展贡献中国智慧、中国方案。央视网《联播+》推出《天下一家》系列报道，与您一同感受习近平主席胸怀天下的情怀与担当。\n人与自然的关系是一种天人合一、互利共赢的共同体关系，这正是习近平生态文明思想的重要内涵。\n近年来，习近平主席在不同场合多次呼吁构建人与自然和谐共生的地球家园。他说：“这是对我们自己负责，也是对世界负责。”')

In [None]:
# 4. Train a model to predict an investment-leads score

# Headlines labelled 1
goods = '''Fintech Giant Lufax Plans Hong Kong Listing to Hedge US Risk
Citadel Adds Office Space for Growing Hong Kong Teams
Hedge Fund Firm North Rock Bucks Trend With Hong Kong Office
Tencent Music Said to Plan Hong Kong Debut as Soon as Next Week
Hong Kong Telco HKBN Said to Draw Interest From Stonepeak, PAG
Truphone debuts in Hong Kong
RTS Expands Operations in Hong Kong with Platform Equinix
Avature Opens New Office in Hong Kong
Fortnum & Mason to open first overseas store in Hong Kong
Hong Kong and Singapore to work together on blockchain project
More consolidation in Asian private banking as OCBC to buy NAB's Singapore and Hong Kong business
Tencent-backed fintech start-up looks to Hong Kong to expand
This Australian brand is expanding in Hong Kong even though the retail scene is struggling'''.splitlines()

# Headlines labelled 0
bads = '''Freshfields & Mercer on Hong Kong Board Diversity
United Airlines Plans to Resume Hong Kong Flights From January
GDS Is Said to Explore Singapore Listing After Hong Kong, US
China to Start Swap Connect With Hong Kong in Six Months
Why a Primary Listing in Hong Kong Matters for Alibaba, BiliBili
Colliers Rosanna Tang on HK Property Market
Cryptocurrencies are gaining ground in Asia, says Hong Kong-based crypto platform CoinUnited.io
CNBC Transcript: Franklin Tong Fuk-Kay, Chief Executive Officer, Hong Kong Applied Science and Technology Research Institute
Jinmao Investments' Hong Kong trading debut
New economy companies are capturing investor attention in Hong Kong'''.splitlines()

# One text per headline, good ones first, with a label list in the same order
texts = [*goods, *bads]
labels = [1 for _ in goods] + [0 for _ in bads]

# Train a 'Deberta' model on the labelled headlines, passing './testmodel' as
# the output path; the returned value is inspected in the next cell
metric = InvestHK.train_new_model('Deberta', texts, labels, './testmodel')

In [None]:
# 5.Check metrics of trained model
# `metric` is the value returned by InvestHK.train_new_model in the training
# cell; the bare expression displays it as this cell's output.
metric

In [None]:
# 6. Load the trained model and predict scores

# Two hand-written headlines that are appended after the crawled ones
test_titles = '''Apple is going to set new office in Hong Kong
This Australian brand is expanding in Hong Kong even though the retail scene is struggling'''.splitlines()

# Titles of the articles returned by the fetch step
fetched_titles = [entry['title'] for entry in fetch_results]

titles = [*fetched_titles, *test_titles]
print(titles)

# 'testmodel' is the directory the training cell saved to ('./testmodel')
scores = InvestHK.predict_score('testmodel', titles)

In [None]:
# 7. Show the result: one "score - title" line per input title
for s, t in zip(scores, titles):
    print(f'{s} - {t}')

**Chinese Model - How to use**

In [None]:
# 8. A Chinese inference model is trained on Simplified Chinese text, so use the
#    translation API to convert the input texts first.

sample_hk = [
    '我是一個香港繁中文本',
    '我是另一個香港繁中文本',
]
sample_en = [
    'I am one English text',
    'I am another English text',
]

# English input first, then Hong Kong Traditional Chinese input
print(InvestHK.translate_en_to_zh(sample_en))
print(InvestHK.translate_hk_to_zh(sample_hk))

In [None]:
# 9. Train a Chinese model: pass model type 'Chinese' together with Simplified Chinese texts.

InvestHK.train_new_model(
    model_type='Chinese',
    texts=InvestHK.translate_en_to_zh(sample_en),
    labels=[1,1],
    save_to_path='test_chinese_model',
)