**Environment Setup**

In [1]:
# Install dependenies
!pip -r ./InvestHK/requirements.txt

# You also need to install pytorch with GPU support to accelerate Deep learning, 
# See https://pytorch.org/get-started/locally/
# Example:

!conda install pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch -c nvidia

# To use crawler, you need to install Chrome browser & Corresponding Chrome Webdriver
# See https://chromedriver.chromium.org/downloads
# Example: 

!wget https://chromedriver.storage.googleapis.com/108.0.5359.71/chromedriver_linux64.zip




Usage:   
  pip <command> [options]

no such option: -r


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



'wget' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
# Unzip the downloaded ChromeDriver archive and point the config at the executable.
# Example
import InvestHK

# Raw string: in a normal literal '\c' is an invalid escape sequence (it warns on
# recent Python versions), and other Windows paths such as 'D:\tools' or 'D:\new'
# would silently turn into tabs/newlines. The resulting value is unchanged.
InvestHK.CFG.CHROME_DRIVER_PATH = r'D:\chromedriver.exe'

# Set the log file path (relative to the notebook's working directory)

InvestHK.CFG.CRAWLER_LOG_PATH = 'crawler.log'

ModuleNotFoundError: No module named 'torch'

**InvestHK: Crawler - How to use**

In [4]:
# Check InvestHK supported websites.
# The bare expression on the last line makes the notebook display its value.
InvestHK.CFG.SUPPORTED_WEBSITES

NameError: name 'InvestHK' is not defined

In [3]:
# 1. Create the webdriver instance used by the crawler steps below

driver = InvestHK.setup_webdriver(
    headless=False,
    pictures=True,
    scripts=True,
)

# Because headless=False, a browser window pops up at this point

NameError: name 'InvestHK' is not defined

In [None]:
# 2. Fetch URL from news website

from datetime import datetime, timedelta

# From yesterday to today (Include both).
# Take the current time once and derive the start from it: calling
# datetime.today() twice yields two different instants, and if the calls
# straddle midnight both bounds would fall on the same calendar day.
to_date = datetime.today()
from_date = to_date - timedelta(days=1)

fetch_results = InvestHK.fetch_url(
    driver,
    website='bloomberg',
    query='Hong Kong',
    from_date=from_date,
    to_date=to_date,
)

# Last expression of the cell: display what was fetched
fetch_results

In [None]:
# 3.Take one result and use analyze_html to extract titles & contents

import requests

# The search may return nothing for the chosen query/date window; fail with an
# actionable message instead of a bare "list index out of range".
if len(fetch_results) == 0:
    raise IndexError(
        'fetch_results is empty: no article was fetched, '
        'try a wider date range or a different query'
    )

result = fetch_results[0]
url = result['url']

# Load the article in the browser so its HTML is available as driver.page_source
driver.get(url)

In [None]:
# Run analyze_html on the HTML of the page currently loaded in `driver`
# (step 3 navigated to the first fetched article) to extract titles & contents.
InvestHK.analyze_html(driver.page_source, website='bloomberg')

In [None]:
# 4.Train a model to predict investment leads score

# Headlines labelled 0 below (one headline per line)
bads = '''Freshfields & Mercer on Hong Kong Board Diversity
United Airlines Plans to Resume Hong Kong Flights From January
GDS Is Said to Explore Singapore Listing After Hong Kong, US
China to Start Swap Connect With Hong Kong in Six Months
Why a Primary Listing in Hong Kong Matters for Alibaba, BiliBili
Colliers Rosanna Tang on HK Property Market
Cryptocurrencies are gaining ground in Asia, says Hong Kong-based crypto platform CoinUnited.io
CNBC Transcript: Franklin Tong Fuk-Kay, Chief Executive Officer, Hong Kong Applied Science and Technology Research Institute
Jinmao Investments' Hong Kong trading debut
New economy companies are capturing investor attention in Hong Kong'''.splitlines()

# Headlines labelled 1 below (one headline per line)
goods = '''Fintech Giant Lufax Plans Hong Kong Listing to Hedge US Risk
Citadel Adds Office Space for Growing Hong Kong Teams
Hedge Fund Firm North Rock Bucks Trend With Hong Kong Office
Tencent Music Said to Plan Hong Kong Debut as Soon as Next Week
Hong Kong Telco HKBN Said to Draw Interest From Stonepeak, PAG
Truphone debuts in Hong Kong
RTS Expands Operations in Hong Kong with Platform Equinix
Avature Opens New Office in Hong Kong
Fortnum & Mason to open first overseas store in Hong Kong
Hong Kong and Singapore to work together on blockchain project
More consolidation in Asian private banking as OCBC to buy NAB's Singapore and Hong Kong business
Tencent-backed fintech start-up looks to Hong Kong to expand
This Australian brand is expanding in Hong Kong even though the retail scene is struggling'''.splitlines()

# Positive examples first, then negative ones; labels follow the same order
texts = [*goods, *bads]
labels = [1 for _ in goods] + [0 for _ in bads]

# The returned value is kept so the next step can inspect it
metric = InvestHK.train_new_model('Deberta', texts, labels, './testmodel')

In [None]:
# 5.Check metrics of trained model
# `metric` holds the value returned by InvestHK.train_new_model in step 4;
# as the last expression of the cell it is displayed by the notebook.
metric

In [None]:
# 6.Load model and predict score

# Titles of the articles fetched in step 2
fetched_titles = [item['title'] for item in fetch_results]

# Two hand-written headlines appended after the fetched ones
test_titles = [
    'Apple is going to set new office in Hong Kong',
    'This Australian brand is expanding in Hong Kong even though the retail scene is struggling',
]

titles = [*fetched_titles, *test_titles]
print(titles)

# Score every title with the model saved under 'testmodel'
scores = InvestHK.predict_score('testmodel', titles)

In [None]:
# 7.See the result: one "<score> - <title>" line per title
for title, score in zip(titles, scores):
    line = f'{score} - {title}'
    print(line)

**Chinese Model - How to use**

In [3]:
# 8.To train a chinese inference model, use translation API to convert input texts into simplified chinese

sample_en = [
    'I am one English text',
    'I am another English text',
]
sample_hk = [
    '我是一個香港繁中文本',
    '我是另一個香港繁中文本',
]

# English -> simplified Chinese
zh_from_en = InvestHK.translate_en_to_zh(sample_en)
print(zh_from_en)

# Hong Kong traditional Chinese -> simplified Chinese
zh_from_hk = InvestHK.translate_hk_to_zh(sample_hk)
print(zh_from_hk)

['我是一个英文文本', '我是另一个英文文本']
['我是一个香港繁中文本', '我是另一个香港繁中文本']


In [None]:
# 9.Use Model Type 'Chinese' and texts of simplified chinese to train a chinese model.

# The English samples are translated first; both are given label 1 here.
InvestHK.train_new_model(
    model_type='Chinese',
    texts=InvestHK.translate_en_to_zh(sample_en),
    labels=[1, 1],
    save_to_path='test_chinese_model',
)