Skip to content

Commit

Permalink
Metadata (#38)
Browse files (browse the repository at this point in the history)
Added employees and founded values to companies.
  • Loading branch information
SlashGordon committed Jul 22, 2020
1 parent 4abad90 commit 943ceee
Show file tree
Hide file tree
Showing 5 changed files with 3,044 additions and 25 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ build/*
.vscode/*
src/pytickersymbols/data/stocks.json
.theia/*
stockswithmetadata.yaml
80 changes: 80 additions & 0 deletions addmetadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import yaml
import os
import wptools
import re
import wikipedia
import multiprocessing
from collections import OrderedDict

def represent_dict_order(self, data):
    """Emit ``data`` as a plain YAML mapping, keeping its item order.

    Registered below as the PyYAML representer for ``OrderedDict``; ``self``
    is the representer object PyYAML passes in.
    """
    return self.represent_mapping('tag:yaml.org,2002:map', data.items())


yaml.add_representer(OrderedDict, represent_dict_order)


def get_page(page_search):
    """Return the parsed Wikipedia page for ``page_search``, or ``None``.

    The language editions 'en', 'de', 'es' and 'fr' are tried in that order.
    For each one the page titled exactly ``page_search`` is requested first;
    if that raises ``LookupError``, the first hit of a Wikipedia search in the
    same language is requested instead. A message is printed when the
    fallback request fails as well.
    """
    for lang in ('en', 'de', 'es', 'fr'):
        try:
            parsed = wptools.page(page_search, lang=lang).get_parse()
        except LookupError:
            # No page with that exact title: fall back to a search.
            try:
                wikipedia.set_lang(lang)
                hits = wikipedia.search(page_search)
                if not hits:
                    # Nothing found in this language; try the next one.
                    continue
                return wptools.page(hits[0], lang=lang).get_parse()
            except LookupError:
                print(f'no wiki page found for {page_search} lang {lang}.')
        else:
            return parsed
    return None


def metadata(stock_name):
    """Look up founding year and employee count for ``stock_name``.

    Returns a ``(founded, employees, stock_name)`` tuple. ``founded`` and
    ``employees`` are ints when they could be extracted from the page's
    infobox, otherwise the string ``'unknown'``.
    """
    founded = employees = 'unknown'

    page = get_page(stock_name)
    infobox = page.data.get('infobox', None) if page else None
    if not infobox:
        return founded, employees, stock_name

    # Every four-digit run in the 'foundation' field is a candidate year.
    year_matches = re.findall(r'\d{4}', infobox.get('foundation', ''))

    # Strip thousands separators before looking for the first number.
    raw_headcount = infobox.get('num_employees', '')
    headcount_digits = raw_headcount.replace(',', '').replace('.', '')
    headcount_matches = re.findall(r'\d+', headcount_digits)

    if headcount_matches:
        employees = int(headcount_matches[0])
    if year_matches:
        # The last four-digit number in the field is taken as the year.
        founded = int(year_matches[-1])
    return founded, employees, stock_name


# Add Wikipedia metadata to every company in stocks.yaml and write the result
# to stockswithmetadata.yaml; both files live next to this script.
input_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'stocks.yaml'
)
output_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'stockswithmetadata.yaml'
)


def main():
    """Load the companies, fetch their metadata in parallel, dump the result.

    Each entry of ``stocksyaml['companies']`` gets a ``'metadata'`` dict with
    the ``'founded'`` and ``'employees'`` values returned by ``metadata()``.
    """
    with open(input_path, 'r') as in_file:
        stocksyaml = yaml.safe_load(in_file)

    companies = stocksyaml['companies']
    names = [stock['name'] for stock in companies]

    # The context manager shuts the worker processes down once map() is done.
    with multiprocessing.Pool(10) as pool:
        results = pool.map(metadata, names)

    # pool.map() returns results in input order, so they pair up with the
    # companies directly; no lookup by name is needed and companies sharing
    # a name each receive their own entry.
    for stock, (founded, employees, _stock_name) in zip(companies, results):
        stock['metadata'] = {
            'founded': founded,
            'employees': employees,
        }

    # Open the output only now so a failure above does not leave behind a
    # truncated file.
    with open(output_path, 'w', encoding='latin1') as out_file:
        yaml.dump(stocksyaml, out_file, sort_keys=False)


# Worker processes started with the "spawn" method re-import this module;
# the guard keeps them from creating pools of their own.
if __name__ == '__main__':
    main()
3 changes: 3 additions & 0 deletions requirements-meta.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
PyYAML==5.3.1
wptools==0.4.17
wikipedia==1.4.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

EXCLUDE_FROM_PACKAGES = ['test', 'test.*', 'test*']

VERSION = '1.1.13'
VERSION = '1.1.14'

with open("README.md", "r") as fh:
long_description = fh.read()
Expand Down

0 comments on commit 943ceee

Please sign in to comment.