This repository has been archived by the owner on Jan 1, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add yahoo finance profile page parser
Add the basic Yahoo Finance fund profile page parser with some (not all) of the data parsing methods. It has been tested against a basic Yahoo Finance page; testing all the variations of those pages would be a bit of a faff given the current design. While aiming for 100% coverage is nice, in this case the time sink does not merit the extra work at the moment. Later, when the script is finished and working, we can refactor this piece of code in such a way that testing will be easier.
- Loading branch information
1 parent
f15cccb
commit 1555483
Showing
4 changed files
with
361 additions
and
0 deletions.
There are no files selected for viewing
78 changes: 78 additions & 0 deletions
78
lib/bsf/scraper/fund_data_populator/profile_page_parser.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
module Bsf
  module Scraper
    class FundDataPopulator

      # Responsible for parsing the Yahoo Finance profile page to populate the
      # details regarding each fund (category, fund family and net assets).
      #
      # Using the CSS path data in the original script was bringing back no
      # results. I am not sure why and think investigating it will take too
      # much time. Therefore we will use the following method.
      #
      # Because of the unholy mess that is the HTML on the Yahoo Finance pages
      # we use nokogiri (via mechanize) to extract the rough portion of the
      # HTML containing the information we want. We then get the exact
      # information by regexp'ing that chunk of HTML markup. This may not be
      # the best way but it will suffice for now; it can always be refactored
      # later.
      class ProfilePageParser

        # Multipliers for the magnitude suffix that ends the 'Net Assets'
        # figure (e.g. "15.49M", "1.2B").
        ASSET_MULTIPLIERS = { 'B' => 1_000_000_000, 'M' => 1_000_000 }.freeze

        # @param fund [#category=, #family=, #assets=] object that receives
        #   the parsed values
        # @param profile_page [#search] page object; `search` is called with
        #   a CSS selector and its result is converted with `to_s`
        def initialize(fund, profile_page)
          @profile_page = profile_page
          @fund = fund
        end

        # Parses the page and assigns category, family and assets to the fund.
        # A value that cannot be found on the page is assigned as nil.
        #
        # @raise [StandardError] if the net assets figure does not end in a
        #   known magnitude suffix
        def parse
          @fund.category = get_category
          @fund.family = get_family
          @fund.assets = get_assets
        end

        private

        # Isolate the 'Fund Overview Table' as best we can, change it to a
        # string and remove newlines to make it easier to regexp. The result
        # is memoized.
        def data_tables
          @data_tables ||= @profile_page.search('table#yfncsumtab').to_s.delete("\n")
        end

        # @return [String, nil] the fund category, or nil when absent
        def get_category
          cell_text('Category:')
        end

        # @return [String, nil] the fund family, or nil when absent
        def get_family
          cell_text('Fund Family:')
        end

        # Converts the 'Net Assets' figure into a plain number by applying the
        # multiplier for its trailing suffix.
        #
        # @return [Float, nil] the net assets, or nil when the row is absent
        # @raise [StandardError] if the suffix is not in ASSET_MULTIPLIERS
        def get_assets
          assets = cell_text('Net Assets:')
          return nil unless assets

          multiplier = ASSET_MULTIPLIERS[assets[-1, 1]]
          # The comma matters: without it Ruby tries to call a method named
          # StandardError and raises NoMethodError instead.
          raise StandardError, "Unknown assets price suffix" unless multiplier

          assets[0...-1].to_f * multiplier
        end

        # Returns the text of the data cell that follows the given label cell.
        #
        # The match is limited to that single cell so it can never run on into
        # a link belonging to a later row. When the cell wraps its value in a
        # link the link text is returned, otherwise the cell content itself.
        # Surrounding whitespace is stripped.
        #
        # @param label [String] exact label text, e.g. 'Category:'
        # @return [String, nil] the cell text, or nil when the row is absent
        def cell_text(label)
          pattern = %r{">#{Regexp.escape(label)}</td><td class="yfnc_datamoddata1">(.*?)</td>}
          cell = confirm_result(data_tables.scan(pattern))
          return nil unless cell

          (cell[%r{<a\b[^>]*>(.*?)</a>}, 1] || cell).strip
        end

        # @param result [Array<Array<String>>] the return value of String#scan
        # @return [String, nil] first capture of the first match, or nil when
        #   there was no match
        def confirm_result(result)
          result.empty? ? nil : result[0][0]
        end
      end
    end
  end
end
38 changes: 38 additions & 0 deletions
38
spec/bsf/scraper/fund_data_populator/profile_page_parser_spec.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
require 'mechanize' | ||
require 'ostruct' | ||
require 'bsf/scraper/fund_data_populator/profile_page_parser' | ||
|
||
# Specs for the Yahoo Finance profile page parser. The '#parse' examples run
# the parser against the HTML fixture loaded by `dummy_page` and check the
# values assigned to an OpenStruct standing in for the fund.
describe Bsf::Scraper::FundDataPopulator::ProfilePageParser do

  describe '.initialize' do
    # Both arguments are required. The arity message wording differs between
    # Ruby versions ("0 for 2" vs "given 0, expected 2"), so accept either.
    it { expect { described_class.new }.to raise_error(ArgumentError,
                                           /0 for 2|given 0, expected 2/) }
  end

  describe '#parse' do

    let(:fund) { OpenStruct.new }

    # Parse the fixture page before each example so every example inspects a
    # freshly populated fund.
    before(:each) do
      described_class.new(fund, dummy_page).parse
    end

    it 'should parse the category' do
      fund.category.should == 'Mid-Cap Blend'
    end

    it 'should parse the family' do
      fund.family.should == '13D Management'
    end

    it 'should parse the assets' do
      fund.assets.should == 15_490_000
    end

  end

  # Loads the fixture profile page through Mechanize from a file:// URL built
  # from the $spec_home global, which is expected to be set elsewhere (it is
  # not defined in this file). Memoized per example.
  def dummy_page
    @dummy_page ||= Mechanize.new.get("file:///#{$spec_home}/fixtures/yahoo_profile_page.html")
  end

end
Oops, something went wrong.