
Created a process to handle Internet connections.

stbnps committed Jan 14, 2014
1 parent aa0dada commit 15556f7164093062a52d616205cae4b76945a4cf
Showing with 72 additions and 19 deletions.
  1. +1 −1 src/epc_slave.erl
  2. +6 −5 src/indexer.erl
  3. +2 −2 src/master.erl
  4. +1 −0 src/tests.erl
  5. +62 −11 src/web_lib.erl

src/epc_slave.erl
@@ -1,7 +1,7 @@
 -module(epc_slave).
 -export([init/1]).
 
--define(MAX_PAGES_DOMAIN, 20).
+-define(MAX_PAGES_DOMAIN, 100).
 
 init(Url) ->
     % Stop when master is done

src/indexer.erl
@@ -36,12 +36,13 @@ indexer_loop() ->
     receive
         {_From, {index, ImageURL}} ->
             Image = web_lib:downloadImage(ImageURL),
-            if Image =/= noImage ->
+            case Image of
+                noImage -> ok;
+                _ ->
                     Perception = extractPerception(Image),
-                    io:format("Url: ~p~nPerception: ~p~n", [ImageURL, Perception]),
-                    addToIndex(ImageURL, Perception);
-               true ->
-                    ok
+                    %io:format("Url: ~p~nPerception: ~p~n", [ImageURL, Perception]),
+                    addToIndex(ImageURL, Perception)
+
             end,
             indexer_loop();
         stop ->

src/master.erl
@@ -16,9 +16,9 @@
 %% Internal Exports
 -export([loop/3, removeDuplicatedURLs/2, printList/1]).
 
--define(SLAVE_LIMIT, 50).
+-define(SLAVE_LIMIT, 10).
 
--define(PENDING_URLS_LIMIT, 100).
+-define(PENDING_URLS_LIMIT, 500).
 
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

src/tests.erl
@@ -12,6 +12,7 @@ test1()->
     master:getImages(["http://elpais.com/"]).
 
 test()->
+    web_lib:init(),
     master:init(),
     indexer:start(),
     phash:start(),
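
The change to test/0 highlights the new startup order: the registered web_lib process must exist before the master, the indexer, or any slave touches the network. A minimal startup sketch under that assumption (start_crawl/1 is an illustrative helper, not part of the codebase):

    %% Bring the system up in dependency order, then kick off a crawl.
    start_crawl(SeedURLs) ->
        web_lib:init(),     % register the single connection process first
        master:init(),
        indexer:start(),
        phash:start(),
        master:getImages(SeedURLs).

For instance, start_crawl(["http://elpais.com/"]) reproduces the sequence test/0 sets up, seeded with test1/0's URL.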

src/web_lib.erl
@@ -5,15 +5,73 @@
 %% @reviewer Laura Castro <laura.castro@gmail.com>
 
 -module(web_lib).
--export([get_data/1, downloadImage/1]).
+-export([get_data/1, downloadImage/1, loop/0, init/0]).
 
 -include_lib("xmerl/include/xmerl.hrl").
 -export([add_domain/2]).
 
+init() ->
+    PID = spawn(web_lib, loop, []),
+    register(web_lib, PID).
+
+get_data(URL) ->
+    web_lib ! {self(), {get_data, URL}},
+    receive
+        {get_data_ans, {LinkURLs, ImageURLs}} ->
+            {LinkURLs, ImageURLs}
+    end.
+
+downloadImage(URL) ->
+    web_lib ! {self(), {downloadImage, URL}},
+    receive
+        {downloadImage_ans, Image} ->
+            Image
+    end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Why do we do it this way?
+%
+% Inets handles concurrent queries poorly:
+% http://mail-archives.apache.org/mod_mbox/couchdb-dev/200901.mbox/%3C459588380.1232914742004.JavaMail.jira@brutus%3E
+% https://gist.github.com/formido/134224
+% https://elearning.erlang-solutions.com/binaries/test_results/test_results_package_R16B02_osx32_1379419828/tests/test_server/ct_run.test_server@esl3.2013-09-17_04.34.47/tests.inets_test.logs/run.2013-09-17_04.34.51/httpc_suite.bad_response.html
+% http://erlang.org/pipermail/erlang-bugs/2011-October/002628.html
+% http://osdir.com/ml/erlang-programming-bugs/2009-08/msg00012.html
+% http://erlang.2086793.n4.nabble.com/inets-HTTP-client-race-condition-td2121808.html
+%
+%
+% Process starvation:
+% If the indexer and the slaves don't cooperate on how they reach the
+% Internet, the slaves will use all of the Erlang node's bandwidth and
+% the indexer won't be able to download images.
+%
+% So:
+% - Handle all Internet queries in a single process.
+% - Serve requests to download images first.
+% - Keep on crawling only when all pending images have been downloaded.
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+loop() ->
+    receive
+        {From, {downloadImage, URL}} ->
+            From ! {downloadImage_ans, downloadImage_(URL)},
+            loop();
+
+        stop ->
+            ok
+    after 0 ->
+        receive
+            {From, {get_data, URL}} ->
+                From ! {get_data_ans, get_data_(URL)},
+                loop()
+        end
+    end.
+
+
 %% @doc Returns a tuple containing links and images.
 %%      Example url = "http://www.youtube.com/{relative}"
 %% @spec get_data(URL :: string()) -> {Links :: string(), Images :: string()}
-get_data(URL) ->
+get_data_(URL) ->
     inets:start(),
     % {_, { _, _, Data}} = httpc:request(URL)
     % This can cause web_lib to throw a "no match" exception:
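
The scheduling trick in loop/0 is the classic Erlang priority-receive idiom: the after 0 clause fires only when no downloadImage request (and no stop) is waiting in the mailbox, and only then does the process fall through to the nested receive that serves crawling. A stripped-down sketch of the idiom, with hypothetical high/low message tags standing in for downloadImage and get_data:

    -module(prio).
    -export([start/0, loop/0]).

    start() ->
        register(prio, spawn(?MODULE, loop, [])).

    loop() ->
        receive
            {high, From, Req} ->                  % urgent work is served first
                From ! {high_ans, handle(Req)},
                loop();
            stop ->
                ok
        after 0 ->                                % no urgent message queued right now
            receive
                {low, From, Req} ->
                    From ! {low_ans, handle(Req)},
                    loop()
            end
        end.

    handle(Req) -> Req.                           % placeholder for the real work

One caveat the sketch shares with loop/0 above: once the process blocks in the inner receive, a high-priority message (or stop) that arrives afterwards sits in the mailbox until the next low-priority message unblocks the loop; repeating those clauses inside the inner receive would remove that window.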

@@ -31,14 +89,7 @@ parseResponse(URL, ResponseHeaders, ResponseBody) ->
     case lists:keyfind("content-type", 1, ResponseHeaders) of
         {"content-type", ContentType} ->
             case string:str(ContentType, "text/html") of % check that we have a web page
-                0 ->
-                    %% If it's a image, simply return the URL
-                    case string:str(ContentType, "image/") of
-                        0 ->
-                            {[], []};
-                        _ ->
-                            {[], [URL]}
-                    end;
+                0 -> {[], []};
                 _ ->
                     ParsedData = mochiweb_html:parse(ResponseBody),
                     Anchors = crawl_for(<<"a">>, ParsedData, []),

@@ -55,7 +106,7 @@ parseResponse(URL, ResponseHeaders, ResponseBody) ->
     end.
 
 
-downloadImage(URL) ->
+downloadImage_(URL) ->
     ImageTypes = ["image/jpg", "image/jpeg", "image/png"], % IMPORTANT NOTE, AS FAR AS I KNOW PHASH ONLY SUPPORTS JPEGs AND PNGs
     Response = httpc:request(URL),
     case Response of
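
From the callers' side the new API stays synchronous: get_data/1 and downloadImage/1 block until the registered process answers. A usage sketch under that assumption (demo/0 and the URL are illustrative):

    demo() ->
        web_lib:init(),
        {Links, Images} = web_lib:get_data("http://example.com/"),
        %% Each element is image data, or the atom noImage on failure
        %% (see the indexer change above).
        Downloads = [web_lib:downloadImage(I) || I <- Images],
        {Links, Downloads}.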
