
[WIP] scrapy-streaming examples (#4)

* init examples

* update examples

* update examples

* added examples with r-helper

* renamed r-helper -> r, removed old r examples

* added java examples

* update to new java interface

* github examples removed

* added node examples

* added fromResponseRequest example

* post request example
1 parent e3f8e33 commit dd41de4c66b0693bc8d299ad89dfa6c4fa7eed39 @aron-bordin committed with Raul Gallegos on Aug 19, 2016
@@ -0,0 +1,74 @@
+import org.scrapy.scrapystreaming.Spider;
+import org.scrapy.scrapystreaming.Logger;
+import org.scrapy.scrapystreaming.core.Callback;
+import org.scrapy.scrapystreaming.core.SpiderException;
+import org.scrapy.scrapystreaming.Request;
+import org.scrapy.scrapystreaming.messages.ResponseMessage;
+import org.scrapy.scrapystreaming.messages.ReceivedMessage;
+import org.scrapy.scrapystreaming.messages.ExceptionMessage;
+import org.scrapy.scrapystreaming.utils.Utils;
+
+
+/**
+ * This example opens multiple pages and checks whether each one is responding.
+ *
+ * The results are written to the Scrapy Streaming debug logger.
+ */
+public class CheckStatus extends Spider {
+
+ static int responsesRemaining = 0; // counter to handle remaining responses
+ CheckStatus() {
+ name = "CheckStatus";
+ }
+
+ public void parse(ResponseMessage response) {
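+ // no start urls are set, so this default callback is never used;
+ // responses are handled by the per-request callbacks in main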
+ }
+
+ public static void main(String args[]) throws Exception {
+
+ CheckStatus spider = new CheckStatus();
+ spider.start();
+
+ // open multiple requests to test the following response status codes
+ String codes[] = {"200", "201", "400", "404", "500"};
+
+ for (String code: codes) {
+ Request r = new Request("http://httpbin.org/status/" + code);
+ // add a new pending response to the counter
+ responsesRemaining++;
+
+ r.open(new Callback() {
+ public void parse(ResponseMessage response) {
+ // if we got a response, the page is working
+ Logger.debug(response.url + " is working");
+ responsesRemaining--;
+
+ if (responsesRemaining == 0)
+ spider.close();
+ }
+ });
+ }
+ }
+
+ @Override
+ public void onException(ExceptionMessage exception) throws SpiderException {
+ // if we got an exception, the request failed
+ //
+ // first, use ReceivedMessage to inspect the message that caused the problem
+ ReceivedMessage rec = Utils.gson.fromJson(exception.received_message, ReceivedMessage.class);
+
+ // if the exception was generated by a request:
+ if (rec.type.equals("request")) {
+ // get the request data
+ Request r = Utils.gson.fromJson(exception.received_message, Request.class);
+ Logger.debug(r.url + " is not working!");
+ responsesRemaining--;
+
+ if (responsesRemaining == 0)
+ close();
+ } else {
+ throw new SpiderException("Received an exception: " + exception);
+ }
+ }
+
+}
@@ -0,0 +1,85 @@
+import java.util.HashMap;
+import java.io.Writer;
+import java.io.FileWriter;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.scrapy.scrapystreaming.Request;
+import org.scrapy.scrapystreaming.Spider;
+import org.scrapy.scrapystreaming.utils.Utils;
+import org.scrapy.scrapystreaming.core.Callback;
+import org.scrapy.scrapystreaming.core.SpiderException;
+import org.scrapy.scrapystreaming.messages.ResponseMessage;
+
+
+/**
+ * This spider is covered in the quickstart section.
+ *
+ * This scrapes the dmoz webpage, looking for Python websites.
+ */
+public class Dmoz extends Spider {
+ // we use numRequests to count the remaining open requests
+ static int numRequests = 0;
+ // the results variable stores the extracted data, mapping title -> url
+ static HashMap<String, String> results = new HashMap<String, String>(0);
+
+ Dmoz() {
+ name = "dmoz";
+ // set the initial url
+ start_urls.add("http://www.dmoz.org/Computers/Programming/Languages/Python/");
+ }
+
+ public void parse(ResponseMessage response) {
+ // get the initial page, and open a new request for each subcategory
+ Document doc = Jsoup.parse(response.body);
+ Elements hrefs = doc.select("#subcategories-div > section > div > div.cat-item > a[href]");
+ for (Element el: hrefs) {
+ try {
+ Request r = new Request("http://www.dmoz.org" + el.attr("href"));
+ r.open(new Callback() {
+ public void parse(ResponseMessage response) {
+ parseSubcat(response);
+ }
+ });
+
+ // increments the number of open requests
+ numRequests++;
+ } catch (SpiderException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ public void parseSubcat(ResponseMessage response) {
+ // decrement the number of open requests
+ numRequests--;
+ Document doc = Jsoup.parse(response.body);
+ Elements divs = doc.select("div.title-and-desc a");
+
+ // extract all urls in the page
+ for (Element item: divs) {
+ String url = item.attr("href");
+ String title = item.select("div.site-title").first().text();
+ results.put(title, url);
+ }
+
+ // once all requests have finished, save the data and close the spider
+ if (numRequests == 0) {
+ try {
+ Writer writer = new FileWriter("outputs/dmoz.json");
+ Utils.gson.toJson(results, writer);
+ writer.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ close();
+ }
+
+ }
+
+ public static void main(String args[]) throws Exception {
+ Dmoz spider = new Dmoz();
+ spider.start();
+ }
+}
@@ -0,0 +1,54 @@
+import org.scrapy.scrapystreaming.Spider;
+import org.scrapy.scrapystreaming.core.Callback;
+import org.scrapy.scrapystreaming.FromResponseRequest;
+import org.scrapy.scrapystreaming.messages.ResponseMessage;
+import org.scrapy.scrapystreaming.messages.FromResponseMessage;
+import org.scrapy.scrapystreaming.utils.Utils;
+import java.io.Writer;
+import java.io.FileWriter;
+import java.util.HashMap;
+
+
+/**
+ * This is a sample spider to demonstrate how to use the FromResponseRequest.
+ */
+public class FillForm extends Spider {
+
+ FillForm() {
+ name = "form";
+ }
+
+ public void parse(ResponseMessage response) {
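+ // no start urls; the response is handled by the FromResponseRequest callback in main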
+ }
+
+ public static void main(String args[]) throws Exception {
+ FillForm spider = new FillForm();
+ spider.start();
+
+ // put the form data in a map
+ HashMap<String, String> formData = new HashMap<String, String>(0);
+ formData.put("custname", "Sample");
+ formData.put("custemail", "email@example.com");
+
+ // add the formdata parameter with the form information
+ FromResponseMessage data = new FromResponseMessage();
+ data.formdata = formData;
+
+ // open the request
+ FromResponseRequest req = new FromResponseRequest("http://httpbin.org/forms/post", data);
+ req.open(new Callback() {
+ public void parse(ResponseMessage response) {
+ try {
+ Writer writer = new FileWriter("outputs/fill_form.json");
+ Utils.gson.toJson(response.body, writer);
+ writer.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ spider.close();
+ }
+ });
+ }
+
+}
@@ -0,0 +1,46 @@
+import org.scrapy.scrapystreaming.Spider;
+import org.scrapy.scrapystreaming.core.Callback;
+import org.scrapy.scrapystreaming.Request;
+import org.scrapy.scrapystreaming.messages.ResponseMessage;
+import org.scrapy.scrapystreaming.utils.Utils;
+import java.io.Writer;
+import java.io.FileWriter;
+import java.util.HashMap;
+
+
+/**
+ * This is a sample spider to demonstrate how to use a request with POST.
+ */
+public class PostRequest extends Spider {
+
+ PostRequest() {
+ name = "post";
+ }
+
+ public void parse(ResponseMessage response) {
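+ // no start urls; the response is handled by the request callback in main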
+ }
+
+ public static void main(String args[]) throws Exception {
+ PostRequest spider = new PostRequest();
+ spider.start();
+ // open the request
+ Request req = new Request("http://httpbin.org/post");
+ req.method = "POST";
+ req.body = "Post Body";
+ req.open(new Callback() {
+ public void parse(ResponseMessage response) {
+ try {
+ Writer writer = new FileWriter("outputs/post.json");
+ Utils.gson.toJson(response.body, writer);
+ writer.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ spider.close();
+ }
+ });
+ }
+
+}
@@ -0,0 +1,52 @@
+import org.scrapy.scrapystreaming.Spider;
+import org.scrapy.scrapystreaming.core.Callback;
+import org.scrapy.scrapystreaming.Request;
+import org.scrapy.scrapystreaming.messages.ResponseMessage;
+import org.apache.commons.codec.binary.Base64;
+import java.io.FileOutputStream;
+import java.io.File;
+
+
+/**
+ * This example demonstrates how to download binary data using Scrapy Streaming.
+ *
+ * It downloads an image encoded as base64, and then saves it to a file.
+ */
+public class RequestImage extends Spider {
+
+ RequestImage() {
+ name = "image";
+ }
+
+ public void parse(ResponseMessage response) {
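+ // no start urls; the image response is handled by the request callback in main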
+
+ }
+
+ public static void main(String args[]) throws Exception {
+
+ RequestImage spider = new RequestImage();
+ spider.start();
+
+ // open a request to download the image
+ Request r = new Request("http://httpbin.org/image/png");
+ // set the body encoding to base64, so the binary image can be carried in the JSON message
+ r.base64 = true;
+
+ r.open(new Callback() {
+ public void parse(ResponseMessage response) {
+ try {
+ // write the response body to a file and close the spider
+ byte data[] = Base64.decodeBase64(response.body);
+ FileOutputStream img = new FileOutputStream(new File("outputs/image.png"));
+ img.write(data);
+ img.close();
+ spider.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ });
+
+ }
+
+}
@@ -0,0 +1,45 @@
+import org.scrapy.scrapystreaming.Spider;
+import org.scrapy.scrapystreaming.core.Callback;
+import org.scrapy.scrapystreaming.Request;
+import org.scrapy.scrapystreaming.messages.ResponseMessage;
+import java.io.FileOutputStream;
+import java.io.Writer;
+import java.io.BufferedWriter;
+import java.io.OutputStreamWriter;
+import java.io.File;
+
+
+/**
+ * This is a simple example showing that Scrapy Streaming can handle UTF-8 webpages.
+ */
+public class RequestUTF8 extends Spider {
+
+ RequestUTF8() {
+ name = "utf8";
+ // opens a page with utf8 content
+ start_urls.add("http://httpbin.org/encoding/utf8");
+ }
+
+ public void parse(ResponseMessage response) {
+ try {
+ // save the response body to a file
+ File f = new File("outputs/utf8.html");
+ Writer out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(f), "UTF8"
+ ));
+ out.write(response.body);
+ out.flush();
+ out.close();
+ close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static void main(String args[]) throws Exception {
+
+ RequestUTF8 spider = new RequestUTF8();
+ spider.start();
+ }
+
+}
@@ -0,0 +1,34 @@
+#!/usr/bin/env node
+
+var scrapy = require('scrapystreaming');
+var jsonfile = require('jsonfile');
+
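+// create a spider named 'status' with no start urls; requests are sent manually below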
+scrapy.createSpider('status', [], function(response) {});
+
+var requests = [200, 201, 400, 404, 500];
+var pendingRequests = requests.length;
+var result = {};
+
+var canClose = function() {
+ if (pendingRequests == 0) {
+ jsonfile.writeFile('outputs/check_response.json', result);
+ scrapy.closeSpider();
+ }
+};
+
+var check_response = function(response) {
+ result[response.url] = true;
+ pendingRequests--;
+ canClose();
+};
+
+for (var req in requests) {
+ scrapy.sendRequest('http://httpbin.org/status/' + requests[req], check_response);
+}
+
+scrapy.runSpider(function(exception) {
+ var msg = JSON.parse(exception.received_message);
+ result[msg.url] = false;
+ pendingRequests--;
+ canClose();
+});
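
Note on running these examples: the spiders above are not standalone programs. They exchange JSON messages with Scrapy Streaming over stdin/stdout, so they must be launched through it. A minimal sketch of a launch follows (assumption: the `scrapy streaming` command and its -a argument flag as described in the scrapy-streaming docs; the class and script names are illustrative, since the diff does not show file names, and Java classpath setup for the helper library is omitted):

# launch the compiled Java example through Scrapy Streaming (names illustrative)
scrapy streaming java -a CheckStatus
# the Node example is launched the same way (script name illustrative)
scrapy streaming node -a check_response.js

The docs also describe registering such commands as external spiders in an external.json file, so they can be started with the regular scrapy crawl command.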