Merge pull request #1 from pocesar/feature/search

change output logic

pocesar committed Jan 12, 2021
2 parents fce87ab + b8a8ac5 commit d3383bf
Showing 12 changed files with 957 additions and 200 deletions.
5 changes: 4 additions & 1 deletion .eslintrc
@@ -1,3 +1,6 @@
{
"extends": "@apify"
"extends": "@apify",
"rules": {
"max-len": 0
}
}
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,13 @@
## 2021-01-12

- BREAKING CHANGE: Format of the dataset has changed
- Search multiple terms at once, both hashtags and plain terms
- Enriched user profile information (some information is only available when logged in)
- Added minimum and maximum tweet dates
- Updated SDK version
- Custom data
- Powerful extend output / scraper function

## 2020-11-25

- Remove the need to provide credentials
4 changes: 3 additions & 1 deletion Dockerfile
@@ -33,9 +33,11 @@ RUN npm --quiet set progress=false \
# for most source file changes.
COPY . ./

ENV npm_config_loglevel=silent

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
# CMD npm start
# CMD npm start
96 changes: 86 additions & 10 deletions INPUT_SCHEMA.json
@@ -3,16 +3,63 @@
"type": "object",
"schemaVersion": 1,
"properties": {
"searchTerms": {
"title": "Search terms",
"type": "array",
"editor": "stringList",
"description": "Search for an specific terms and extract tweets for that term."
},
"searchMode": {
"title": "Search mode",
"description": "Search mode changes the way the data is received",
"editor": "select",
"type": "string",
"prefill": "",
"enum": [
"",
"live",
"user",
"image",
"video"
],
"enumTitles": [
"Top",
"Latest",
"People",
"Photos",
"Videos"
]
},
"toDate": {
"title": "Tweets newer than",
"description": "Will get tweets that are newer than this date. Can be used in conjunction with 'Tweets older than' to create specific date slices. Can use specific dates, such as YYYY-MM-DD or relative ones, like '1 month' or '2 days'",
"pattern": "(\\d{4}-\\d{2}-\\d{2}|(\\d+ )?\\S+)",
"type": "string",
"editor": "textfield"
},
"fromDate": {
"title": "Tweets older than",
"description": "Will start getting tweets from this date and older. Can be used in conjunction 'Tweets newer than'. Can use specific dates, such as YYYY-MM-DD or relative ones, like '1 month' or '2 days'",
"type": "string",
"pattern": "(\\d{4}-\\d{2}-\\d{2}|(\\d+ )?\\S+)",
"editor": "textfield"
},
"tweetsDesired": {
"title": "Max. Tweets",
"type": "integer",
"description": "Max. Number of Tweets to Retrieve.",
"prefill": 100
},
"handle": {
"title": "List of handles to scrape",
"type": "array",
"description": "The twitter handles of the profiles you want to scrape.",
"description": "The twitter handles of the profiles you want to scrape. Shortcut instead of inputting https://twitter.com urls",
"prefill": ["elonmusk", "patrickc"],
"editor": "stringList"
},
"mode": {
"title": "Tweet types",
"description": "Select the tweet types to get. Only tweets or tweets and replies",
"description": "Select the tweet types to get. Only tweets or tweets and replies for the selected handle when visiting profiles.",
"type": "string",
"editor": "select",
"prefill": "replies",
@@ -26,28 +26,57 @@
"replies"
]
},
"tweetsDesired": {
"title": "Max. Tweets",
"type": "integer",
"description": "Max. Number of Tweets to Retrieve.",
"prefill": 100
"startUrls": {
"title": "Start URLs",
"description": "If you have a file or a url list of twitter urls, you provide them here",
"default": [],
"type": "array",
"editor": "requestListSources"
},
"proxyConfig": {
"title": "Proxy configuration",
"type": "object",
"description": "Optionally use a proxy.",
"description": "Highly recommended to use a proxy.",
"prefill": {
"useApifyProxy": true
},
"default": {},
"default": {
"useApifyProxy": true
},
"editor": "proxy"
},
"extendOutputFunction": {
"title": "Extend Output Function",
"description": "Add or remove properties on the output object or omit the output returning null",
"type": "string",
"default": "",
"prefill": "async ({ data, item, page, request, customData }) => {\n return item;\n}",
"editor": "javascript"
},
"extendScraperFunction": {
"title": "Extend Scraper Function",
"description": "Advanced function that allows you to extend the default scraper functionality, allowing you to manually perform actions on the page",
"type": "string",
"default": "",
"prefill": "async ({ page, request, addSearch, addProfile, addEvent, customData, Apify }) => {\n \n}",
"editor": "javascript"
},
"customData": {
"title": "Custom data",
"description": "Any data that you want to have available inside the Extend Output/Scraper Function",
"default": {},
"prefill": {},
"type": "object",
"editor": "json"
},
"initialCookies": {
"title": "Login Cookies",
"type": "array",
"description": "Your login cookies will be used to bypass the login wall. See <a href='https://apify.com/vdrmota/twitter-scraper'>ReadMe</a> for instructions.",
"editor": "json"
}
},
"required": ["handle"]
"required": [
"proxyConfig"
]
}
129 changes: 93 additions & 36 deletions README.MD
@@ -16,12 +16,95 @@ The actor is useful for extracting large amounts of tweet data. Unlike the Twitt

The actor has the following input options:

- **Mode** - Scrape only the profile's own tweets, or include replies to other users.
- **List of Handles** - Specify a list of Twitter handles (usernames) you want to scrape.
- **Max. Tweets** - Specify the maximum number of tweets you want to scrape.
- **Proxy Configuration** - Select a proxy to be used by the actor.
- **Login Cookies** - Your Twitter login cookies (no username/password is submitted). For instructions on how to get your login cookies, please see our [tutorial](https://apify.com/help-dev/en/articles/1444249-log-in-to-website-by-transferring-cookies-from-web-browser).
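
A minimal example input combining some of these options (a sketch; the field names follow this actor's INPUT_SCHEMA, the values are illustrative):

```json
{
    "searchTerms": ["web scraping"],
    "searchMode": "live",
    "handle": ["elonmusk", "patrickc"],
    "mode": "replies",
    "tweetsDesired": 100,
    "fromDate": "1 month",
    "proxyConfig": { "useApifyProxy": true }
}
```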

## Migration

Version 0.1 -> 1.0:
* Every item in the dataset is now a separate tweet, so the `unwind` parameter is no longer necessary (and no longer works)
* Proxies are required when running on the Apify platform
* Login isn't required anymore, but some profiles/tweets can only be accessed while logged in
* Some fields were renamed to match Twitter's property names

## Extend output function

This parameter allows you to change the shape of your dataset output, split arrays into separate dataset items, or filter the output:

```js
async ({ item, request }) => {
item.user = undefined; // removes this field from the output

if (request.userData.search) {
item.search = request.userData.search; // add the search term to the output
item.searchUrl = request.loadedUrl; // add the raw search url to the output
}

return item;
}
```

Filtering items:

```js
async ({ item }) => {
if (!item.contentText.includes('lovely')) {
return null; // omit the output if it doesn't contain the text
}

return item;
}
```

Splitting into multiple dataset items:

```js
async ({ item }) => {
const result = [];

const hashtags = item.contentText.match(/#([\S]+)/g);

if (hashtags) {
// dataset will be full of items like { hashtag: '#somehashtag' }
hashtags.forEach(hashtag => result.push({ hashtag }));
}

return result; // returning an array here will split it into multiple dataset items
}
```

## Extend scraper function

This parameter allows you to extend how the scraper works and can make it easier to build on the default functionality without having to create your own version of the actor. As an example, you can include searching the trending topics on each page visit:

```js
async ({ page, request, addSearch, addProfile, customData }) => {
await page.waitForSelector('[aria-label="Timeline: Trending now"] [data-testid="trend"]');

const trending = await page.evaluate(() => {
const trendingEls = $('[aria-label="Timeline: Trending now"] [data-testid="trend"]');

return trendingEls.map((_, el) => {
return {
term: $(el).find('> div > div:nth-child(2)').text().trim(),
profiles: $(el).find('> div > div:nth-child(3) [role="link"]').map((_, el) => $(el).text()).get()
}
}).get();
});

for (const { term, profiles } of trending) {
await addSearch(term);

for (const profile of profiles) {
await addProfile(profile);
}
}
}
```
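
Both functions also receive the **Custom data** input as `customData`, which lets you parameterize a run without editing code. A minimal sketch, assuming a hypothetical custom data shape (the `searches` and `profiles` keys are illustrative, not a fixed format):

```js
async ({ addSearch, addProfile, customData }) => {
    // customData comes straight from the "Custom data" input field,
    // e.g. { "searches": ["#javascript"], "profiles": ["apify"] }
    for (const search of (customData.searches || [])) {
        await addSearch(search); // enqueue an extra search term
    }

    for (const profile of (customData.profiles || [])) {
        await addProfile(profile); // enqueue an extra profile to scrape
    }
}
```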

## Results

@@ -39,40 +39,122 @@ For each Twitter profile scraped, the resulting dataset contains a single record
"joined": "Tue Apr 17 01:46:27 +0000 2007",
"username": "patrickc"
},
"tweets": [
{
"contentText": "@balajis I'm very happy to visit many restaurants that I suspect are not particularly good businesses.",
"conversationId": "1162066623240347648",
"replies": 2,
"retweets": 0,
"favorites": 51,
"dateTime": "Thu Aug 15 18:23:53 +0000 2019",
"tweetId": "1162067401954869248"
},
{
"contentText": "I've wanted this feature for so long. 😍 https://t.co/jspRvv8wDD https://t.co/Q0gRwwIGYd https://t.co/k30UK0hvdc",
"conversationId": "1161319133570457600",
"replies": 13,
"retweets": 12,
"favorites": 247,
"dateTime": "Tue Aug 13 16:50:32 +0000 2019",
"tweetId": "1161319133570457600"
},
//...
]
"contentText": "@balajis I'm very happy to visit many restaurants that I suspect are not particularly good businesses.",
"conversationId": "1162066623240347648",
"replies": 2,
"retweets": 0,
"favorites": 51,
"dateTime": "Thu Aug 15 18:23:53 +0000 2019",
"tweetId": "1162067401954869248"
//...
}
```

To download the results, you can use the [Get items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint.

```
https://api.apify.com/v2/datasets/[DATASET_ID]/items?format=json
```

Where `DATASET_ID` is the ID of the dataset as provided in the actor run object. You can use the `format` query parameter to specify the format of the results, e.g. `xml`, `csv` or `xlsx`.

Since version 1.0, each dataset item is a single tweet, so the `unwind` parameter from previous versions is no longer needed (and no longer works). To get one tweet per line in a spreadsheet, for example:

```
https://api.apify.com/v2/datasets/[DATASET_ID]/items?format=csv
```
8 changes: 5 additions & 3 deletions apify.json
@@ -1,6 +1,8 @@
{
"name": "twitter-extractor",
"version": "0.0",
"name": "actor-twitter-scraper",
"version": "1.0",
"buildTag": "latest",
"env": null
"env": {
"npm_config_loglevel": "silent"
}
}
27 changes: 27 additions & 0 deletions jsconfig.json
@@ -0,0 +1,27 @@
{
"compilerOptions": {
"target": "es2018",
"module": "commonJS",
"lib": [
"dom",
"dom.iterable",
"es5",
"es6",
"es2018",
"es2019.array",
"es2019.object",
"es2020.string"
],
"strict": true,
"checkJs": true,
"alwaysStrict": true,
"moduleResolution": "node",
"esModuleInterop": true,
"noImplicitAny": true,
"noImplicitReturns": false,
"allowSyntheticDefaultImports": false
},
"include": [
"./src/*.js"
]
}
9 changes: 6 additions & 3 deletions package.json
@@ -10,12 +10,15 @@
"author": "",
"license": "ISC",
"dependencies": {
"apify": "^0.21.9",
"puppeteer": "^5.4.1"
"apify": "^0.22.4",
"puppeteer": "^5.5.0",
"lodash": "^4.17.20",
"moment": "^2.29.1"
},
"devDependencies": {
"@types/lodash": "^4.14.167",
"@apify/eslint-config": "^0.1.3",
"@types/node": "^12",
"eslint": "^7.12.1"
"eslint": "^7.17.0"
}
}