Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/tumblr #2

Merged
merged 11 commits into from
Feb 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ List of implemented scrapers looks like this so far:
- [YoutubeSkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt) - [YouTube](https://youtube.com) scraper
- [NinegagSkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt) - [9gag](https://9gag.com) scraper
- [PinterestSkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt) - [Pinterest](https://www.pinterest.com) scraper
- [TumblrSkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt) - [Tumblr](https://tumblr.com) scraper
- [IFunnySkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt) - [IFunny](https://ifunny.co) scraper
- [VkSkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt) - [VK](https://vk.com) scraper
- [PikabuSkraper](./skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt) - [Pikabu](https://pikabu.ru) scraper
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ interface Skraper {
* @param imageSize choice for specific logo size if it's possible
* @return provider logo url
*/
suspend fun getLogoUrl(imageSize: ImageSize = SMALL): String? = "${baseUrl}/favicon.ico"
suspend fun getProviderLogoUrl(imageSize: ImageSize = SMALL): String? = "${baseUrl}/favicon.ico"
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ import ru.sokomishalov.skraper.model.ImageSize.SMALL

/**
 * Fetches the logo of the given page as raw bytes.
 *
 * Resolves the logo url with [Skraper.getPageLogoUrl] and, only when that url is not null,
 * downloads it through the skraper's `client`.
 *
 * @param uri page uri, passed through unchanged to [Skraper.getPageLogoUrl]
 * @param imageSize preferred logo size, passed through unchanged to [Skraper.getPageLogoUrl]
 * @return downloaded logo bytes, or null when no logo url was resolved
 */
suspend fun Skraper.getPageLogoByteArray(uri: String, imageSize: ImageSize = SMALL): ByteArray? = getPageLogoUrl(uri, imageSize)?.let { client.fetchBytes(it) }

suspend fun Skraper.getLogoByteArray(imageSize: ImageSize = SMALL): ByteArray? = getLogoUrl(imageSize)?.let { client.fetchBytes(it) }
suspend fun Skraper.getLogoByteArray(imageSize: ImageSize = SMALL): ByteArray? = getProviderLogoUrl(imageSize)?.let { client.fetchBytes(it) }


Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,43 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
@file:Suppress("unused")

package ru.sokomishalov.skraper.internal.jsoup

import org.jsoup.Jsoup
import org.jsoup.nodes.Element


internal fun Element.getSingleElementByClass(name: String): Element {
fun Element.getSingleElementByClass(name: String): Element {
return getElementsByClass(name).first()
}

internal fun Element.getSingleElementByClassOrNull(name: String): Element? {
fun Element.getSingleElementByClassOrNull(name: String): Element? {
return getElementsByClass(name).firstOrNull()
}

internal fun Element.getSingleElementByTag(name: String): Element {
fun Element.getSingleElementByTag(name: String): Element {
return getElementsByTag(name).first()
}

internal fun Element.getSingleElementByTagOrNull(name: String): Element? {
fun Element.getSingleElementByTagOrNull(name: String): Element? {
return getElementsByTag(name).firstOrNull()
}

internal fun Element.getSingleElementByAttribute(name: String): Element {
fun Element.getSingleElementByAttribute(name: String): Element {
return getElementsByAttribute(name).first()
}

internal fun Element.getSingleElementByAttributeOrNull(name: String): Element? {
fun Element.getSingleElementByAttributeOrNull(name: String): Element? {
return getElementsByAttribute(name).firstOrNull()
}

internal fun Element.getImageBackgroundUrl(): String {
fun Element.getImageBackgroundUrl(): String {
val style = attr("style")
return style.substring(style.indexOf("http"), style.indexOf(")"))
}

internal fun Element.getStyleMap(): Map<String, String> {
fun Element.getStyleMap(): Map<String, String> {
return when {
hasAttr("style").not() -> emptyMap()
else -> attr("style")
Expand All @@ -62,11 +63,11 @@ internal fun Element.getStyleMap(): Map<String, String> {
}
}

internal fun Element.getStyle(name: String): String? {
fun Element.getStyle(name: String): String? {
return this.getStyleMap()[name]
}

internal fun Element.removeLinks(): String? {
fun Element.removeLinks(): String? {
val titleDoc = Jsoup.parse(html())

val allAnchors = titleDoc.select("a")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ package ru.sokomishalov.skraper.model
* @property aspectRatio width to height ratio
*/
data class Attachment(
// url
val url: String,
val type: AttachmentType,
val aspectRatio: Double
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@ package ru.sokomishalov.skraper.model
/**
* Represents a provider post.
* @property id provider's internal id
* @property caption caption (or article)
* @property publishTimestamp published at *(nullable - such data may not exist on the provider's page)
* @property text concatenated text
* @property publishedAt publish timestamp in epoch milliseconds *(nullable - such data may not exist on the provider's page)
* @property rating rating (likes) count *(nullable - such data may not exist on the provider's page)
* @property commentsCount comments count *(nullable - such data may not exist on the provider's page)
* @property attachments attachments (images or videos)
* @property attachments images or videos
*/
data class Post(
val id: String,
val caption: String? = "",
val publishTimestamp: Long? = null,
val text: String? = "",
val publishedAt: Long? = null,
val rating: Int? = null,
val commentsCount: Int? = null,
val attachments: List<Attachment> = emptyList()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ class FacebookSkraper @JvmOverloads constructor(

Post(
id = id,
caption = it.getCaptionByUserContentWrapper(),
publishTimestamp = it.getPublishedAtByUserContentWrapper(),
text = it.getCaptionByUserContentWrapper(),
publishedAt = it.getPublishedAtByUserContentWrapper(),
rating = node.extractReactionCount(),
commentsCount = node.extractCommentsCount(),
attachments = it.getAttachmentsByUserContentWrapper()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class IFunnySkraper @JvmOverloads constructor(
}

override suspend fun getPageLogoUrl(uri: String, imageSize: ImageSize): String? {
return getLogoUrl(imageSize)
return getProviderLogoUrl(imageSize)
}

private suspend fun getTopicPage(uri: String): Document? {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ class InstagramSkraper @JvmOverloads constructor(
return postsNodes.map {
Post(
id = it.parseId(),
caption = it.parseCaption(),
publishTimestamp = it.parsePublishedAt(),
text = it.parseCaption(),
publishedAt = it.parsePublishedAt(),
rating = it.parseLikesCount(),
commentsCount = it.parseCommentsCount(),
attachments = it.parseAttachments()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ class NinegagSkraper @JvmOverloads constructor(

Post(
id = p["id"]?.asText().orEmpty(),
caption = p["title"]?.asText(),
publishTimestamp = p["creationTs"]?.asLong()?.times(1000),
text = p["title"]?.asText(),
publishedAt = p["creationTs"]?.asLong()?.times(1000),
rating = p.run {
val up = get("upVoteCount")?.asInt()
val down = get("downVoteCount")?.asInt()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ class PikabuSkraper(

Post(
id = it.parseId(),
caption = String(caption.toByteArray(UTF_8)),
publishTimestamp = it.parsePublishDate(),
text = String(caption.toByteArray(UTF_8)),
publishedAt = it.parsePublishDate(),
rating = it.parseRating(),
commentsCount = it.parseCommentsCount(),
attachments = storyBlocks.parseMediaAttachments()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ class PinterestSkraper @JvmOverloads constructor(
val imageInfo = it["images"]["orig"]
Post(
id = it["id"]?.asText().orEmpty(),
caption = it["description"]?.asText(),
publishTimestamp = ZonedDateTime.parse(it["created_at"]?.asText(), DATE_FORMATTER).toInstant().toEpochMilli(),
text = it["description"]?.asText(),
publishedAt = ZonedDateTime.parse(it["created_at"]?.asText(), DATE_FORMATTER).toInstant().toEpochMilli(),
rating = it.get("aggregated_pin_data")?.get("aggregated_stats")?.get("saves")?.asInt(),
commentsCount = it["comment_count"]?.asInt(),
attachments = listOf(Attachment(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ class RedditSkraper @JvmOverloads constructor(
.map {
Post(
id = it.get("id").asText().orEmpty(),
caption = it.get("title").asText(),
publishTimestamp = it.get("created_utc")?.asLong()?.times(1000),
text = it.get("title").asText(),
publishedAt = it.get("created_utc")?.asLong()?.times(1000),
rating = it.get("score")?.asInt(),
commentsCount = it.get("num_comments")?.asInt(),
attachments = listOf(Attachment(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/**
* Copyright 2019-2020 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ru.sokomishalov.skraper.provider.tumblr

import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import ru.sokomishalov.skraper.Skraper
import ru.sokomishalov.skraper.SkraperClient
import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.fetchDocument
import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_ASPECT_RATIO
import ru.sokomishalov.skraper.internal.jsoup.getSingleElementByClassOrNull
import ru.sokomishalov.skraper.internal.jsoup.getSingleElementByTagOrNull
import ru.sokomishalov.skraper.model.Attachment
import ru.sokomishalov.skraper.model.AttachmentType.IMAGE
import ru.sokomishalov.skraper.model.AttachmentType.VIDEO
import ru.sokomishalov.skraper.model.ImageSize
import ru.sokomishalov.skraper.model.Post
import java.time.LocalDate
import java.time.LocalDateTime
import java.time.ZoneOffset.UTC
import java.time.format.DateTimeFormatter
import java.util.Locale.ENGLISH

/**
 * [Skraper] implementation that scrapes Tumblr blog pages.
 *
 * Pages are fetched from `https://<uri>.tumblr.com`, so the `uri` argument of every
 * public method is the blog name (its subdomain), not a path.
 *
 * @property client http client used to fetch blog pages
 */
class TumblrSkraper @JvmOverloads constructor(
        override val client: SkraperClient = DefaultBlockingSkraperClient
) : Skraper {

    override val baseUrl: String = "https://tumblr.com"

    /**
     * Looks up the blog avatar: the first `img` inside the first element with class `user-avatar`.
     *
     * @param uri blog name (subdomain)
     * @param imageSize not used by this implementation
     * @return value of the avatar's `src` attribute, or null when the page or the avatar node is missing
     */
    override suspend fun getPageLogoUrl(uri: String, imageSize: ImageSize): String? {
        val document = getPage(uri)

        return document
                ?.getSingleElementByClassOrNull("user-avatar")
                ?.getSingleElementByTagOrNull("img")
                ?.attr("src")
    }

    /**
     * Converts the first [limit] `article` elements of the blog page into posts.
     *
     * @param uri blog name (subdomain)
     * @param limit maximum number of posts to return
     * @return parsed posts, or an empty list when the page could not be fetched
     */
    override suspend fun getLatestPosts(uri: String, limit: Int): List<Post> {
        val document = getPage(uri)

        val articles = document
                ?.getElementsByTag("article")
                ?.take(limit)
                .orEmpty()

        return articles.map { a ->
            Post(
                    id = a.extractId(),
                    text = a.extractText(),
                    publishedAt = a.extractPublishedDate(),
                    rating = a.extractNotes(),
                    // NOTE(review): the same note counter feeds both rating and commentsCount - confirm this is intended
                    commentsCount = a.extractNotes(),
                    attachments = a.extractAttachments()
            )
        }
    }

    /** Fetches the blog front page for the given blog name. */
    private suspend fun getPage(uri: String): Document? {
        return client.fetchDocument("https://${uri}.tumblr.com")
    }

    /** Post id from `data-post-id`, falling back to the element's `id` attribute when it is blank. */
    private fun Element.extractId(): String {
        return attr("data-post-id")
                .ifBlank { attr("id") }
    }

    /**
     * Joins the text of all `figcaption` elements with newlines and drops everything up to
     * and including the first `:` (the whole text is kept when there is no colon).
     */
    private fun Element.extractText(): String? {
        return getElementsByTag("figcaption")
                .joinToString("\n") { it.wholeText().orEmpty() }
                .substringAfter(":")
    }

    /**
     * Extracts the publish time in epoch milliseconds (interpreted as UTC).
     *
     * Two markups are supported:
     * - a `post-date` element whose text looks like `Feb 7th, 2020` (any ordinal suffix);
     * - a `time-posted` element whose `title` looks like `1:23 pm, Friday, February 7, 2020`.
     *
     * @return epoch milliseconds, or null when neither node exists or its value cannot be parsed
     */
    private fun Element.extractPublishedDate(): Long? {
        val postDate = getSingleElementByClassOrNull("post-date")
        val timePosted = getSingleElementByClassOrNull("time-posted")

        return when {

            postDate != null -> postDate
                    .wholeText()
                    .trim()
                    // "1st", "2nd", "3rd", "7th" -> "1", "2", "3", "7" so that every day of month parses
                    .replace(ORDINAL_SUFFIX_REGEX, "\$1")
                    .let { runCatching { LocalDate.parse(it, DATE_FORMATTER) }.getOrNull() }
                    ?.atStartOfDay()
                    ?.toEpochSecond(UTC)
                    ?.times(1000)

            timePosted != null -> timePosted
                    .attr("title")
                    // the formatter matches am/pm markers case-sensitively and expects upper case
                    .replace("am", "AM")
                    .replace("pm", "PM")
                    .let { runCatching { LocalDateTime.parse(it, DATE_TIME_FORMATTER) }.getOrNull() }
                    ?.toEpochSecond(UTC)
                    ?.times(1000)

            else -> null
        }
    }

    /**
     * Reads the note counter from the `post-notes` element (or `note-count` as a fallback).
     *
     * The first whitespace-separated token of the node text is taken and `,` / `.` group
     * separators are stripped before parsing.
     *
     * @return parsed count, or 0 when the node is missing or its text is not a number
     */
    private fun Element.extractNotes(): Int? {
        val notesNode = getSingleElementByClassOrNull("post-notes")
                ?: getSingleElementByClassOrNull("note-count")

        return notesNode
                ?.wholeText()
                // surrounding whitespace/newlines would otherwise produce an empty first token
                ?.trim()
                ?.split(WHITESPACE_REGEX)
                ?.firstOrNull()
                ?.replace(",", "")
                ?.replace(".", "")
                ?.toIntOrNull()
                ?: 0
    }

    /**
     * Builds one attachment per `figure` that contains a `video` or an `img`; other figures are skipped.
     *
     * The aspect ratio comes from the figure's `data-orig-width` / `data-orig-height` attributes and
     * falls back to [DEFAULT_POSTS_ASPECT_RATIO] when they are missing, non-numeric or not positive.
     */
    private fun Element.extractAttachments(): List<Attachment> {
        return getElementsByTag("figure").mapNotNull { f ->
            val video = f.getSingleElementByTagOrNull("video")
            val img = f.getSingleElementByTagOrNull("img")

            val type = when {
                video != null -> VIDEO
                img != null -> IMAGE
                else -> return@mapNotNull null
            }

            val url = when {
                video != null -> video.getSingleElementByTagOrNull("source")?.attr("src").orEmpty()
                else -> img?.attr("src").orEmpty()
            }

            val width = f.attr("data-orig-width").toDoubleOrNull()
            val height = f.attr("data-orig-height").toDoubleOrNull()
            val aspectRatio = when {
                // guard against zero/negative sizes, which would yield an infinite, NaN or negative ratio
                width != null && height != null && width > 0.0 && height > 0.0 -> width / height
                else -> DEFAULT_POSTS_ASPECT_RATIO
            }

            Attachment(
                    type = type,
                    url = url,
                    aspectRatio = aspectRatio
            )
        }
    }

    companion object {
        /** Day-of-month followed by an english ordinal suffix; group 1 is the bare number. */
        private val ORDINAL_SUFFIX_REGEX = Regex("(\\d+)(?:st|nd|rd|th)")
        private val WHITESPACE_REGEX = Regex("\\s+")

        /** Matches `post-date` text after the ordinal suffix has been stripped, e.g. `Feb 7, 2020`. */
        private val DATE_FORMATTER = DateTimeFormatter.ofPattern("MMM d, yyyy").withLocale(ENGLISH)

        /** Matches `time-posted` titles, e.g. `1:23 PM, Friday, February 7, 2020`. */
        private val DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("h:mm a, EEEE, MMMM d, yyyy").withLocale(ENGLISH)
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ class TwitterSkraper @JvmOverloads constructor(
return posts.map {
Post(
id = it.extractIdFromTweet(),
caption = it.extractCaptionFromTweet(),
text = it.extractCaptionFromTweet(),
rating = it.extractLikes(),
commentsCount = it.extractReplies(),
publishTimestamp = it.extractPublishedAtFromTweet(),
publishedAt = it.extractPublishedAtFromTweet(),
attachments = it.extractAttachmentsFromTweet()
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ class VkSkraper @JvmOverloads constructor(
return posts.map {
Post(
id = it.extractId(),
caption = it.extractCaption(),
publishTimestamp = it.extractPublishedDate(),
text = it.extractCaption(),
publishedAt = it.extractPublishedDate(),
rating = it.extractLikes(),
commentsCount = it.extractReplies(),
attachments = it.extractAttachments()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ class YoutubeSkraper @JvmOverloads constructor(

Post(
id = linkElement.parseId(),
caption = linkElement.parseCaption(),
publishTimestamp = it.parsePublishDate(),
text = linkElement.parseCaption(),
publishedAt = it.parsePublishDate(),
attachments = listOf(Attachment(
url = "${baseUrl}${linkElement?.attr("href")}",
type = VIDEO,
Expand Down
Loading