Skip to content
Permalink
Browse files

add 1pondo, add json parser

  • Loading branch information
denpaneko committed Mar 8, 2020
1 parent 62e6e74 commit b8e1800cd811aedd67e155cb15a34f6b9ef6a331
Showing with 294 additions and 127 deletions.
  1. +2 −1 scrape_test.go
  2. +69 −4 selector/selector.go
  3. +60 −2 selector/site.go
  4. +9 −9 sites/carib.go
  5. +9 −9 sites/caribpr.go
  6. +13 −13 sites/dmm.go
  7. +11 −11 sites/fantia.go
  8. +13 −13 sites/fc2.go
  9. +13 −13 sites/fc2club.go
  10. +13 −13 sites/getchu.go
  11. +13 −13 sites/heyzo.go
  12. +14 −14 sites/mgs.go
  13. +43 −0 sites/pondo.go
  14. +12 −12 sites/tokyo.go
@@ -16,7 +16,8 @@ func TestScrape(t *testing.T) {
//meta := Scrape(sites.Heyzo("2177"))
//meta := Scrape(sites.Fantia("8209"))
//meta := Scrape(sites.Getchu("19622"))
meta := Scrape(sites.Tokyo("n1236"))
//meta := Scrape(sites.Tokyo("n1236"))
meta := Scrape(sites.Pondo("020820_971"))

fmt.Println(meta.Json())
}
@@ -1,12 +1,14 @@
package selector

import (
"fmt"
"github.com/PuerkitoBio/goquery"
"reflect"
"regexp"
"strings"
)

type CssSelector struct {
type Selector struct {
Id *Item
Title *Item
Actor *Item
@@ -27,21 +29,31 @@ type Item struct {
attribute string
replacer *strings.Replacer
preset string
presets []string
matcher string
query string
}

func Selector(selector string) *Item {
// Select returns an Item that is resolved by running the given CSS selector
// against a document. The Item starts with no attribute, no preset, and a
// replacer built from a single empty old/new pair.
func Select(selector string) *Item {
	return &Item{
		selector: selector,
		replacer: strings.NewReplacer("", ""),
	}
}

// Preset returns an Item carrying a fixed string value; only the preset field
// is populated.
func Preset(preset string) *Item {
	item := new(Item)
	item.preset = preset
	return item
}

func Matcher(matcher string) *Item {
// Presets returns an Item carrying a fixed list of string values; only the
// presets field is populated. The slice is stored as given, not copied.
func Presets(presets []string) *Item {
	item := new(Item)
	item.presets = presets
	return item
}

// Match returns an Item with only its matcher field set to the given string.
func Match(matcher string) *Item {
	item := new(Item)
	item.matcher = matcher
	return item
}

// Query returns an Item with only its query field set; the value is used as
// the lookup key by the Item's Query and Queries methods.
func Query(query string) *Item {
	item := new(Item)
	item.query = query
	return item
}

func (selector Item) Replace(oldNew ...string) *Item {
selector.replacer = strings.NewReplacer(oldNew...)
return &selector
@@ -104,6 +116,10 @@ func (selector *Item) Values(doc *goquery.Document) []string {
return texts
}

if selector.presets != nil {
return selector.presets
}

doc.Find(selector.selector).Each(func(i int, selection *goquery.Selection) {
texts = append(texts, selector.textOrAttr(selection))
})
@@ -160,7 +176,56 @@ func (selector Item) Attrs(doc *goquery.Document, attr string) []string {
})
return attrs
}
func (selectors CssSelector) AddExtra(key string, selector *Item) CssSelector {

// Query resolves this Item to a single string using decoded JSON data.
//
// A nil Item yields "". A non-empty preset takes precedence over the data;
// otherwise the value stored under the Item's query key is looked up.
func (selector *Item) Query(data map[string]interface{}) string {
	switch {
	case selector == nil:
		return ""
	case len(selector.preset) > 0:
		return selector.preset
	default:
		return query(data, selector.query)
	}
}

// query returns the value stored under key in data, rendered as a string.
// A missing key or a nil value yields "".
//
// Strings are returned unchanged. Whole-number float64 values (the type
// encoding/json produces for every JSON number decoded into an interface) are
// rendered as plain integers, because the default %v formatting switches to
// exponent notation for large values (1e21 becomes "1e+21"). Everything
// else falls back to %v.
func query(data map[string]interface{}, key string) string {
	// Largest magnitude at which every whole float64 is exactly representable
	// and safely convertible to int64.
	const maxExactInt = 1 << 53

	value := data[key]
	switch v := value.(type) {
	case nil:
		return ""
	case string:
		return v
	case float64:
		if v >= -maxExactInt && v <= maxExactInt && v == float64(int64(v)) {
			return fmt.Sprintf("%d", int64(v))
		}
	}
	return fmt.Sprintf("%v", value)
}

// Queries resolves this Item to a list of strings using decoded JSON data.
//
// A nil Item yields an empty, non-nil slice. A non-nil presets list takes
// precedence over the data; otherwise the array stored under the Item's
// query key is looked up.
func (selector *Item) Queries(data map[string]interface{}) []string {
	switch {
	case selector == nil:
		return []string{}
	case selector.presets != nil:
		return selector.presets
	default:
		return queries(data, selector.query)
	}
}

// queries returns the elements of the slice stored under key in data, each
// rendered with %v. A missing key, a nil value, or a value that is not a
// slice yields a nil result.
//
// The elements are walked through reflection so that any slice type is
// accepted; a plain type assertion to []interface{} would panic on a value
// such as []string.
func queries(data map[string]interface{}, key string) []string {
	x := data[key]
	if x == nil {
		return nil
	}

	rv := reflect.ValueOf(x)
	// if the json value is not a slice then ignore it
	if rv.Kind() != reflect.Slice {
		return nil
	}

	var res []string
	for i := 0; i < rv.Len(); i++ {
		res = append(res, fmt.Sprintf("%v", rv.Index(i).Interface()))
	}
	return res
}

func (selectors Selector) AddExtra(key string, selector *Item) Selector {
if selectors.Extras == nil {
selectors.Extras = make(map[string]*Item)
}
@@ -2,6 +2,7 @@ package selector

import (
"bufio"
"encoding/json"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html/charset"
"io"
@@ -19,14 +20,72 @@ const MobileUserAgent string = "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like M

// Site describes one scrape target: where to fetch it from, how to fetch it,
// and the items used to extract metadata from the response.
type Site struct {
// Url is reported as Meta.Url by Meta when WebUrl is empty.
Url string
// WebUrl, when non-empty, is reported as Meta.Url in place of Url.
WebUrl string
UserAgent string
Charset string
Cookies []http.Cookie
CssSelector
// Json makes Meta use parseJson instead of parseHtml.
Json bool
// JsonData is the destination parseJson unmarshals the response body into.
JsonData interface{}
Selector
// Next is an optional further site; when set, parseJson passes the fields
// of its Meta as the second argument to oneOf/oneOfArray for each field.
Next *Site
}

// Meta scrapes the site and returns the extracted metadata.
//
// The response is parsed as JSON when site.Json is set and as HTML otherwise.
// The returned Meta.Url is site.WebUrl when that is non-empty, else site.Url.
func (site Site) Meta() Meta {
	var meta Meta
	if site.Json {
		meta = site.parseJson()
	} else {
		meta = site.parseHtml()
	}

	// Default to the fetched URL; a configured web URL overrides it.
	meta.Url = site.Url
	if len(site.WebUrl) > 0 {
		meta.Url = site.WebUrl
	}
	return meta
}

// parseJson reads the site's body, decodes it as JSON and builds a Meta by
// looking up each configured Item in the top-level JSON object. If the
// top-level value is not an object, every lookup sees an empty map.
//
// When site.Next is set, its Meta is computed first and each of its fields is
// passed as the second argument to oneOf/oneOfArray (presumably the fallback
// used when the JSON lookup yields nothing — confirm against oneOf).
//
// A failure to read the body or to decode it terminates the program through
// log.Fatal. Because site is a value receiver, the JsonData populated here is
// not visible to the caller.
func (site Site) parseJson() Meta {
	body, err := ioutil.ReadAll(site.Body())
	if err != nil {
		// Report the read failure itself rather than letting it be masked by
		// a decode error on a truncated body.
		log.Fatal(err)
	}
	if err := json.Unmarshal(body, &site.JsonData); err != nil {
		log.Fatal(err)
	}

	// Only a JSON object can be queried by key; anything else is treated as
	// an empty object.
	data, ok := site.JsonData.(map[string]interface{})
	if !ok {
		data = map[string]interface{}{}
	}

	var next Meta
	if site.Next != nil {
		next = site.Next.Meta()
	}

	// extract meta data from json data
	var meta Meta
	meta.Title = oneOf(site.Title.Query(data), next.Title)
	meta.Actor = oneOf(site.Actor.Query(data), next.Actor)
	meta.Poster = oneOf(site.Poster.Query(data), next.Poster)
	meta.Producer = oneOf(site.Producer.Query(data), next.Producer)
	meta.Sample = oneOf(site.Sample.Query(data), next.Sample)
	meta.Series = oneOf(site.Series.Query(data), next.Series)
	meta.Release = oneOf(site.Release.Query(data), next.Release)
	meta.Duration = oneOf(site.Duration.Query(data), next.Duration)
	meta.Id = oneOf(site.Id.Query(data), next.Id)
	meta.Label = oneOf(site.Label.Query(data), next.Label)
	meta.Genre = oneOfArray(site.Genre.Queries(data), next.Genre)
	meta.Images = oneOfArray(site.Images.Queries(data), next.Images)

	return meta
}

func (site Site) parseHtml() Meta {
var meta = Meta{}
// load the HTML document
doc, err := goquery.NewDocumentFromReader(site.Body())
@@ -53,7 +112,6 @@ func (site Site) Meta() Meta {
meta.Label = oneOf(site.Label.Value(doc), next.Label)
meta.Genre = oneOfArray(site.Genre.Values(doc), next.Genre)
meta.Images = oneOfArray(site.Images.Values(doc), next.Images)
meta.Url = site.Url

// extract extras to meta
if site.Extras != nil {
@@ -11,19 +11,19 @@ func Carib(id string) Site {
UserAgent: MobileUserAgent,
Charset: "euc-jp",

CssSelector: CssSelector{
Title: Selector("h1[itemprop=name]"),
Actor: Selector("a[itemprop=actor]"),
Selector: Selector{
Title: Select("h1[itemprop=name]"),
Actor: Select("a[itemprop=actor]"),
Poster: Preset(fmt.Sprintf("https://www.caribbeancom.com/moviepages/%s/images/l_l.jpg", id)),
Producer: Preset("Caribbean"),
Sample: Preset(fmt.Sprintf("https://smovie.caribbeancom.com/sample/movies/%s/480p.mp4", id)),
Series: Selector("a[onclick^=gaDetailEvent\\(\\'Series\\ Name\\']"),
Release: Selector("span[itemprop=datePublished]"),
Duration: Selector("span[itemprop=duration]"),
Series: Select("a[onclick^=gaDetailEvent\\(\\'Series\\ Name\\']"),
Release: Select("span[itemprop=datePublished]"),
Duration: Select("span[itemprop=duration]"),
Id: Preset(id),
Label: Selector("null"),
Genre: Selector("a[itemprop=genre]"),
Images: Selector("a[data-is_sample='1']").Attribute("href").Replace("/movie", "https://www.caribbeancom.com/movie"),
Label: Select("null"),
Genre: Select("a[itemprop=genre]"),
Images: Select("a[data-is_sample='1']").Attribute("href").Replace("/movie", "https://www.caribbeancom.com/movie"),
},
}
}
@@ -11,19 +11,19 @@ func CaribPr(id string) Site {
UserAgent: MobileUserAgent,
Charset: "euc-jp",

CssSelector: CssSelector{
Title: Selector("h1"),
Actor: Selector("a.spec-item[href^=\"/search/\"]"),
Selector: Selector{
Title: Select("h1"),
Actor: Select("a.spec-item[href^=\"/search/\"]"),
Poster: Preset(fmt.Sprintf("https://www.caribbeancompr.com/moviepages/%s/images/l_l.jpg", id)),
Producer: Preset("Caribbean"),
Sample: Preset(fmt.Sprintf("https://smovie.caribbeancompr.com/sample/movies/%s/480p.mp4", id)),
Series: Selector("a[href^=\"/serieslist/\"]"),
Release: Selector("div.movie-info > div > ul > li:nth-child(2) > span.spec-content"),
Duration: Selector("div.movie-info > div > ul > li:nth-child(3) > span.spec-content"),
Series: Select("a[href^=\"/serieslist/\"]"),
Release: Select("div.movie-info > div > ul > li:nth-child(2) > span.spec-content"),
Duration: Select("div.movie-info > div > ul > li:nth-child(3) > span.spec-content"),
Id: Preset(id),
Label: Selector("a[href^=\"/serieslist/\"]"),
Genre: Selector("a.spec-item[href^=\"/listpages/\"]"),
Images: Selector("a[data-is_sample='1']").Attribute("href"),
Label: Select("a[href^=\"/serieslist/\"]"),
Genre: Select("a.spec-item[href^=\"/listpages/\"]"),
Images: Select("a[data-is_sample='1']").Attribute("href"),
},
}
}
@@ -10,19 +10,19 @@ func Dmm(id string) Site {
Url: fmt.Sprintf("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=%s/", id),
UserAgent: MobileUserAgent,

CssSelector: CssSelector{
Title: Selector(".ttl-grp"),
Actor: Selector("ul.parts-maindata > li > a > span"),
Poster: Selector(".package").Replace("ps.jpg", "pl.jpg").Attribute("src"),
Producer: Selector(".parts-subdata"),
Sample: Selector(".play-btn"),
Series: Selector("#work-mono-info > dl:nth-child(4) > dd"),
Release: Selector("#work-mono-info > dl:nth-child(8) > dd"),
Duration: Selector("#work-mono-info > dl:nth-child(9) > dd"),
Id: Selector("#work-mono-info > dl:nth-child(10) > dd"),
Label: Selector("#work-mono-info > dl:nth-child(6) > dd > ul > li > a"),
Genre: Selector("#work-mono-info > dl.box-genreinfo > dd > ul > li > a"),
Images: Selector("#sample-list > ul > li > a > span > img").Replace("-", "jp-").Attribute("src"),
Selector: Selector{
Title: Select(".ttl-grp"),
Actor: Select("ul.parts-maindata > li > a > span"),
Poster: Select(".package").Replace("ps.jpg", "pl.jpg").Attribute("src"),
Producer: Select(".parts-subdata"),
Sample: Select(".play-btn"),
Series: Select("#work-mono-info > dl:nth-child(4) > dd"),
Release: Select("#work-mono-info > dl:nth-child(8) > dd"),
Duration: Select("#work-mono-info > dl:nth-child(9) > dd"),
Id: Select("#work-mono-info > dl:nth-child(10) > dd"),
Label: Select("#work-mono-info > dl:nth-child(6) > dd > ul > li > a"),
Genre: Select("#work-mono-info > dl.box-genreinfo > dd > ul > li > a"),
Images: Select("#sample-list > ul > li > a > span > img").Replace("-", "jp-").Attribute("src"),
},
}
}
@@ -13,17 +13,17 @@ func Fantia(id string) Site {
UserAgent: UserAgent,
Cookies: []http.Cookie{{Name: "_session_id", Value: "5602e9a9f48bba1997b07baca88e525f"}},

CssSelector: CssSelector{
Title: Selector(".product-title"),
Actor: Selector("h3.fanclub-name"),
Poster: Selector("img[src^=\"https://c.fantia.jp/uploads/product/image\"]").Attribute("src"),
Producer: Selector("h3.fanclub-name"),
Sample: Selector("null"),
Series: Selector("h3.fanclub-name"),
Id: Selector("a.btn.btn-default.btn-sm.btn-star").Attribute("data-product_id"),
Label: Selector("null"),
Genre: Selector("null"),
Images: Selector("img[src^=\"https://c.fantia.jp/uploads/product_image\"]").Attribute("src"),
Selector: Selector{
Title: Select(".product-title"),
Actor: Select("h3.fanclub-name"),
Poster: Select("img[src^=\"https://c.fantia.jp/uploads/product/image\"]").Attribute("src"),
Producer: Select("h3.fanclub-name"),
Sample: Select("null"),
Series: Select("h3.fanclub-name"),
Id: Select("a.btn.btn-default.btn-sm.btn-star").Attribute("data-product_id"),
Label: Select("null"),
Genre: Select("null"),
Images: Select("img[src^=\"https://c.fantia.jp/uploads/product_image\"]").Attribute("src"),
},
}
}
@@ -10,19 +10,19 @@ func Fc2(id string) Site {
Url: fmt.Sprintf("https://adult.contents.fc2.com/article/%s/", id),
UserAgent: MobileUserAgent,

CssSelector: CssSelector{
Title: Selector(".items_article_MainitemNameTitle"),
Actor: Selector(".items_article_seller").Replace("by ", ""),
Poster: Selector("meta[property^=\"og:image\"]").Attribute("content"),
Producer: Selector(".items_article_seller").Replace("by ", ""),
Sample: Selector(".main-video").Attribute("src"),
Series: Selector(".items_article_seller").Replace("by ", ""),
Release: Selector(".items_article_Releasedate").Replace("販売日 : ", ""),
Duration: Selector(".items_article_MainitemThumb > p"),
Id: Selector(".items_article_TagArea").Attribute("data-id"),
Label: Selector("null"),
Genre: Selector("null"),
Images: Selector("li[data-img^=\"https://storage\"]").Attribute("data-img"),
Selector: Selector{
Title: Select(".items_article_MainitemNameTitle"),
Actor: Select(".items_article_seller").Replace("by ", ""),
Poster: Select("meta[property^=\"og:image\"]").Attribute("content"),
Producer: Select(".items_article_seller").Replace("by ", ""),
Sample: Select(".main-video").Attribute("src"),
Series: Select(".items_article_seller").Replace("by ", ""),
Release: Select(".items_article_Releasedate").Replace("販売日 : ", ""),
Duration: Select(".items_article_MainitemThumb > p"),
Id: Select(".items_article_TagArea").Attribute("data-id"),
Label: Select("null"),
Genre: Select("null"),
Images: Select("li[data-img^=\"https://storage\"]").Attribute("data-img"),
},
}
}

0 comments on commit b8e1800

Please sign in to comment.
You can’t perform that action at this time.