# Media Scraper
## Glossary
1. Story: a set of images on one topic, often spread across multiple pages

## Goal: *Download the best story under a certain tag automatically.*

## Procedures
1. Create a list of story URLs and their popularity for a tag, sorted by popularity in descending order
2. Test if story url is downloadable
3. Get story name to create a folder for the incoming images
4. Confirm series of pages for the same story
5. Download all images given a single story url, including multiple pages under a story
6. Save all images to the folder under the story

## Precautions
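1. Scraper interval: pause between requests, e.g. `time.sleep(0.5)`
2. FakeAgent: send a browser-like `User-Agent` header (via `fake_useragent`) so requests look like they come from a browser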

## Image source
1. http://www.bfpgf.com/fsh/82835.html/
2. http://93.t9p.today/forumdisplay.php?fid=19&page=1
3. Pinterest
4. Instagram
5. Weibo

## Improvements
- [x] How to add image number information to the story folder suffix?
- [ ] Update the tag folder regularly and show the latest images
- [ ] Popularity Trend of Story
- [ ] Download all images of a story on multiple pages (for competition or poll story)
- [ ] Given any url, download all images on that webpage, save to a given directory
- [ ] Download images from all image sources under the same tag
- [ ] Daily newsfeed for subscribers
- [ ] Performance Enhancement: Better move file
- [ ] Facial recognition by dlib to classify different stars
- [ ] Build as an API
- [ ] Web application: Django + Vue.JS

# Fuligets Scraper

In [2]:
# Fuligets Scrapper
# http://www.bfpgf.com/

# Input the full URL of the story's first page; the download loop below
# appends '/<page number>' to it to reach each page of the story
url = 'http://www.bfpgf.com/yld/82962.html'

######################
# Don't Change Below #
######################
from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
from tqdm import *
import requests, os, datetime, shutil, re, time

# Download File Common Function
def download_file(url, path):
    """Download ``url`` and save it inside the directory ``path``.

    The file name is the last path segment of the URL; a query string or
    fragment is dropped so it cannot end up in the file name.  The file is
    written straight into ``path`` instead of being created in the working
    directory and moved afterwards.

    Args:
        url: Address of the file to download.
        path: Existing directory that receives the file.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never stored as if it were the requested file.
    """
    from urllib.parse import urlsplit

    local_filename = urlsplit(url).path.split('/')[-1]
    target = os.path.join(path, local_filename)
    # stream=True keeps the body out of memory; the ``with`` block releases
    # the connection even when writing fails part-way.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(target, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

# Get Story Name
r = requests.get(url)
soup = BeautifulSoup(r.text,'lxml')
name = soup.find('h1',class_="article-title").find('a').get_text()

# Create Folder (exist_ok so the cell can be re-run for the same story)
os.makedirs(name, exist_ok=True)

# Confirm Pages
# Start from a single page so `pages` is always defined, even when the
# pagination block exists but no HTML comment carries the page count.
pages = 1
if len(soup.find_all('div',class_='wp-pagenavi')) == 1:
    page_count_pattern = re.compile(".*<span>(.*)</span>.*")
    for c in soup.find_all(string=lambda text:isinstance(text,Comment)):
        # The page count is read from the HTML comment that mentions this story's url
        if url in c:
            found = page_count_pattern.match(str(c))
            if found:
                pages = int(found.group(1))

# Download Images
# Build the UserAgent helper once instead of once per page.
ua = UserAgent()
# Avoid a double slash when the input url already ends with '/'.
base_url = url.rstrip('/')
for i in tqdm(range(1,pages+1,1)):
    url_idv = base_url+'/'+str(i)
    headers = {'User-Agent': str(ua.chrome)}

    # Pass the headers by keyword: the second positional parameter of
    # requests.get is `params`, which would put them in the query string.
    r = requests.get(url_idv, headers=headers)
    soup = BeautifulSoup(r.text,'lxml')
    article = soup.find('article',class_='article-content')
    if article is None:
        print ('No article content found on', url_idv)
    else:
        for img in article.find_all('img'):
            # One broken image should not abort the rest of the story
            try:
                download_file('http://www.bfpgf.com'+img['src'],name)
            except requests.RequestException as e:
                print (str(e), img['src'])
    time.sleep(0.01)

Error occurred during loading data. Trying to use cache server https://fake-useragent.herokuapp.com/browsers/0.1.8
Traceback (most recent call last):
  File "/anaconda/lib/python3.6/urllib/request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/anaconda/lib/python3.6/http/client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/anaconda/lib/python3.6/http/client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/anaconda/lib/python3.6/http/client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/anaconda/lib/python3.6/http/client.py", line 1026, in _send_output
    self.send(msg)
  File "/anaconda/lib/python3.6/http/client.py", line 964, in send
    self.connect()
  File "/anaconda/lib/python3.6/http/client.py", line 1392, in connect
    super().connect()
  File "/anaconda/lib/python3.6/h

# 91 Scraper: Image

In [None]:
# 91 Scrapper: Image
# http://93.t9p.today/index.php

from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
from tqdm import *
import requests, os, datetime, shutil, re, time
import pandas as pd

# Create a list of story urls for a tag, ranked by popularity; only the top of the list is kept and undownloadable stories are excluded
def get_stories_by_tag(tag, top_number):
    """Collect the most popular downloadable stories of a tag into a CSV file.

    Every listing page of ``tag`` is walked and each linked story page is
    fetched.  Stories that show an error alert, are locked, or contain no
    images are dropped.  The remainder is de-duplicated by url, sorted by
    popularity (descending), cut to ``top_number`` rows, stripped of rows
    whose title contains '删帖', and written to
    ``'91论坛' + tag + '精选.csv'`` (gb18030 encoded) in the current
    working directory.

    Args:
        tag: Tag name as used in the forum's ``tag.php?name=`` url.
        top_number: Number of stories to keep; ``None`` keeps all of them.

    Returns:
        None.  The result is the CSV file.  On failure a message is printed
        and no file is written.
    """
    site = 'http://93.t9p.today/'

    def fetch_soup(page_url):
        # Pass the headers by keyword: the second positional parameter of
        # requests.get is `params`, which would put them in the query string.
        headers = {'User-Agent': str(ua.chrome)}
        r = requests.get(page_url, headers=headers)
        r.encoding = 'utf-8'
        return BeautifulSoup(r.text, 'lxml')

    try:
        # Build the UserAgent helper once instead of once per request
        ua = UserAgent()

        # Get total pages number of a tag
        soup = fetch_soup(site + 'tag.php?name=' + tag)
        pager = soup.find('div', class_='pages_btns')
        if pager is None:
            # NOTE(review): a tag whose listing fits on one page may also
            # lack this div - confirm against the site.
            print ('The input tag does not exist. Please input a new tag.')
            return
        pages = 1 + len(pager.find_all('a', class_=False))

        # Collect story name, popularity, url and image count row by row;
        # DataFrame.append was removed in pandas 2.0.
        rows = []
        # Listing pages are numbered from 1; range(pages) asked for page 0
        # and never reached the last page.
        for page in tqdm(range(1, pages + 1)):
            soup_tag = fetch_soup(site + 'tag.php?name=' + tag + '&page=' + str(page))
            for story in soup_tag.find_all('tbody'):
                link = story.find('a')
                nums = story.find('td', class_='nums')
                popularity = nums.find('em') if nums is not None else None
                if link is None or popularity is None:
                    # Not a story row; skip it instead of aborting the run
                    continue
                story_url = site + link['href']
                soup_url = fetch_soup(story_url)

                # Filter inaccessible stories
                if soup_url.find('div', class_='alert_error'):
                    continue
                first_post = soup_url.find('div', class_='postmessage firstpost')
                if first_post is not None and first_post.find('div', class_='locked'):
                    continue
                images = soup_url.find_all('img', file=True, width=True, id=True, alt=True)
                if not images:
                    continue

                rows.append({
                    'name': link.get_text(),
                    'popularity': int(popularity.get_text()),
                    'url': story_url,
                    'image_number': len(images),
                })

        df_urls = pd.DataFrame(rows, columns=['name','popularity','url','image_number'])
        # Order by popularity, only keep the top_number of stories
        df_urls = df_urls.drop_duplicates(subset='url').sort_values(by='popularity',ascending=False).iloc[:top_number,:]
        # Remove posts whose title contains '删帖' (post deleted)
        title_has_mark = df_urls['name'].apply(lambda x: '删帖' in x).astype(bool)
        df_urls = df_urls[~title_has_mark]
        df_urls.index = range(len(df_urls))
        df_urls.to_csv('91论坛'+tag+'精选.csv', encoding='gb18030')

    except Exception as e:
        # Notebook-level boundary: report the real cause instead of always
        # blaming the tag.  Unlike a bare `except`, a keyboard interrupt
        # still stops the run.
        print ('Failed to collect stories for tag', tag, ':', e)

# Running total of images handled by download_story() in this session
image_number_total = 0

# Download all images of a story and save them to one folder
def download_story(url):
    """Download every image of one story into a folder named after the story.

    The folder is created in the current working directory, named after the
    page title (text before the first '-'), and renamed to
    ``<name>[<count>P]`` once all images have been downloaded.  The global
    ``image_number_total`` is increased by the number of images.  A story
    without images creates no folder.

    Args:
        url: Address of the story page.

    Raises:
        requests.HTTPError: If an image request is answered with an error
            status.
        OSError: If the folder cannot be created or renamed (for example
            when the renamed folder already exists from an earlier run).
    """
    from urllib.parse import urljoin, urlsplit

    global image_number_total

    # Download File Common Function
    def download_file(url, path):
        # Drop any query string so it cannot end up in the file name
        local_filename = urlsplit(url).path.split('/')[-1]
        # stream=True keeps the body out of memory; the ``with`` block
        # releases the connection even when writing fails part-way.
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Write straight into the story folder (no temp file + move)
            with open(os.path.join(path, local_filename), 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)

    # Get Story Name
    headers = {'User-Agent': str(UserAgent().chrome)}
    # Pass the headers by keyword: the second positional parameter of
    # requests.get is `params`, which would put them in the query string.
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text,'lxml')
    name = soup.find('title').get_text().split('-')[0].strip()

    # Create Folder, only when there is at least one image in the story
    images = soup.find_all('img',file=True,width=True,id=True,alt=True)
    if not images:
        return
    os.makedirs(name, exist_ok=True)

    # Download Images and Count Numbers of Images
    image_number = 0
    for img in images:
        image_number += 1
        # urljoin also copes with a `file` attribute that is already absolute
        download_file(urljoin('http://93.t9p.today/', img['file']), name)
        time.sleep(0.05)  # Scraping too fast causes server errors
    # Rename Folder and Add Image Counts Suffix
    os.rename(name,name+'['+str(image_number)+'P]')

    image_number_total += image_number

In [None]:
# 91 Scraper: Image
# Pick a tag and how many of its most popular stories to download.
# Each story is saved in its own folder, created in the directory this
# Jupyter Notebook runs from.

##### INPUT AREA ######
tag_input = '模特' # Name the tag you want
top_story_num_input = 5 # 'None': for all stories; 5: for top 5 stories of the tag
#######################

# Step 1: write the ranked story list of the tag to a CSV file
get_stories_by_tag(tag_input, top_story_num_input)

# Step 2: read the story urls back from that CSV file
csv_name = '91论坛' + tag_input + '精选.csv'
stories = pd.read_csv(csv_name, encoding='gb18030')
list_urls = stories['url'].tolist()

# Step 3: download story by story; a failing story is reported and skipped
for url in tqdm(list_urls):
    try:
        download_story(url)
    except Exception as e:
        print (str(e), url)

# Total number of images under the tag
print (tag_input, '标签共有图片：', image_number_total)

# 91 Scraper: Video

In [4]:
# 91 Scrapper: Video

from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
from tqdm import *
import requests, os, datetime, shutil, re, time
import pandas as pd

# Download File Common Function
def download_file(url):
    """Download ``url`` into the current working directory.

    The file name is the last path segment of the URL; a query string or
    fragment is dropped so it cannot end up in the file name.

    Args:
        url: Address of the file to download.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never stored as if it were the requested file.
    """
    from urllib.parse import urlsplit

    local_filename = urlsplit(url).path.split('/')[-1]
    # stream=True keeps the body out of memory; the ``with`` block releases
    # the connection even when writing fails part-way.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

def get_video_url(url):
    """Return the MP4 address found on a video page.

    Args:
        url: Address of the video's page.

    Returns:
        The ``src`` attribute of the first ``<source type="video/mp4">``
        tag on the page, or ``None`` when the page has no such tag.
    """
    headers = {'User-Agent': str(UserAgent().chrome)}
    # Pass the headers by keyword: the second positional parameter of
    # requests.get is `params`, which would put them in the query string.
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text,'lxml')
    source = soup.find('source', src=True, type='video/mp4')
    # The lookup result used to be discarded, so callers always got None
    return source['src'] if source is not None else None

In [5]:
# Run get_video_url() on a sample video page; as the last expression of the
# cell, its return value is what the notebook displays
get_video_url('http://91porn.com/view_video.php?viewkey=49d446466df6c7912edb&page=1&viewtype=basic&category=hot')

In [7]:
url = 'http://91porn.com/view_video.php?viewkey=49d446466df6c7912edb&page=1&viewtype=basic&category=hot'

# Get the video url from the video's page url
headers = {'User-Agent': str(UserAgent().chrome)}
# Pass the headers by keyword: the second positional parameter of
# requests.get is `params`, which would put them in the query string.
r = requests.get(url, headers=headers)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text,'lxml')
# Last expression of the cell: the notebook displays the matching <source> tag
soup.find('source', src=True, type='video/mp4')

In [11]:
# Download one video file; download_file() stores it in the current working directory.
# NOTE(review): the `st` / `e` query values look like a signed, expiring link -
# confirm the address is still valid before re-running this cell.
download_file('http://185.38.13.159//mp43/253097.mp4?st=oxFGYsXz1lcmhpHOpr8-8Q&e=1520023447')