## 油價

In [35]:
import traceback
import requests
import pandas as pd
import json
import xml.etree.ElementTree as etree

class OilDataSet:
    """Small HTTP client for the CPC oil list-price open-data endpoints.

    ``get_oil_price`` downloads an XML price list and renders it as one text
    message; ``oil_apiConnect`` downloads a JSON document and decodes it.
    """

    # Class-level default; every instance overwrites it in __init__.
    url = ''
    # Browser-like User-Agent header sent with every request.
    UA = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.13 Safari/537.36"
    # Child tags read from each record of the XML price list.
    COLUMNS = ("型別名稱", "產品編號", "產品名稱", "包裝", "銷售對象", "交貨地點",
               "計價單位", "參考牌價", "營業稅", "貨物稅", "牌價生效時間", "備註")
    # Seconds to wait for the server; without a timeout requests can block forever.
    TIMEOUT = 10

    def __init__(self, url, UA=UA):
        """Remember the endpoint ``url`` and build the request headers from ``UA``."""
        self.url = url
        self.headers = {'user-agent': UA}

    @staticmethod
    def _child_text(node, tag):
        """Return the text of child ``tag`` of ``node``, or None when the child is absent."""
        child = node.find(tag)
        return child.text if child is not None else None

    @classmethod
    def _parse_records(cls, xml_text):
        """Parse the XML price list into a list of dicts keyed by ``COLUMNS``.

        Each direct child of the XML root is treated as one record.  A tag
        that is missing from a record yields ``None`` instead of failing the
        whole parse.
        """
        root = etree.fromstring(xml_text)
        return [
            {tag: cls._child_text(node, tag) for tag in cls.COLUMNS}
            for node in root
        ]

    @classmethod
    def _format_prices(cls, xml_text):
        """Render every record of ``xml_text`` as ``name:unit:price`` message lines."""
        # The trailing '%0D%0A' is a URL-encoded CRLF used by the LINE output.
        return ''.join(
            '{}:{}:{} １%0D%0A'.format(record["產品名稱"], record["計價單位"], record["參考牌價"])
            for record in cls._parse_records(xml_text)
        )

    def get_oil_price(self):
        """Download the XML price list and return it as a message string.

        Returns:
            One ``product:unit:price`` entry per record, or a user-facing
            error message when the download or the parsing fails (this
            method never raises for ordinary errors).
        """
        try:
            response = requests.get(self.url, headers=self.headers, timeout=self.TIMEOUT)
            # The original also built a pandas DataFrame row by row with
            # DataFrame.append (removed in pandas 2.0) and never returned it;
            # _parse_records() gives the same rows if a frame is ever needed.
            return self._format_prices(response.text)

        except (ValueError, EOFError, KeyboardInterrupt):
            errorMsg = '數值錯誤!請稍晚再試'
            return errorMsg

        except Exception:
            # Network failures, malformed XML, etc.
            errorMsg = '資料擷取錯誤，請稍晚再試!'
            return errorMsg

    def oil_apiConnect(self):
        """Fetch ``self.url`` from the government open-data API and decode it as JSON.

        Returns:
            The decoded JSON document.

        Raises:
            Whatever ``requests.get`` raises on network failure, and
            ``ValueError`` when the body is not valid JSON.
        """
        res = requests.get(self.url, headers=self.headers, timeout=self.TIMEOUT)
        # Decode the body as UTF-8 regardless of what the response headers say.
        res.encoding = 'utf-8'

        return json.loads(res.text)

In [36]:
# Alternative: fetch the CPC XML price list and format it as a text message.
#test = OilDataSet('https://vipmember.tmtd.cpc.com.tw/opendata/ListPriceWebService.asmx/getCPCMainProdListPrice_XML')
#content = test.get_oil_price()

# Fetch the data.gov.tw record for dataset 6339 as decoded JSON and print it.
test = OilDataSet('https://data.gov.tw/api/v1/rest/dataset/6339')
content = test.oil_apiConnect()
print(content)


{'help': '', 'success': True, 'result': {'categoryCode': 'I00', 'identifier': '313504100K-000002', 'title': '中油主產品牌價', 'description': '蒐集目的：本資料集主要提供台灣中油公司汽油、柴油、燃料油等油品最新零售牌價資訊。', 'fieldDescription': '型別名稱、產品編號、產品名稱、包裝、銷售對象、交貨地點、計價單位、參考牌價、營業稅、貨物稅、牌價生效時間、備註', 'type': 'rawData', 'license': '政府資料開放授權條款-第1版', 'licenseURL': '', 'cost': '免費', 'costURL': '', 'costLaw': '', 'organization': '', 'organizationContactName': '', 'organizationContactPhone': '', 'organizationContactEmail': '', 'publisher': '台灣中油股份有限公司油品行銷事業部', 'publisherContactName': '張先生', 'publisherContactPhone': '02-87258672', 'publisherContactEmail': '101109@cpc.com.tw', 'publisherOID': '2.16.886.101.20003.20007.20123.20006', 'publisherOrgCode': '313504100K', 'accrualPeriodicity': '每周', 'temporalCoverageFrom': '', 'temporalCoverageTo': '', 'issued': '2013-10-02 08:00:00', 'modified': '2019-11-12 10:07:45', 'spatial': '', 'language': '中文', 'keyword': ['牌價'], 'numberOfData': '12', 'notes': '授權說明網址: http://data.gov.tw/license', 'viewC

## PTT

In [23]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function

import os
import re
import sys
import json
import requests
import argparse
import time
import codecs
from bs4 import BeautifulSoup
from six import u

__version__ = '1.0'

# VERIFY is the `verify` flag handed to requests.get(): on for Python 3,
# off for Python 2, where the resulting urllib3 warnings are silenced too.
VERIFY = sys.version_info[0] >= 3
if not VERIFY:
    requests.packages.urllib3.disable_warnings()


class PttWebCrawler(object):
    """Crawler for the web version of PTT that dumps articles to JSON files.

    Constructed with ``as_lib=False`` (the default) it behaves like a
    command-line tool: the arguments are parsed and the requested index pages
    or article are crawled immediately.  With ``as_lib=True`` nothing is
    crawled and the methods can be called directly.
    """

    PTT_URL = 'https://www.ptt.cc'

    def __init__(self, cmdline=None, as_lib=False):
        """Parse the command line and run the requested crawl.

        Args:
            cmdline: Argument list to parse instead of ``sys.argv``.
            as_lib: When true, skip argument parsing and crawling entirely.
        """
        parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
            A crawler for the web version of PTT, the largest online community in Taiwan.
            Input: board name and page indices (or articla ID)
            Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)
        ''')
        parser.add_argument('-b', metavar='BOARD_NAME', help='Board name', required=True)
        group = parser.add_mutually_exclusive_group(required=True)
        group.add_argument('-i', metavar=('START_INDEX', 'END_INDEX'), type=int, nargs=2, help="Start and end index")
        group.add_argument('-a', metavar='ARTICLE_ID', help="Article ID")
        parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)

        if not as_lib:
            if cmdline:
                args = parser.parse_args(cmdline)
            else:
                args = parser.parse_args()
            board = args.b
            if args.i:
                start = args.i[0]
                # An END_INDEX of -1 means "up to the newest index page".
                if args.i[1] == -1:
                    end = self.getLastPage(board)
                else:
                    end = args.i[1]
                self.parse_articles(start, end, board)
            else:  # args.a
                article_id = args.a
                self.parse_article(article_id, board)

    def parse_articles(self, start, end, board, path='.', timeout=3):
        """Crawl index pages ``start``..``end`` of ``board`` into one JSON file.

        The file ``<board>-<start>-<end>.json`` under ``path`` holds
        ``{"articles": [...]}``.  Index pages that do not answer 200, entries
        without a link (deleted articles) and articles that fail to parse are
        skipped.

        Returns:
            The path of the file that was written.
        """
        filename = board + '-' + str(start) + '-' + str(end) + '.json'
        filename = os.path.join(path, filename)
        self.store(filename, u'{"articles": [', 'w')
        # The separator is written *before* every article except the first, so
        # the array stays valid JSON no matter which entries get skipped.
        wrote_any = False
        for index in range(start, end + 1):
            print('Processing index:', str(index))
            resp = requests.get(
                url=self.PTT_URL + '/bbs/' + board + '/index' + str(index) + '.html',
                cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
            )
            if resp.status_code != 200:
                print('invalid url:', resp.url)
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            for div in soup.find_all("div", "r-ent"):
                # ex. link would be <a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a>
                anchor = div.find('a')
                if anchor is None or not anchor.get('href'):
                    continue  # entry without a link, e.g. a deleted article
                href = anchor['href']
                link = self.PTT_URL + href
                article_id = re.sub(r'\.html', '', href.split('/')[-1])
                try:
                    article = self.parse(link, article_id, board, timeout=timeout)
                except Exception as exc:
                    # Best effort: report and skip a single broken article
                    # instead of aborting the crawl.
                    print('failed to parse article:', article_id, exc)
                    continue
                if wrote_any:
                    self.store(filename, u',\n', 'a')
                self.store(filename, article, 'a')
                wrote_any = True
            time.sleep(0.1)  # be polite to the server between index pages
        self.store(filename, u']}', 'a')
        return filename

    def parse_article(self, article_id, board, path='.'):
        """Crawl one article into ``<board>-<article_id>.json`` and return that path."""
        link = self.PTT_URL + '/bbs/' + board + '/' + article_id + '.html'
        filename = board + '-' + article_id + '.json'
        filename = os.path.join(path, filename)
        self.store(filename, self.parse(link, article_id, board), 'w')
        return filename

    @staticmethod
    def parse(link, article_id, board, timeout=3):
        """Download the article at ``link`` and return it as a JSON string.

        The JSON object carries the url, board, article id, title, author,
        date, cleaned content, poster ip, the push messages and their counts.
        A non-200 response yields ``{"error": "invalid url"}`` instead.
        """
        print('Processing article:', article_id)
        resp = requests.get(url=link, cookies={'over18': '1'}, verify=VERIFY, timeout=timeout)
        if resp.status_code != 200:
            print('invalid url:', resp.url)
            return json.dumps({"error": "invalid url"}, sort_keys=True, ensure_ascii=False)
        soup = BeautifulSoup(resp.text, 'html.parser')
        main_content = soup.find(id="main-content")
        metas = main_content.select('div.article-metaline')

        def meta_value(position):
            # Value of the position-th meta line, or '' when the line or its
            # value span is missing (the original indexed blindly).
            if position >= len(metas):
                return ''
            values = metas[position].select('span.article-meta-value')
            return values[0].string if values else ''

        author = ''
        title = ''
        date = ''
        if metas:
            author = meta_value(0)
            title = meta_value(1)
            date = meta_value(2)

            # remove meta nodes
            for meta in metas:
                meta.extract()
            for meta in main_content.select('div.article-metaline-right'):
                meta.extract()

        # remove and keep push nodes
        pushes = main_content.find_all('div', class_='push')
        for push in pushes:
            push.extract()

        try:
            ip = main_content.find(text=re.compile(u'※ 發信站:'))
            ip = re.search(r'[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*', ip).group()
        except (AttributeError, TypeError):
            # No station line found (TypeError) or no dotted address in it
            # (AttributeError).
            ip = "None"

        # Drop the '※ 發信站:' line (starts with u'\u203b'), the '◆ From:' line
        # (starts with u'\u25c6'), signature separators and extra whitespace.
        # Keep alphanumerics, Chinese text and punctuation, URLs and a few symbols.
        filtered = [v for v in main_content.stripped_strings if v[0] not in [u'※', u'◆'] and v[:2] not in [u'--']]
        # NOTE(review): '/-_' inside the character class is a range, not three
        # literal characters; left untouched so the output stays the same.
        expr = re.compile(u(r'[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\s\w:/-_.?~%()]'))
        filtered = [re.sub(expr, '', piece) for piece in filtered]

        filtered = [_f for _f in filtered if _f]  # remove empty strings
        filtered = [x for x in filtered if article_id not in x]  # remove last line containing the url of the article
        content = ' '.join(filtered)
        content = re.sub(r'(\s)+', ' ', content)

        def span_text(node, css_class):
            # Joined text of the span, or '' when the span is missing; unlike
            # `.string` this also works when the span has nested tags.
            span = node.find('span', css_class)
            return u' '.join(span.strings) if span is not None else u''

        # push messages
        p, b, n = 0, 0, 0
        messages = []
        for push in pushes:
            if push.find('span', 'push-tag') is None:
                continue
            push_tag = span_text(push, 'push-tag').strip(' \t\n\r')
            push_userid = span_text(push, 'push-userid').strip(' \t\n\r')
            push_content = span_text(push, 'push-content')[1:].strip(' \t\n\r')  # remove ':'
            push_ipdatetime = span_text(push, 'push-ipdatetime').strip(' \t\n\r')
            messages.append({'push_tag': push_tag, 'push_userid': push_userid, 'push_content': push_content, 'push_ipdatetime': push_ipdatetime})
            if push_tag == u'推':
                p += 1
            elif push_tag == u'噓':
                b += 1
            else:
                n += 1

        # count: pushes minus boos; all: total number of push messages
        Message_push_count = p + b + n
        message_count = {'all': p + b + n, 'count': p - b, 'push': p, 'boo': b, "neutral": n}

        # json data
        data = {
            'url': link,
            'board': board,
            'article_id': article_id,
            'article_title': title,
            'author': author,
            'date': date,
            'content': content,
            'ip': ip,
            'Message-push-count': Message_push_count,
            'message_count': message_count,
            'messages': messages
        }
        return json.dumps(data, sort_keys=True, ensure_ascii=False)

    @staticmethod
    def _last_page_from_html(content, board):
        """Return the newest index number given the HTML of ``index.html``.

        The "previous page" link carries the number of the page before the
        newest one, so the result is that number plus one; 1 when no such
        link is present.
        """
        first_page = re.search(r'href="/bbs/' + re.escape(board) + r'/index(\d+)\.html">&lsaquo;', content)
        if first_page is None:
            return 1
        return int(first_page.group(1)) + 1

    @staticmethod
    def getLastPage(board, timeout=3):
        """Fetch the index page of ``board`` and return its newest index number."""
        content = requests.get(
            url=PttWebCrawler.PTT_URL + '/bbs/' + board + '/index.html',
            cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
        ).content.decode('utf-8')
        return PttWebCrawler._last_page_from_html(content, board)

    @staticmethod
    def store(filename, data, mode):
        """Write ``data`` to ``filename`` as UTF-8 using file mode ``mode``."""
        with codecs.open(filename, mode, encoding='utf-8') as f:
            f.write(data)

    @staticmethod
    def get(filename, mode='r'):
        """Load and return the JSON document stored in the UTF-8 file ``filename``."""
        with codecs.open(filename, mode, encoding='utf-8') as f:
            return json.load(f)

# Script entry point: with no arguments PttWebCrawler() parses sys.argv and
# crawls what was requested.  Inside a notebook kernel the required -b and
# -i/-a options are absent, so argparse prints its usage and raises SystemExit.
if __name__ == '__main__':
    c = PttWebCrawler()



usage: ipykernel_launcher.py [-h] -b BOARD_NAME
                             (-i START_INDEX END_INDEX | -a ARTICLE_ID) [-v]
ipykernel_launcher.py: error: the following arguments are required: -b


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [29]:
import json
board = 'Gossiping'
# Fixed: the original passed the misspelled, undefined name `borad`.
LastPage = PttWebCrawler.getLastPage(board)
# Crawl the two newest index pages; the crawler writes
# '<board>-<start>-<end>.json' into the current directory.
PttWebCrawler(['-b', board, '-i', str(LastPage-1), str(LastPage)])

# Read the file back through the crawler's own helper so it is decoded as
# UTF-8, matching how it was written, whatever the platform default is.
filename = board + '-' + str(LastPage-1) + '-' + str(LastPage) + '.json'
data = PttWebCrawler.get(filename)

# Entries such as {"error": "invalid url"} carry no article fields; drop them.
articles = [article for article in data['articles'] if 'error' not in article]

# Sort by number of push messages, most pushed first.
sorted_data = sorted(articles, key=lambda x: x.get('Message-push-count', 0), reverse=True)

# LINE cannot carry too much text, so only the top 15 articles are kept.
content = ''.join(
    '[{} push] {}\n{}\n\n'.format(article.get('Message-push-count', None), article.get('article_title', None),
                                  article.get('url', None))
    for article in sorted_data[0:15]
)

print(content)

[79 push] 
https://www.ptt.cc/bbs/Gossiping/M.1588288876.A.14C.html

[39 push] Re: [新聞] 勞工紓困貸款10萬元 1.9萬人申貸 首波核
https://www.ptt.cc/bbs/Gossiping/M.1588753693.A.10F.html

[32 push] [爆卦] 俄羅斯新增武漢肺炎連4天破10000人啦！
https://www.ptt.cc/bbs/Gossiping/M.1588753416.A.0FB.html

[32 push] [問卦] 100萬內買什麼車是冤大頭？
https://www.ptt.cc/bbs/Gossiping/M.1588753501.A.B75.html

[32 push] [新聞] 立委溝通公衛師法 莊競程助理爆罵邱泰源
https://www.ptt.cc/bbs/Gossiping/M.1588753701.A.80F.html

[29 push] [問卦] 35歲以上還在當業務助理的單身台女是？
https://www.ptt.cc/bbs/Gossiping/M.1588753569.A.884.html

[29 push] [問卦] 消費券858億 vs 紓困方案1兆5百億 
https://www.ptt.cc/bbs/Gossiping/M.1588754020.A.626.html

[28 push] [問卦] 太七的女生是不是很賤?
https://www.ptt.cc/bbs/Gossiping/M.1588754010.A.F41.html

[25 push] Re: [新聞] 張麗善籲：排富全面發放紓困金每人1萬元
https://www.ptt.cc/bbs/Gossiping/M.1588753290.A.0EA.html

[25 push] Re: [問卦] 北士商是一所怎麼樣的學校？
https://www.ptt.cc/bbs/Gossiping/M.1588753438.A.1A3.html

[23 push] Re: [爆卦] 中油部分站點 再度封站 問題持續無解中
https://www.ptt.cc/bbs/Gossiping/M.1588753278.A.791.htm

int