In [None]:
Ex_MOMO

# MOMO網頁重點：
### 1. 需使用User-Agent
在這裡沒有設定 User-Agent 會被拒絕連線，因此當我們無法成功取回資料時的第一個步驟一定是加上 User-Agent 來進行測試!
### 2. 使用手機板的網頁
電腦版的網頁不好爬，因為伺服器請求資料的方式是用 POST 的方式，而且POST過去的參數也相當複雜，我有嘗試 POST 這些資料過去但還是沒辦法正常請求到資料，估計還有其他的反爬蟲機制
但改成用手機版的網頁，請求資料的方式變成 GET，而且送的參數變得相當簡單，基本上就是查詢的關鍵詞和第一個分頁就可以了
### 3. 資料的整理會需要使用一些正則表達式(re.sub清除空白)
### 4. 處理不固定量的欄位資料(規格明細)，並且在串接回原資料
### 5. 商品資料用'meta'標籤抓，好像比較不會受到網頁標籤變化影響

# 網頁範例(用Pandas)
https://tlyu0419.github.io/2020/06/15/Crawler-momo/#more

In [43]:
import pandas as pd
from bs4 import BeautifulSoup
import json
import requests
import re


keyword = '羅技'
pages = 2
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}

urls = []
for page in range(1, pages):
    url = 'https://m.momoshop.com.tw/search.momo?_advFirst=N&_advCp=N&curPage={}&searchType=1&cateLevel=2&ent=k&searchKeyword={}&_advThreeHours=N&_isFuzzy=0&_imgSH=fourCardType'
    url = url.format(page, keyword)
    print(url)
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text)
        for item in soup.select('li.goodsItemLi > a'):
            urls.append('https://m.momoshop.com.tw'+item['href'])
    urls = list(set(urls))
    print(len(urls))
#     break

#爬取產品資料
df = []
for i, url in enumerate(urls):  #迴圈加index，可做為進度條使用。
    columns = []
    values = []
    
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text)
    # 標題
    title = soup.find('meta',{'property':'og:title'})['content']
    # 品牌
    brand = soup.find('meta',{'property':'product:brand'})['content']
    # 連結
    link = soup.find('meta',{'property':'og:url'})['content']
    # 原價
    try:
        price = re.sub(r'\r\n| ','',soup.find('del').text)  #\r回車:光標回到起始位，\n換行:光標移到下一行，\r\n就是Enter
    except:
        price = ''
    # 特價
    amount = soup.find('meta',{'property':'product:price:amount'})['content']
    # 類型
    cate = ''.join([i.text for i in soup.findAll('article',{'class':'pathArea'})])
    cate = re.sub('\n|\xa0',' ',cate)
    # 描述
    try:
        desc = soup.find('div',{'class':'Area101'}).text
        desc = re.sub('\r|\n| ', '', desc)
    except:
        desc = ''

    print('==================  {}  =================='.format(i))
    print(title)
    print(brand)
    print(link)
    print(amount)
    print(cate)

    columns += ['title', 'brand', 'link', 'price', 'amount', 'cate', 'desc'] #為何用+=(expand)，因為這裡只是把欄位當index用
    values += [title, brand, link, price, amount, cate, desc]

    # 規格明細：處理不固定量的欄位資料，可能有的多有的少
        #(善用欄位名稱在th、內容都在之下的li的特性，即可用迴圈自行生成)
    for i in soup.select('div.attributesArea > table > tr'):
        try:
            column = i.find('th').text  #欄位名稱都在'th'
            column = re.sub('\n|\r| ','',column)
            value = ''.join([j.text for j in i.findAll('li')])  #內容都在'li'，但可能不只一行，所以用迴圈再合併起來
            value = re.sub('\n|\r| ','',value) 
            columns.append(column)  #將規格明細的資料加入到原資料中
            values.append(value)
        except:
            pass
    ndf = pd.DataFrame(data=values, index=columns).T
    #用欄位當index好整合，但會變成直向，所以再用T(transpose)轉置:從直向轉橫向
    
    df.append(ndf)
    
df=pd.concat(df, ignore_index=True)  #產生一筆資料(已經脫離第一個迴圈)，將一筆一筆concat軸向連接 
    #ignore_index=True可以忽略合併時舊的 index 欄位，改採用自動產生的 index

df.info()

df.to_excel('MOMO.xlsx')

https://m.momoshop.com.tw/search.momo?_advFirst=N&_advCp=N&curPage=1&searchType=1&cateLevel=2&ent=k&searchKeyword=羅技&_advThreeHours=N&_isFuzzy=0&_imgSH=fourCardType
20
【Logitech 羅技】M331 SilentPlus 靜音滑鼠
Logitech 羅技
http://m.momoshop.com.tw/goods.momo?i_code=4318322
539
  電腦/週邊 >  滑鼠/鍵盤 >  館長推薦 >  羅技 ★刷mo卡回饋10%  
【Logitech 羅技】MK470 超薄無線鍵鼠組
Logitech 羅技
http://m.momoshop.com.tw/goods.momo?i_code=6772741
1,490
  電腦/週邊 >  滑鼠/鍵盤 >  Logitech 羅技 >  羅技全系列  
【Logitech 羅技】迷你型USB無線接受器
Logitech 羅技
http://m.momoshop.com.tw/goods.momo?i_code=5221064
299
  電腦/週邊 >  滑鼠/鍵盤 >  Logitech 羅技 >  羅技全系列  
【Logitech 羅技】K380 多工藍芽鍵盤
Logitech 羅技
http://m.momoshop.com.tw/goods.momo?i_code=7065344
990
  電腦/週邊 >  滑鼠/鍵盤 >  館長推薦 >  羅技 ★刷mo卡回饋10%  
【Logitech 羅技】MK270r無線鍵鼠組(黑色)
Logitech 羅技
http://m.momoshop.com.tw/goods.momo?i_code=3539798
729
  電腦/週邊 >  滑鼠/鍵盤 >  館長推薦 >  羅技 ★刷mo卡回饋10%  
【Logitech 羅技】MK235 無線鍵盤滑鼠組
Logitech 羅技
http://m.momoshop.com.tw/goods.momo?i_code=3889193
599
  電腦/週邊 >  滑鼠/鍵盤 >  Logitech 羅技 >  滑鼠  
【Lo

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




# 自行整理(無法抓meta標籤、好像也無法用正則)

In [40]:
import re, time, requests, csv
from bs4 import BeautifulSoup
import csv

#解析(MoMo要用headers)
def get_soup(url):
    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Mobile Safari/537.36"}
    res = requests.get(url, headers=headers)
    return BeautifulSoup(res.text, 'lxml')

#搜尋網址&換頁
def get_urls(url, query, start_page, end_page): 
    urls = []
    
    for page in range(start_page, end_page+1):
        urls.append(url.format(query, page))    #query帶入url的{0}、page帶入{1}
    print(urls)    
    return urls

# 依序爬取每頁點入網址
def FindLinks(pages):
    linklist = []
    for page in pages:  
        soup = get_soup(page)
        links = soup.find_all('li', "goodsItemLi")
        for link in links:
            k = "https://m.momoshop.com.tw" + link.find("a").get('href')
            linklist.append(k)
    return linklist

# 爬取點入分頁資料
def get_goods(url):
    goods = []
    rows = get_soup(url)

    for row in rows:

        try:
            name = row.find('h3', attrs={'id' : 'goodsName'}).text
        except:
            name = None

        try:
            price = re.sub(r'\r\n| ','',row.find('table', 'prdDetail bookDetail').td.text)
        except:
            price = None
            
        try:
            Sale_price = re.sub(r'\r\n| ','',row.find("td", class_="priceTxtArea").text) #\r回車:光標回到起始位，\n換行:光標移到下一行，\r\n就是Enter
        except:
            Sale_price = None
        
        # 產品描述
        try:
            desc = row.find('div',{'class':'Area101'}).text
            desc = re.sub('\r|\n| ', '', desc)
        except:
            desc = ''
        
        try:
            URL = url
        except:
            URL = None
            
        good= [name, price, Sale_price, desc, URL]
        goods.append(good)
        
    return goods[1]  #因為不知為何第[0]列都會出現一排None，只好取第[1]列

# 將每一個點入頁面的List依序爬取
def scraping(urls):
    all_goods = [["name","price","Sale_price", "desc","URL"]]
    
    for idx,i in enumerate(FindLinks(urls)):  #記錄目前進行的迴圈次數，配上總迴圈次數，可做為進度條使用。
        print("Crawing No." + str(idx+1) + " Item in Total:" + str(len(FindLinks(urls))) + "Item")
        
        goods = get_goods(i)
        time.sleep(0.2)
        all_goods.append(goods)
    return all_goods
#存成CSV
def save_to_csv(items, file):
    with open(file, "w+", newline="", encoding="utf_8_sig") as fp:  #utf_8_sig:能讓輸出的csv正確顯示中文(utf_8會有亂碼)
        writer = csv.writer(fp)
        for item in items:
            writer.writerow(item)
    
# 開始爬蟲
if __name__ == "__main__":
    """在電腦周邊中搜尋"""
    url = "https://m.momoshop.com.tw/search.momo?searchKeyword={0}&couponSeq=&searchType=1&cateLevel=-1&cateCode=-1&ent=k&_imgSH=fourCardStyle"
    
    urls = get_urls(url, "羅技", 0, 1)
    
    m = scraping(urls)
    save_to_csv(m, "m.csv")

['https://m.momoshop.com.tw/search.momo?searchKeyword=羅技&couponSeq=&searchType=1&cateLevel=-1&cateCode=-1&ent=k&_imgSH=fourCardStyle', 'https://m.momoshop.com.tw/search.momo?searchKeyword=羅技&couponSeq=&searchType=1&cateLevel=-1&cateCode=-1&ent=k&_imgSH=fourCardStyle']
Crawing No.1 Item in Total:40Item
Crawing No.2 Item in Total:40Item
Crawing No.3 Item in Total:40Item
Crawing No.4 Item in Total:40Item
Crawing No.5 Item in Total:40Item
Crawing No.6 Item in Total:40Item
Crawing No.7 Item in Total:40Item
Crawing No.8 Item in Total:40Item
Crawing No.9 Item in Total:40Item
Crawing No.10 Item in Total:40Item
Crawing No.11 Item in Total:40Item
Crawing No.12 Item in Total:40Item
Crawing No.13 Item in Total:40Item
Crawing No.14 Item in Total:40Item
Crawing No.15 Item in Total:40Item
Crawing No.16 Item in Total:40Item
Crawing No.17 Item in Total:40Item
Crawing No.18 Item in Total:40Item
Crawing No.19 Item in Total:40Item
Crawing No.20 Item in Total:40Item
Crawing No.21 Item in Total:40Item
Crawi