In [1]:
import requests
import pandas as pd
import os
import time
from bs4 import BeautifulSoup
from datetime import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from selenium.common.exceptions import TimeoutException


In [2]:
"""
arg: driver, url
return: html_soup (the *_newdriver variant returns driver, html_soup)
"""


def get_feed_html_soup(driver, url):
    """Load ``url`` in ``driver`` and return the page parsed by BeautifulSoup.

    The page load is attempted up to three times. After each timeout the
    browser is asked to refresh before the next attempt.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        url: address of the feed (search-result) page.

    Returns:
        The parsed page, or None when every attempt timed out.
    """
    retries = 3

    for attempt in range(1, retries + 1):
        try:
            driver.get(url)
            html = driver.page_source.encode('utf-8')
            return BeautifulSoup(html, "html.parser")
        except TimeoutException:
            print('Timeout, Retrying... {} / {}'.format(attempt, retries))
            try:
                driver.refresh()
                print('driver was refreshed')
            except TimeoutException:
                # refresh() is a page load as well, so it can time out too.
                # Swallow that here so the retry loop keeps control instead
                # of the exception escaping from inside the handler.
                pass

    return None


def get_feed_html_soup_newdriver(driver, url):
    """Load ``url`` and parse it, replacing the browser whenever a load times out.

    On a TimeoutException the current driver is quit, a new one is created
    with ``set_driver()``, given a 10 second page-load timeout and logged in
    again through ``login(..., 'feed')``; the load is then retried. There is
    no retry limit.

    Returns:
        (driver, html_soup) - the driver that finally loaded the page (it may
        differ from the one passed in) and the parsed page.
    """
    while True:
        try:
            driver.get(url)
            page_bytes = driver.page_source.encode('utf-8')
            return driver, BeautifulSoup(page_bytes, "html.parser")
        except TimeoutException:
            # Throw the stuck browser away and start over with a fresh session.
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(10)
            driver = login(driver, url, 'feed')  # log in again


In [3]:
"""
arg: html_soup
return: mid_list
"""


def get_mid_list(html_soup):
    """Return the ``mid`` attribute of every post card on a feed page.

    Cards are the ``div.card-wrap`` elements inside the first
    ``div#pl_feedlist_index`` container; cards without a string ``mid``
    attribute are skipped.

    Args:
        html_soup: parsed feed page.

    Returns:
        List of mid strings in page order. Empty when the page has no feed
        container (previously this raised IndexError).
    """
    containers = html_soup.find_all('div', {'id': 'pl_feedlist_index'})
    if not containers:
        return []

    cards = containers[0].find_all('div', {'class': 'card-wrap'})
    candidate_mids = (card.get('mid') for card in cards)
    return [mid for mid in candidate_mids if isinstance(mid, str)]


In [4]:
"""
arg: html_soup, mid
return: account_name, feed_time, sentence, share, comment, suda_data, good
"""


def get_feed_info(html_soup, mid):
    """Extract the data of one post card from a parsed feed page.

    Args:
        html_soup: parsed feed page.
        mid: value of the ``mid`` attribute identifying the post's ``div``.

    Returns:
        Tuple ``(account_name, feed_time, sentence, share, comment,
        suda_data, good)``.

        ``feed_time`` is a ``'%Y-%m-%d %H:%M:%S'`` string for "今天" (today)
        and relative ("...前" / "...秒") timestamps, a ``datetime`` for
        absolute timestamps, and ``'not detected'`` when the text cannot be
        parsed. ``share`` / ``comment`` / ``good`` are the counter texts
        (``'0'`` when the button shows no number, ``''`` when the button is
        absent). ``suda_data`` is the ``suda-data`` attribute of the comment
        link and is only filled in when a comment count is shown.

    Raises:
        IndexError / AttributeError: when the card or one of its expected
            child elements is missing.
    """
    card = html_soup.find_all('div', {'mid': mid})[0]
    body = card.find('p')

    # Account name
    account_name = body.get('nick-name')

    # Post time (all whitespace removed before matching)
    feed_time = ''.join(card.find_all('p', {'class': 'from'})[0].find('a').text.split())
    now = dt.now()
    try:
        if '今天' in feed_time:
            feed_time = now.strftime('%Y-%m-%d ') + feed_time.split('今天')[1] + ':00'
        elif '前' in feed_time or '秒' in feed_time:
            # Relative timestamps are approximated by the time of retrieval.
            feed_time = now.strftime('%Y-%m-%d %H:%M:%S')
        elif len(feed_time) < 12:
            # Month/day only: the year is not shown, so the current year is
            # used (this used to be hard-coded to 2019). Parsing the year
            # together with the date also lets 02月29日 work in leap years.
            feed_time = dt.strptime('{}年{}'.format(now.year, feed_time), '%Y年%m月%d日%H:%M')
        else:
            feed_time = dt.strptime(feed_time, '%Y年%m月%d日%H:%M')
    except ValueError:
        feed_time = 'not detected'

    # Post text (all whitespace removed)
    sentence = ''.join(body.text.split())

    # Share / comment / like counters
    share = ''
    comment = ''
    suda_data = ''
    good = ''
    for item in card.find_all('div', {'class': 'card-act'})[0].find_all('li'):
        text = item.text
        tokens = text.split()

        if text == '收藏':
            # "Favourite" button carries no counter.
            continue

        if '转发' in text:
            share = tokens[1] if len(tokens) >= 2 else '0'
        elif '评论' in text:
            if len(tokens) < 2:
                comment = '0'
            else:
                comment = tokens[1]
                suda_data = item.find_all('a')[0].get('suda-data')
        else:
            # Remaining button is the like counter; its raw text is kept.
            good = text if tokens else '0'

    return account_name, feed_time, sentence, share, comment, suda_data, good


In [5]:
"""
arg: html_soup
return: comment_id_list
"""


def get_comment_id_list(html_soup):
    """Collect the ``comment_id`` attribute of every ``div`` that has one.

    Args:
        html_soup: parsed page with an opened comment section.

    Returns:
        List of comment id strings in document order.
    """
    candidate_ids = (div.get('comment_id') for div in html_soup.select('div'))
    return [comment_id for comment_id in candidate_ids if type(comment_id) is str]


In [6]:
"""
arg: html_soup, comment_id
return: comment_account_name, comment_sentence
"""


def get_comment_name_sentence(html_soup, comment_id):
    """Return the commenter name and comment text for one comment.

    The text of the comment's ``div.txt`` is split on whitespace. The first
    token is the account name. With exactly three tokens the third is the
    comment text; with exactly two the text is reported as
    ``'None or emoji'``; with any other token count it is ``''``.

    Args:
        html_soup: parsed page with an opened comment section.
        comment_id: value of the ``comment_id`` attribute of the comment div.

    Returns:
        (comment_account_name, comment_sentence)
    """
    comment_div = html_soup.find_all('div', {'comment_id': comment_id})[0]
    tokens = comment_div.find('div', {'class': 'txt'}).text.split()

    if len(tokens) == 3:
        return tokens[0], tokens[2]
    if len(tokens) == 2:
        return tokens[0], 'None or emoji'
    return tokens[0], ''


In [7]:
"""
arg: html_soup, mid
return: account_link
"""


def get_account_link(html_soup, mid):
    """Return the ``href`` of the author link (``a.name``) of one post card.

    Args:
        html_soup: parsed feed page.
        mid: value of the ``mid`` attribute identifying the post's ``div``.

    Returns:
        The raw ``href`` attribute of the first ``a.name`` element in the card.
    """
    post_cards = html_soup.find_all('div', {'mid': mid})
    time.sleep(1)  # fixed one-second pause kept from the original flow
    name_anchor = post_cards[0].find_all('a', {'class': 'name'})[0]

    return name_anchor.get('href')


In [8]:
"""
arg: driver, account_link
return: html_soup
"""


def get_account_html_soup(driver, account_link):
    """Load an account page and return it parsed by BeautifulSoup.

    Up to three attempts are made. After each load the function waits seven
    seconds before reading the page source.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        account_link: address of the account page.

    Returns:
        The parsed page, or None when every attempt timed out.
    """
    retries = 3

    for attempt in range(1, retries + 1):
        try:
            driver.get(account_link)
            time.sleep(7)  # with 3 s the data is sometimes not fully available

            page_bytes = driver.page_source.encode('utf-8')
            return BeautifulSoup(page_bytes, "html.parser")
        except TimeoutException:
            print('Timeout, Retrying... {} / {}'.format(attempt, retries))

    return None


def get_account_html_soup_newdriver(driver, account_link):
    """Load an account page and parse it, replacing the browser on any error.

    When loading or parsing fails, the current driver is quit, a new one is
    created with ``set_driver()``, given a 10 second page-load timeout and
    logged in again through ``login(..., 'account')``; the load is then
    retried. There is no retry limit.

    Only ``Exception`` subclasses trigger a restart. KeyboardInterrupt and
    SystemExit propagate, so the crawl can be stopped by hand; the previous
    bare ``except`` caught those as well and restarted the browser.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        account_link: address of the account page.

    Returns:
        (driver, html_soup) - the driver that finally loaded the page (it may
        differ from the one passed in) and the parsed page.
    """
    while True:
        try:
            driver.get(account_link)
            html = driver.page_source.encode('utf-8')
            return driver, BeautifulSoup(html, "html.parser")
        except Exception:
            # Throw the failing browser away and start over with a fresh session.
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(10)
            driver = login(driver, account_link, 'account')  # log in again


In [9]:
"""
arg: html_soup
return: rank, location, gender, follow, follower, weibo, get_time
"""


def get_account_info(html_soup):
    """Gather all account fields from a parsed account page.

    Delegates to ``get_rank``, ``get_location``, ``get_gender``,
    ``get_follow_follower_weibo`` and ``get_now_time``, in that order.

    Returns:
        Tuple ``(rank, location, gender, follow, follower, weibo, get_time)``.
    """
    profile = (get_rank(html_soup), get_location(html_soup), get_gender(html_soup))
    counters = tuple(get_follow_follower_weibo(html_soup))

    return profile + counters + (get_now_time(),)


def failed_account_info():
    """Return placeholder account fields for a page that could not be read.

    Returns:
        Tuple of seven ``'None'`` strings, in the field order
        ``(rank, location, gender, follow, follower, weibo, get_time)``.
    """
    field_count = 7
    return ('None',) * field_count


In [10]:
"""
arg: html_soup
return: rank
"""


def get_rank(html_soup):
    """Return the text of the last ``span`` whose text contains ``"Lv"``.

    Args:
        html_soup: parsed account page.

    Returns:
        The matching span text, or ``'no rank'`` when no span matches.
    """
    level_texts = [span.text for span in html_soup.find_all('span') if "Lv" in span.text]

    return level_texts[-1] if level_texts else 'no rank'


In [11]:
"""
arg: html_soup
return: location
"""


def get_location(html_soup):
    """Return the account location with all whitespace removed.

    The location is only looked up when the page contains the place icon
    (``em.W_ficon.ficon_cd_place.S_ficon``). It is read from the
    ``span.item_text.W_fl`` elements: the second one when any of them
    contains ``'Lv'``, otherwise the first one.

    Args:
        html_soup: parsed account page.

    Returns:
        The location text, ``''`` when the icon is present but no info span
        exists, or ``'no location'`` when the icon is absent.
    """
    if not html_soup.find_all('em', {'class': 'W_ficon ficon_cd_place S_ficon'}):
        return 'no location'

    info_spans = html_soup.find_all('span', {'class': 'item_text W_fl'})
    if not info_spans:
        return ''

    # When a level ("Lv") entry is listed, the location is taken from the
    # second span instead of the first.
    has_level_entry = any('Lv' in info.text for info in info_spans)
    source_span = info_spans[1] if has_level_entry else info_spans[0]

    return ''.join(source_span.text.split())


In [12]:
"""
arg: html_soup
return: gender
"""


def get_gender(html_soup):
    """Return the account gender derived from the profile icon.

    Args:
        html_soup: parsed account page.

    Returns:
        ``'male'`` or ``'female'`` depending on which icon is present (male is
        checked first), or ``'no gender'`` when neither is found.
    """
    icon_labels = (
        ('W_icon icon_pf_male', 'male'),
        ('W_icon icon_pf_female', 'female'),
    )
    for icon_class, label in icon_labels:
        if html_soup.find_all('i', {'class': icon_class}):
            return label

    return 'no gender'


In [13]:
"""
arg: html_soup
return: follow, follower, weibo
"""


def get_follow_follower_weibo(html_soup):
    """Return the follow, follower and post counters of an account page.

    The counters are read from the first three ``strong`` elements of the
    page, in that order. A counter whose element is missing is replaced by
    its placeholder (``'no follow'``, ``'no follower'``, ``'no weibo'``).

    Args:
        html_soup: parsed account page.

    Returns:
        Tuple ``(follow, follower, weibo)`` of strings.
    """
    strongs = html_soup.find_all('strong')
    placeholders = ('no follow', 'no follower', 'no weibo')

    counters = []
    for position, placeholder in enumerate(placeholders):
        try:
            counters.append(strongs[position].text)
        except (IndexError, AttributeError):
            # Only a missing or malformed element falls back to the
            # placeholder; anything else (e.g. KeyboardInterrupt) propagates.
            counters.append(placeholder)

    return tuple(counters)


In [14]:
"""
arg: None
return: now_time
"""


def get_now_time():
    """Return the current local time formatted as ``'%Y-%m-%d %H:%M:%S'``."""
    return dt.now().strftime('%Y-%m-%d %H:%M:%S')


In [15]:
"""
arg: None
return: driver
"""


def set_driver(binary_location="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
               chromedriver_path="/Users/higashi/Desktop/Document/chromedriver/chromedriver",
               headless=False):
    """Create a Chrome WebDriver.

    The machine-specific paths used to be hard-coded in the body. They are
    now parameters whose defaults are the previous values, so ``set_driver()``
    behaves as before while other machines can pass their own locations.

    Args:
        binary_location: path of the Chrome executable to launch.
        chromedriver_path: path of the chromedriver executable.
        headless: when True, start Chrome with ``--headless`` (replaces the
            commented-out option in the original; off by default).

    Returns:
        The new ``webdriver.Chrome`` instance.
    """
    options = Options()
    options.binary_location = binary_location
    if headless:
        options.add_argument("--headless")

    # NOTE(review): ``executable_path`` is the keyword this notebook already
    # used; confirm it is still accepted by the installed Selenium version.
    return webdriver.Chrome(options=options, executable_path=chromedriver_path)


In [16]:
"""
arg: url
return: urls
"""


def get_feed_links(start_url, max_pages=200, timeout=30):
    """Follow the "next" links from ``start_url`` and return every page URL.

    Each page is fetched with ``requests`` and parsed. The ``href`` of its
    ``a.next`` element, prefixed with ``https://s.weibo.com``, is the next
    page. The walk stops when a page has no next link, when the next link has
    already been collected, or after ``max_pages`` requests.

    Args:
        start_url: first result page; always the first element of the result.
        max_pages: maximum number of pages to request (previously fixed at 200).
        timeout: seconds to wait for each HTTP response, so that a stalled
            connection cannot hang the crawl.

    Returns:
        List of page URLs in visiting order, without duplicates.

    Raises:
        requests.RequestException: when a page cannot be fetched.
    """
    urls = [start_url]
    url = start_url

    for _ in range(max_pages):
        response = requests.get(url, timeout=timeout)
        # Name the parser explicitly, like the rest of this notebook, so the
        # result does not depend on which parsers happen to be installed.
        html_soup = BeautifulSoup(response.text, "html.parser")

        next_anchor = html_soup.find('a', {'class': 'next'})
        href = next_anchor.get('href') if next_anchor is not None else None
        if not href:
            break

        next_link = 'https://s.weibo.com' + href
        if next_link in urls:
            # The pagination loops back on itself. The original `continue`
            # left `url` unchanged and re-requested the same page for every
            # remaining iteration.
            break

        urls.append(next_link)
        url = next_link

    return urls


In [17]:
"""
arg: driver, url
return: driver, elems, elems_suda_data
"""


def get_comment_button_list(driver, url):
    """Collect the comment buttons of the page currently shown by ``driver``.

    A comment button is an ``a`` element whose ``action-type`` attribute is
    ``'feed_list_comment'``. Its ``suda-data`` attribute is collected with it
    so a button can later be matched to a post.

    When listing the anchors fails, the driver is quit, a new one is created
    with ``set_driver()``, given a 10 second page-load timeout and logged in
    through ``login(..., 'feed')``; the scan is then retried. There is no
    retry limit.

    Args:
        driver: Selenium WebDriver showing a feed page.
        url: feed page address, used when the browser has to be restarted.

    Returns:
        ``(driver, elems, elems_suda_data)`` where the two lists are parallel:
        ``elems_suda_data[k]`` belongs to ``elems[k]``.
    """
    while True:
        # Start from empty lists on every attempt so nothing from a
        # discarded browser session is carried over.
        elems = []
        elems_suda_data = []

        try:
            for anchor in driver.find_elements_by_tag_name('a'):
                try:
                    if anchor.get_attribute('action-type') == 'feed_list_comment':
                        # Read both attributes before appending anything: if
                        # the second read fails, neither list gets an entry
                        # and the lists stay aligned (the original appended
                        # the element first).
                        suda = anchor.get_attribute('suda-data')
                        elems.append(anchor)
                        elems_suda_data.append(suda)
                except Exception:
                    # An element that cannot be inspected is skipped.
                    continue

            return driver, elems, elems_suda_data

        except Exception:
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(10)
            driver = login(driver, url, 'feed')  # log in again


In [18]:
"""
arg: suda_data, elems, elems_suda_data
return: None
"""


def click_comment(suda_data, elems, elems_suda_data):
    """Click the comment button whose ``suda-data`` equals ``suda_data``.

    Best effort: nothing happens when no button matches, and a failing click
    is reported and ignored so the crawl can continue. After a successful
    click the function waits five seconds.

    Args:
        suda_data: ``suda-data`` value of the post's comment link.
        elems: comment button elements.
        elems_suda_data: ``suda-data`` values parallel to ``elems``.

    Returns:
        None.
    """
    if not elems or not elems_suda_data:
        return

    target = None
    for elem, elem_suda in zip(elems, elems_suda_data):
        if suda_data == elem_suda:
            target = elem
            break

    if target is None:
        # No matching button. The original reached the same outcome through
        # a NameError swallowed by a bare `except`.
        return

    try:
        target.click()
        time.sleep(5)  # the comment area needs ~2 s of JavaScript before it is shown
    except Exception as error:
        # Keep the crawl going, but leave a trace instead of failing silently.
        print('comment button click failed: {}'.format(error))


In [19]:
def login(driver, url, url_type, username=None, password=None):
    """Open ``url``, go through the Weibo login form and return the driver.

    The account e-mail and password are no longer embedded in the source.
    They are taken from the ``username`` / ``password`` arguments or, when
    those are omitted, from the ``WEIBO_USERNAME`` / ``WEIBO_PASSWORD``
    environment variables.
    NOTE(review): the password that was previously committed in this notebook
    should be considered exposed and be changed.

    Flow: load the page, click the login link for the given page type, fill
    the inputs whose ``node-type`` is ``username`` / ``password``, then click
    the anchor whose ``node-type`` is ``submitBtn``. When the login link or
    the form cannot be used, or a page load times out, the browser is
    replaced by a new one from ``set_driver()`` and the whole flow is
    retried. There is no retry limit.

    Args:
        driver: Selenium WebDriver to log in with.
        url: page to open before logging in.
        url_type: ``'feed'`` or ``'account'``; selects the XPath of the login
            link. Any other value skips the link click.
        username: account name; defaults to ``$WEIBO_USERNAME``.
        password: account password; defaults to ``$WEIBO_PASSWORD``.

    Returns:
        The driver on which the submit button was clicked (it may differ from
        the one passed in).

    Raises:
        RuntimeError: when no credentials are available.
    """
    if username is None:
        username = os.environ.get('WEIBO_USERNAME')
    if password is None:
        password = os.environ.get('WEIBO_PASSWORD')
    if not username or not password:
        raise RuntimeError('Weibo credentials are missing: set WEIBO_USERNAME and WEIBO_PASSWORD '
                           'or pass username/password to login().')

    login_xpaths = {
        'feed': '//*[@id="weibo_top_public"]/div/div/div[3]/div[2]/ul/li[3]/a',
        'account': '//*[@id="pl_common_top"]/div/div/div[3]/div[2]/ul/li[3]/a',
    }

    while True:
        try:  # timeout handling: see the TimeoutException branch below
            driver.get(url)
            time.sleep(10)
            try:
                if url_type in login_xpaths:
                    driver.find_element_by_xpath(login_xpaths[url_type]).click()
                    print('clicked login')
            except Exception:
                print("login xpath wasn't detected")
                # Reuse the timeout path to restart the browser.
                raise TimeoutException

            time.sleep(5)

            try:
                for elem in driver.find_elements_by_tag_name('input'):
                    try:
                        node_type = elem.get_attribute('node-type')
                        if node_type == 'username':
                            elem.send_keys(username)
                            print('id ok')
                        elif node_type == 'password':
                            elem.send_keys(password)
                            print('pass ok')
                    except Exception:
                        print('continue')
                        continue

                for elem in driver.find_elements_by_tag_name('a'):
                    try:
                        if elem.get_attribute('node-type') == 'submitBtn':
                            elem.click()
                            print('login ok')
                            time.sleep(5)

                            return driver  # driver after the login was submitted
                    except Exception:
                        print('continue')
                        continue
            except Exception:
                print('reboot browser')
                raise TimeoutException
        except TimeoutException:  # timeout handling
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(10)


In [20]:
"""
Start crawling
"""

# Crawl driver script: logs in, walks every feed (search-result) page, stores
# the post data and the visible comments, then visits each author's account
# page and finally writes two CSV files.

# Lists for feed (post) data - one entry per post, kept in step with each other
account_names = []
feed_times = []
sentences = []
shares = []
comments = []
goods = []
ranks = []
locations = []
genders = []
follows = []
followers = []
weiboes = []
get_times = []
types = []
mids = []

# Lists for comment data - one entry per comment
comment_account_names = []
comment_sentences = []
comment_types = []
comment_locations = []
comment_genders = []
comment_ranks = []
comment_follows = []
comment_followers = []
comment_weiboes = []
comment_post_times = []
comment_get_times = []
comment_shares = []
comment_comments = []
comment_goods = []
comment_mids = []

# Working data kept only while the program runs (reset for every feed page)
account_links = []

# Start link
start_url = 'https://s.weibo.com/weibo/d%2520program?topnav=1&wvr=6'

# Initialise the driver
timeout = 15
driver = set_driver()
driver.set_page_load_timeout(timeout)

# Log in
driver = login(driver, start_url, 'feed')

# Collect the links of all feed pages
feed_links = get_feed_links(start_url)
print('getting feed links was done.')
print('{} links were detected.'.format(len(feed_links)))

# Collect the data of each feed page
for i, feed_link in enumerate(feed_links):
    print('--progress... : {} / {} feed link'.format((i+1), len(feed_links)))
    # Fetch the feed page html
    driver, feed_html_soup = get_feed_html_soup_newdriver(driver, feed_link)
    print('--getting feed html_soup was done.')

    if not feed_html_soup:
        continue

    # Post ids on this page
    mid_list = get_mid_list(feed_html_soup)
    print('--getting mid_list was done.')
    print('--{} mids are detected.'.format(len(mid_list)))

    # Comment buttons on this page
    driver, elems, elems_suda_data = get_comment_button_list(driver, feed_link)

    # Reset the per-page working data
    account_links.clear()

    # Collect the post data and the comments first
    for j, mid in enumerate(mid_list):
        # Progress
        print('----progress... : {} / {} mid, {} / {} feed link'.format((j+1), len(mid_list),
                                                                        (i+1), len(feed_links)))

        # Link to the author's account page
        account_link = 'https:' + get_account_link(feed_html_soup, mid)
        account_links.append(account_link)
        print('----getting account_link was done.')

        # Post data
        account_name, feed_time, sentence, share, comment, suda_data, good = get_feed_info(feed_html_soup, mid)
        print('----getting feed info was done.')

        # When the post has comments, open the comment area and read them
        if comment != '0' and elems is not None and elems_suda_data is not None:
            click_comment(suda_data, elems, elems_suda_data)
            html = driver.page_source.encode('utf-8')
            html_soup = BeautifulSoup(html, "html.parser")
            comment_id_list = get_comment_id_list(html_soup)
            print('------{} comments were detected.'.format(len(comment_id_list)))
            click_comment(suda_data, elems, elems_suda_data)  # close the comment area again

            # The comment_* lists are module-level names already, so the
            # `global` declarations that used to be here were removed: at
            # module level they do nothing, and Python 3.6+ rejects a `global`
            # that follows an assignment to the same name in the same scope.

            # Read and store the comment data
            for comment_id in comment_id_list:
                # Read
                comment_account_name, comment_sentence = get_comment_name_sentence(html_soup, comment_id)

                # Store (account details are not collected for commenters)
                comment_account_names.append(comment_account_name)
                comment_sentences.append(comment_sentence)
                comment_types.append('comment')
                comment_locations.append('None')
                comment_genders.append('None')
                comment_ranks.append('None')
                comment_follows.append('None')
                comment_followers.append('None')
                comment_weiboes.append('None')
                comment_post_times.append('None')
                comment_get_times.append('None')
                comment_shares.append('None')
                comment_comments.append('None')
                comment_goods.append('None')
                comment_mids.append(mid)
            print('------getting comment data was done.')

        # Store the post-level part of the feed data
        account_names.append(account_name)
        feed_times.append(feed_time)
        sentences.append(sentence)
        shares.append(share)
        comments.append(comment)
        goods.append(good)
        mids.append(mid)

        print('----storing feed data was done.')

    # Account information of every author on this page
    for k, account_link in enumerate(account_links):
        # Progress
        print('----progress... : {} / {} account_link, {} / {} feed link'.format((k+1), len(account_links),
                                                                                 (i+1), len(feed_links)))

        # Fetch the account page html
        driver, account_html_soup = get_account_html_soup_newdriver(driver, account_link)
        print('----getting account html_soup was done.')

        # Account data
        if account_html_soup:
            rank, location, gender, follow, follower, weibo, get_time = get_account_info(account_html_soup)
            print('----getting account info was done.')
        else:
            # Store placeholders so the account lists keep one entry per post.
            # The original followed this with `continue`, which dropped the
            # row and left the feed lists with different lengths.
            rank, location, gender, follow, follower, weibo, get_time = failed_account_info()

        # Record type
        type_name = 'feed'

        # Store the account-level part of the feed data
        ranks.append(rank)
        locations.append(location)
        genders.append(gender)
        follows.append(follow)
        followers.append(follower)
        weiboes.append(weibo)
        get_times.append(get_time)
        types.append(type_name)

        print('----storing account data was done')

# The browser is not needed past this point; close it so no Chrome process is left behind.
driver.quit()

# Save as csv
os.chdir('/Users/higashi/PycharmProjects/Scraping/venv/weibo/data')

print('-*-    -*-    -*-')
print('account_names : ', len(account_names))
print('locations : ', len(locations))
print('genders : ', len(genders))
print('ranks : ', len(ranks))
print('follows : ', len(follows))
print('followers : ', len(followers))
print('weiboes : ', len(weiboes))
print('feed_times : ', len(feed_times))
print('get_times : ', len(get_times))
print('type : ', len(types))
print('sentences : ', len(sentences))
print('mids : ', len(mids))
print('shares : ', len(shares))
print('comments : ', len(comments))
print('goods : ', len(goods))
print('-*-    -*-    -*-')

df_feed = pd.DataFrame({
    'Account': account_names,
    'Location': locations,
    'Gender': genders,
    'Rank': ranks,
    'Follow': follows,
    'Follower': followers,
    'Weibo': weiboes,
    'FeedTime': feed_times,
    'GetTime': get_times,
    'Type': types,
    'Sentence': sentences,
    'Mid': mids,
    'Share': shares,
    'Comment': comments,
    'Good': goods
})

df_comment = pd.DataFrame({
    'Account': comment_account_names,
    'Location': comment_locations,
    'Gender': comment_genders,
    'Rank': comment_ranks,
    'Follow': comment_follows,
    'Follower': comment_followers,
    'Weibo': comment_weiboes,
    'FeedTime': comment_post_times,
    'GetTime': comment_get_times,
    'Type': comment_types,
    'Sentence': comment_sentences,
    'ParentMid': comment_mids,
    'Share': comment_shares,
    'Comment': comment_comments,
    'Good': comment_goods
})

df_feed.to_csv('data_feed_tmp' + '.csv')
df_comment.to_csv('data_comment_tmp' + '.csv')

print('crawling was completed.')


KeyboardInterrupt: 

In [22]:
"""
Check the number of collected records
"""

# Print the length of every result list, grouped as post-level, account-level
# and comment-level data, with a separator line between the groups.
_count_groups = (
    (
        ('account_names : ', account_names),
        ('feed_times : ', feed_times),
        ('sentences : ', sentences),
        ('shares : ', shares),
        ('comments : ', comments),
        ('goods : ', goods),
        ('mids', mids),
    ),
    (
        ('locations : ', locations),
        ('genders : ', genders),
        ('ranks : ', ranks),
        ('follows : ', follows),
        ('followers : ', followers),
        ('weiboes : ', weiboes),
        ('get_times : ', get_times),
    ),
    (
        ('comment_shares : ', comment_shares),
        ('comment_comments : ', comment_comments),
        ('comment_goods', comment_goods),
    ),
)

for _group_index, _group in enumerate(_count_groups):
    if _group_index > 0:
        # Blank lines around the separator, as between the original groups.
        print()
        print('-*-    -*-    -*-')
        print()
    for _label, _records in _group:
        print(_label, len(_records))

account_names :  421
feed_times :  421
sentences :  421
shares :  421
comments :  421
goods :  421
mids 421
-*-    -*-    -*-
locations :  421
genders :  421
ranks :  421
follows :  421
followers :  421
weiboes :  421
get_times :  421
-*-    -*-    -*-
comment_shares :  0
comment_comments :  0
comment_goods 0


In [25]:
# Build the feed and comment tables from the collected lists and write them as
# CSV files into the data directory (the working directory is changed to it).
os.chdir('/Users/higashi/PycharmProjects/Scraping/venv/weibo/data')

# One row per post.
df_feed = pd.DataFrame(dict(
    Account=account_names,
    Location=locations,
    Gender=genders,
    Rank=ranks,
    Follow=follows,
    Follower=followers,
    Weibo=weiboes,
    FeedTime=feed_times,
    GetTime=get_times,
    Type=types,
    Sentence=sentences,
    Mid=mids,
    Share=shares,
    Comment=comments,
    Good=goods,
))

# One row per comment; ParentMid is the mid of the post it belongs to.
df_comment = pd.DataFrame(dict(
    Account=comment_account_names,
    Location=comment_locations,
    Gender=comment_genders,
    Rank=comment_ranks,
    Follow=comment_follows,
    Follower=comment_followers,
    Weibo=comment_weiboes,
    FeedTime=comment_post_times,
    GetTime=comment_get_times,
    Type=comment_types,
    Sentence=comment_sentences,
    ParentMid=comment_mids,
    Share=comment_shares,
    Comment=comment_comments,
    Good=comment_goods,
))

df_feed.to_csv('20190314_weibo_dprogram_feed' + '.csv')
df_comment.to_csv('20190314_weibo_dprogram_comment' + '.csv')
