In [1]:
import requests
import pandas as pd
import os
import time
from bs4 import BeautifulSoup
from datetime import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from googletrans import Translator
import emoji

In [2]:
"""
翻訳インスタンス作成
"""


# Shared googletrans client; translate_ch_to_ja() reads this module-level name when it is called.
translator = Translator()

In [3]:
"""
arg: src_str
return: str
"""

def remove_emoji(src_str):
    """Return ``src_str`` without the characters listed in ``emoji.UNICODE_EMOJI``.

    The membership test is done one character at a time, so only single
    characters that are keys of ``emoji.UNICODE_EMOJI`` are dropped.

    Args:
        src_str: text to filter.

    Returns:
        str: the remaining characters, in their original order.
    """
    kept_chars = []
    for char in src_str:
        if char in emoji.UNICODE_EMOJI:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [4]:
"""
arg: src_str
return: translated_str
"""


def translate_ch_to_ja(src_str):
    """Translate ``src_str`` into Japanese after stripping emoji characters.

    The text is first passed through ``remove_emoji`` and then handed to the
    module-level ``translator`` with ``dest='ja'``.

    Args:
        src_str: source text.

    Returns:
        The ``text`` attribute of the translation result.
    """
    return translator.translate(remove_emoji(src_str), dest='ja').text

In [5]:
"""
arg: url
return: html
"""


def get_feed_html_soup(driver, url):  # not used
    """Load ``url`` with ``driver`` and return the parsed page, retrying on timeouts.

    Up to three attempts are made. After each ``TimeoutException`` a progress
    message is printed and the driver is refreshed before the next attempt.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: address of the feed page.

    Returns:
        BeautifulSoup built from ``driver.page_source`` with ``html.parser``,
        or ``None`` when every attempt timed out.
    """
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            driver.get(url)
            page_bytes = driver.page_source.encode('utf-8')
            return BeautifulSoup(page_bytes, "html.parser")
        except TimeoutException:
            print('Timeout, Retrying... {} / {}'.format(attempt, max_attempts))
            driver.refresh()
            print('driver was refreshed')

    return None


def get_feed_html_soup_newdriver(driver, url, max_retries=3):
    """Load a feed page, replacing the driver with a fresh logged-in one on timeout.

    On ``TimeoutException`` the current driver is quit, a new one is created
    with ``set_driver()``, its page-load timeout is set to 30 seconds and
    ``login(..., 'feed')`` is run before the page is tried again.

    The original loop retried forever; it now gives up after ``max_retries``
    failed attempts and returns ``None`` as the soup, which is the value the
    crawl loop already tests for (``if feed_html_soup:``).

    Args:
        driver: Selenium WebDriver to start with.
        url: address of the feed page.
        max_retries: number of timed-out attempts tolerated before giving up.

    Returns:
        tuple: ``(driver, html_soup)``. ``driver`` is the instance that is
        alive when the function returns (it may differ from the argument);
        ``html_soup`` is a BeautifulSoup object or ``None`` on failure.
    """
    page_load_timeout = 30
    failed_attempts = 0

    while True:
        try:
            driver.get(url)

            # (disabled) expand truncated post bodies first:
            # open_fullsentence(driver)

            html = driver.page_source.encode('utf-8')
            return driver, BeautifulSoup(html, "html.parser")

        except TimeoutException:
            # Start over with a new browser session and log in again.
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(page_load_timeout)
            # The redirect flag returned by login() is not used for feed pages.
            driver, _redirect_is = login(driver, url, 'feed')
            failed_attempts += 1
            if failed_attempts >= max_retries:
                return driver, None


In [6]:
"""
arg: html_soup
return: mid_list
"""


def get_mid_list(html_soup):
    """Collect the ``mid`` attribute of every post card on a feed page.

    Looks for the first ``div#pl_feedlist_index`` and, inside it, every
    ``div`` with class ``card-wrap``. Cards without a string ``mid``
    attribute are skipped.

    Args:
        html_soup: parsed feed page (BeautifulSoup).

    Returns:
        list[str]: mids in document order; an empty list when the page has no
        feed container (the original raised ``IndexError`` in that case).
    """
    containers = html_soup.find_all('div', {'id': 'pl_feedlist_index'})
    if not containers:
        return []

    cards = containers[0].find_all('div', {'class': 'card-wrap'})
    candidate_mids = (card.get('mid') for card in cards)
    return [mid for mid in candidate_mids if isinstance(mid, str)]


In [7]:
"""
arg: mid
return: account_name, feed_time, sentence, share, comment, good
"""


def _parse_feed_time(raw_time, now=None):
    """Turn the whitespace-free timestamp text of a post into a date value.

    Args:
        raw_time: timestamp text with all whitespace removed.
        now: reference time; defaults to ``dt.now()``.

    Returns:
        * ``str`` ``'YYYY-MM-DD HH:MM:00'`` when the text contains '今天'
          (today's date plus the text after the marker);
        * ``str`` ``'YYYY-MM-DD HH:MM:SS'`` of ``now`` when the text contains
          '前' or '秒' (relative times are approximated by the scrape time);
        * ``datetime`` for 'MM月DD日HH:MM' (shorter than 12 characters) and
          for 'YYYY年MM月DD日HH:MM';
        * ``'not detected'`` when the text cannot be parsed.
    """
    if now is None:
        now = dt.now()

    try:
        if '今天' in raw_time:
            return now.strftime('%Y-%m-%d ') + raw_time.split('今天')[1] + ':00'
        if '前' in raw_time or '秒' in raw_time:
            return now.strftime('%Y-%m-%d %H:%M:%S')
        if len(raw_time) < 12:
            # No year in the text. The original hard-coded 2019; the current
            # year is used instead (assumes a missing year means "this year"
            # -- TODO confirm). The year is prepended before parsing so that
            # 02月29日 is accepted in leap years (a year-less strptime rejects it).
            return dt.strptime('{}年{}'.format(now.year, raw_time), '%Y年%m月%d日%H:%M')
        return dt.strptime(raw_time, '%Y年%m月%d日%H:%M')
    except ValueError:
        return 'not detected'


def get_feed_info(html_soup, mid):
    """Extract the fields of one post card from a parsed feed page.

    Args:
        html_soup: parsed feed page (BeautifulSoup).
        mid: value of the ``mid`` attribute identifying the post's ``div``.

    Returns:
        tuple: ``(account_name, feed_time, sentence, sentence_ja, share,
        comment, suda_data, good, post_url)``.

        * ``account_name``: ``nick-name`` attribute of the card's first ``p``.
        * ``feed_time``: see ``_parse_feed_time`` (``str`` or ``datetime``).
        * ``sentence``: text of the ``feed_list_content_full`` paragraph up to
          '收起全文d' when present, otherwise the whitespace-stripped text of
          the first ``p``.
        * ``sentence_ja``: ``translate_ch_to_ja(sentence)`` or
          ``'could not translate.'`` when translation raises.
        * ``share`` / ``comment``: second whitespace token of the matching
          ``li`` in the ``card-act`` bar, ``'0'`` when there is none, ``''``
          when no such ``li`` exists.
        * ``suda_data``: ``suda-data`` attribute of the comment link, set only
          when the comment count is present.
        * ``good``: raw text of any other ``li`` (``'0'`` when blank).
        * ``post_url``: ``'https:'`` + ``href`` of the timestamp link.

    Raises:
        IndexError / AttributeError / TypeError: when the card or one of the
        required elements (first ``p``, ``p.from > a``, ``div.card-act``) is
        missing.
    """
    card = html_soup.find_all('div', {'mid': mid})[0]

    # Account name
    account_name = card.find('p').get('nick-name')

    # Post time (the same link also carries the post URL)
    time_link = card.find_all('p', {'class': 'from'})[0].find('a')
    feed_time = _parse_feed_time(''.join(time_link.text.split()))

    # Body text: prefer the expanded paragraph, cut at the "collapse" link text
    full_paragraphs = card.find_all('p', {'node-type': 'feed_list_content_full'})
    if full_paragraphs:
        sentence = full_paragraphs[0].text.split('收起全文d')[0]
    else:
        sentence = ''.join(card.find('p').text.split())

    # Japanese translation (best effort)
    try:
        sentence_ja = translate_ch_to_ja(sentence)
    except Exception:
        sentence_ja = 'could not translate.'

    # Post URL
    post_url = 'https:' + time_link.get('href')

    # Share / comment / like counters
    share = ''
    comment = ''
    suda_data = ''
    good = ''
    for bottom_item in card.find_all('div', {'class': 'card-act'})[0].find_all('li'):
        item_text = bottom_item.text
        tokens = item_text.split()

        if item_text == '收藏':  # "favourite" button carries no count
            continue
        if '转发' in item_text:  # share
            share = tokens[1] if len(tokens) >= 2 else '0'
        elif '评论' in item_text:  # comment
            if len(tokens) < 2:
                comment = '0'
            else:
                comment = tokens[1]
                suda_data = bottom_item.find_all('a')[0].get('suda-data')
        else:  # remaining item is treated as the like count
            good = item_text if tokens else '0'

    return account_name, feed_time, sentence, sentence_ja, share, comment, suda_data, good, post_url


In [8]:
"""
arg: html_soup
return: comment_id_list
"""


def get_comment_id_list(html_soup):
    """Return the distinct ``comment_id`` attribute values of all ``div`` elements.

    Args:
        html_soup: parsed page (BeautifulSoup).

    Returns:
        list[str]: ids in order of first appearance; ``div`` elements whose
        ``comment_id`` is not exactly a ``str`` are ignored.
    """
    ordered_ids = []
    already_seen = set()

    for div in html_soup.select('div'):
        candidate = div.get('comment_id')
        if type(candidate) is not str or candidate in already_seen:
            continue
        already_seen.add(candidate)
        ordered_ids.append(candidate)

    return ordered_ids


In [9]:
"""
arg: comment_id
return: comment_account_name, comment_text
"""


def get_comment_name_sentence(html_soup, comment_id):
    """Return the commenter name and comment text for one comment ``div``.

    The text of the comment's ``div.txt`` is split on whitespace. The first
    token is returned as the account name; the comment text is taken from the
    third token onwards (presumably the second token is the separator between
    name and body -- TODO confirm against the live markup).

    The original only handled exactly three tokens, so any comment containing
    whitespace came back as ``''``; all tokens after the second are now joined
    with single spaces.

    Args:
        html_soup: parsed page containing the opened comment list.
        comment_id: value of the ``comment_id`` attribute of the target ``div``.

    Returns:
        tuple[str, str]: ``(comment_account_name, comment_sentence)``.
        ``comment_sentence`` is ``'None or emoji'`` when there are exactly two
        tokens and ``''`` when there is only one.

    Raises:
        IndexError: no ``div`` with that ``comment_id``, or the text is empty.
        AttributeError: the comment ``div`` has no ``div.txt`` child.
    """
    comment_div = html_soup.find_all('div', {'comment_id': comment_id})[0]
    tokens = comment_div.find('div', {'class': 'txt'}).text.split()

    comment_account_name = tokens[0]
    if len(tokens) >= 3:
        comment_sentence = ' '.join(tokens[2:])
    elif len(tokens) == 2:
        comment_sentence = 'None or emoji'
    else:
        comment_sentence = ''

    return comment_account_name, comment_sentence


In [10]:
"""
arg: html_soup, mid
return: account_link
"""


def get_account_link(driver, html_soup, mid):
    """Return the profile link (``href``) of the author of post ``mid``.

    Before reading the soup, waits up to 30 seconds for an element with class
    ``name`` to be present in the page currently loaded in ``driver``.

    Args:
        driver: Selenium WebDriver showing the feed page.
        html_soup: parsed feed page (BeautifulSoup).
        mid: ``mid`` attribute of the post's ``div``.

    Returns:
        The ``href`` attribute of the first ``a.name`` inside the post card.

    Raises:
        IndexError: when the card or the name link is not in ``html_soup``.
    """
    name_locator = (By.CLASS_NAME, 'name')
    WebDriverWait(driver, 30).until(EC.presence_of_element_located(name_locator))

    post_card = html_soup.find_all('div', {'mid': mid})[0]
    name_anchor = post_card.find_all('a', {'class': 'name'})[0]

    return name_anchor.get('href')


In [11]:
"""
arg: account_link
return: html_soup
"""


def get_account_html_soup(driver, account_link):  # not used
    """Load an account page and return it parsed, trying at most three times.

    After each successful ``driver.get`` the function sleeps 7 seconds before
    reading the page source. A ``TimeoutException`` moves on to the next
    attempt without any further action.

    Args:
        driver: Selenium WebDriver used to load the page.
        account_link: URL of the account page.

    Returns:
        BeautifulSoup of the page, or ``None`` if all three attempts timed out.
    """
    for _attempt in range(3):
        try:
            driver.get(account_link)
            time.sleep(7)  # with 3 s the data is sometimes not fully loaded
            page_bytes = driver.page_source.encode('utf-8')
            return BeautifulSoup(page_bytes, "html.parser")
        except TimeoutException:
            continue

    return None


def get_account_html_soup_newdriver(driver, account_link, max_retries=3):
    """Load an account page, recreating the driver and logging in again on failure.

    After ``driver.get`` the function waits (up to 30 seconds each) for the
    elements the profile parsers rely on. On any error the driver is quit, a
    new one is created with ``set_driver()``, its page-load timeout is set to
    30 seconds, and ``login(..., 'account')`` is run before the next attempt.

    The function gives up and returns ``None`` as the soup when
    ``max_retries`` attempts have failed, or when the last ``login`` reported
    a redirect to the Weibo home page.

    ``except Exception`` replaces the original bare ``except:`` so that
    ``KeyboardInterrupt`` / ``SystemExit`` stop the crawl instead of being
    treated as a page failure.

    Args:
        driver: Selenium WebDriver to start with.
        account_link: URL of the account page.
        max_retries: number of failed attempts tolerated (the original
            hard-coded 3).

    Returns:
        tuple: ``(driver, html_soup)``; ``driver`` may be a new instance and
        ``html_soup`` is ``None`` when the page could not be loaded.
    """
    wait_seconds = 30
    page_load_timeout = 30
    required_locators = (
        # gender icon
        (By.CLASS_NAME, 'icon_bed'),
        (By.TAG_NAME, 'i'),
        # follow / follower / weibo counters
        (By.CLASS_NAME, 'S_line1'),
        (By.TAG_NAME, 'strong'),
        # profile information box
        (By.CLASS_NAME, 'PCD_person_info'),
    )

    failed_attempts = 0
    redirect_is = False  # True when login() saw a redirect to the Weibo home page

    while True:
        if failed_attempts >= max_retries:
            return driver, None
        if redirect_is:
            print('redirect_is = True')
            return driver, None

        try:
            driver.get(account_link)

            for locator in required_locators:
                WebDriverWait(driver, wait_seconds).until(EC.presence_of_element_located(locator))

            html = driver.page_source.encode('utf-8')
            return driver, BeautifulSoup(html, "html.parser")

        except Exception:
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(page_load_timeout)
            driver, redirect_is = login(driver, account_link, 'account')  # log in again
            failed_attempts += 1


In [12]:
"""
arg: html_soup
return: rank, location, gender, follow, follower, weibo, get_time
"""


def get_account_info(html_soup):
    """Gather all profile fields of an account page into one tuple.

    Delegates to ``get_rank``, ``get_location``, ``get_gender``,
    ``get_follow_follower_weibo`` and ``get_now_time`` (in that order).

    Args:
        html_soup: parsed account page (BeautifulSoup).

    Returns:
        tuple: ``(rank, location, gender, follow, follower, weibo, get_time)``.
    """
    profile = (
        get_rank(html_soup),
        get_location(html_soup),
        get_gender(html_soup),
    )
    follow, follower, weibo = get_follow_follower_weibo(html_soup)
    fetched_at = get_now_time()

    return profile + (follow, follower, weibo, fetched_at)


def failed_account_info():
    """Return placeholder profile fields for an account page that could not be read.

    Returns:
        tuple: seven ``'None'`` strings, in the order ``(rank, location,
        gender, follow, follower, weibo, get_time)`` used by
        ``get_account_info``.
    """
    field_names = ('rank', 'location', 'gender', 'follow', 'follower', 'weibo', 'get_time')
    return tuple('None' for _name in field_names)


In [13]:
"""
arg: html_Soup
return: rank
"""


def get_rank(html_soup):
    """Return the text of the last ``span`` on the page that contains ``"Lv"``.

    Args:
        html_soup: parsed account page (BeautifulSoup).

    Returns:
        str: the matching span text, or ``'no rank'`` when no span matches.
    """
    level_texts = [span.text for span in html_soup.find_all('span') if "Lv" in span.text]

    if not level_texts:
        return 'no rank'

    return level_texts[-1]


In [14]:
"""
arg: html_soup
return: location
"""


def get_location(html_soup):
    """Return the account's location text with all whitespace removed.

    The location is only looked up when the page has an ``em`` whose class
    attribute is ``'W_ficon ficon_cd_place S_ficon'``. The candidates are the
    ``span`` elements with class ``'item_text W_fl'``:

    * if any of them contains ``'Lv'``, the second span is used;
    * otherwise the first span is used;
    * if there are none, the result is ``''``.

    Args:
        html_soup: parsed account page (BeautifulSoup).

    Returns:
        str: the location, ``''`` when no candidate span exists, or
        ``'no location'`` when the place icon is absent.

    Raises:
        IndexError: a span contains ``'Lv'`` but there is no second span.
    """
    place_icons = html_soup.find_all('em', {'class': 'W_ficon ficon_cd_place S_ficon'})
    if not place_icons:
        return 'no location'

    info_spans = html_soup.find_all('span', {'class': 'item_text W_fl'})

    if any('Lv' in span.text for span in info_spans):
        words = info_spans[1].text.split()
    elif info_spans:
        words = info_spans[0].text.split()
    else:
        return ''

    return ''.join(words)


In [15]:
"""
arg: html_soup
return: gender
"""


def get_gender(html_soup):
    """Derive the account's gender from the profile icon on the page.

    Args:
        html_soup: parsed account page (BeautifulSoup).

    Returns:
        str: ``'male'`` if an ``i`` with class ``'W_icon icon_pf_male'``
        exists, else ``'female'`` if one with ``'W_icon icon_pf_female'``
        exists, else ``'no gender'``.
    """
    icon_classes = (
        ('male', 'W_icon icon_pf_male'),
        ('female', 'W_icon icon_pf_female'),
    )

    for label, css_class in icon_classes:
        if html_soup.find_all('i', {'class': css_class}):
            return label

    return 'no gender'


In [16]:
"""
arg: html_soup
return: follow, follower, weibo
"""


def get_follow_follower_weibo(html_soup):
    """Read the follow, follower and weibo counters from an account page.

    The counters are taken from the text of the first three ``strong``
    elements of the page, in that order.

    Args:
        html_soup: parsed account page (BeautifulSoup).

    Returns:
        tuple[str, str, str]: ``(follow, follower, weibo)``; a counter whose
        ``strong`` element is missing is replaced by ``'no follow'``,
        ``'no follower'`` or ``'no weibo'`` respectively.
    """
    placeholders = ('no follow', 'no follower', 'no weibo')

    counters = [strong.text for strong in html_soup.find_all('strong')[:3]]
    counters.extend(placeholders[len(counters):])

    return tuple(counters)


In [17]:
"""
arg: None
return: now_time
"""


def get_now_time():
    """Return the current local time formatted as ``'YYYY-MM-DD HH:MM:SS'``.

    Returns:
        str: ``datetime.now()`` rendered with ``'%Y-%m-%d %H:%M:%S'``.
    """
    return dt.now().strftime('%Y-%m-%d %H:%M:%S')


In [28]:
"""
arg: None
return: driver
"""


def set_driver(chrome_binary=None, chromedriver_path=None):
    """Create a headless Chrome WebDriver.

    The browser binary and the chromedriver executable were hard-coded to
    paths on the author's machine. They can now be overridden, in order of
    precedence, by the arguments, by the ``CHROME_BINARY`` /
    ``CHROMEDRIVER_PATH`` environment variables, and finally fall back to the
    original paths, so calling ``set_driver()`` with no arguments and no
    environment variables behaves as before.

    Args:
        chrome_binary: path to the Chrome executable (optional).
        chromedriver_path: path to the chromedriver executable (optional).

    Returns:
        The object returned by ``webdriver.Chrome(options=...,
        executable_path=...)``.
    """
    chrome_binary = (
        chrome_binary
        or os.environ.get('CHROME_BINARY')
        or "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
    )
    chromedriver_path = (
        chromedriver_path
        or os.environ.get('CHROMEDRIVER_PATH')
        or "/Users/higashi/Desktop/Document/chromedriver/chromedriver"
    )

    options = Options()
    options.binary_location = chrome_binary
    options.add_argument("--headless")

    return webdriver.Chrome(options=options, executable_path=chromedriver_path)


In [19]:
"""
arg: url
return: urls
"""


def get_feed_links_old(start_url):
    """Follow the "next" links of a search result with plain HTTP requests.

    Starting at ``start_url``, each page is fetched with ``requests`` and the
    ``href`` of its first ``a.next`` is appended (prefixed with
    ``https://s.weibo.com``). At most 200 pages are requested.

    Changes from the original:

    * a "next" link that was already collected now ends the walk; the
      original ``continue`` kept re-requesting the same page until the
      200-iteration limit was reached;
    * the parser is given explicitly (``html.parser``), as everywhere else in
      this notebook;
    * the request has a 30-second timeout instead of none;
    * the "no next link" case is tested explicitly instead of through a bare
      ``except:``.

    Args:
        start_url: URL of the first result page.

    Returns:
        list[str]: ``start_url`` followed by every discovered page URL.
    """
    max_pages = 200  # upper bound on the number of pages to read
    request_timeout = 30

    urls = [start_url]
    url = start_url

    for _ in range(max_pages):
        response = requests.get(url, timeout=request_timeout)
        html_soup = BeautifulSoup(response.text, "html.parser")

        next_anchors = html_soup.find_all('a', {'class': 'next'})
        if not next_anchors:
            break  # last page

        href = next_anchors[0].get('href')
        if not href:
            break

        next_link = 'https://s.weibo.com' + href
        if next_link in urls:
            break  # pagination loops back; nothing new to fetch

        urls.append(next_link)
        url = next_link

    return urls


def get_feed_links(driver, start_url, max_links=50):
    """Collect result-page URLs by clicking the "next" button in the browser.

    For the most recently collected URL the page is loaded, the element with
    class ``next`` is clicked, and once the pager (``m-page``) and another
    ``next`` element are present, ``driver.current_url`` is appended. Each
    wait lasts at most 5 seconds.

    When any step fails, the walk ends and the URL the driver is currently on
    is appended only if it is not already in the list. This is typically the
    last page, which has no further ``next`` element. The original appended it
    unconditionally, so a single-page result came back as
    ``[start_url, start_url]`` and was crawled twice.

    ``except Exception`` replaces the bare ``except:`` so that
    ``KeyboardInterrupt`` is not mistaken for "last page".

    Args:
        driver: logged-in Selenium WebDriver.
        start_url: URL of the first result page.
        max_links: stop once this many URLs have been collected (the original
            hard-coded 50).

    Returns:
        list[str]: page URLs beginning with ``start_url``.
    """
    wait_seconds = 5
    urls = [start_url]

    def _wait_for_class(class_name):
        # Block until an element with this class is present; returns the element.
        locator = (By.CLASS_NAME, class_name)
        return WebDriverWait(driver, wait_seconds).until(EC.presence_of_element_located(locator))

    while len(urls) < max_links:
        try:
            driver.get(urls[-1])
            _wait_for_class('m-page')
            _wait_for_class('next').click()
            _wait_for_class('m-page')
            _wait_for_class('next')
            urls.append(driver.current_url)

        except Exception:
            last_url = driver.current_url
            if last_url not in urls:
                urls.append(last_url)
            return urls

    return urls


In [20]:
"""
arg: driver
return: elems, elems_suda_data
"""


def get_comment_button_list(driver, url, max_retries=3):
    """Find the comment buttons of the page currently loaded in ``driver``.

    A comment button is an ``a`` element whose ``action-type`` attribute is
    ``'feed_list_comment'``. For each button its ``suda-data`` attribute is
    recorded at the same index, so the two returned lists can be matched
    position by position (``click_comment`` relies on this).

    Changes from the original:

    * both attributes are read before either list is appended to, so an
      element that fails on the second read can no longer leave the two lists
      with different lengths;
    * if listing the anchors fails, the driver is recreated and logged in
      again as before, but only ``max_retries`` times; afterwards
      ``(driver, None, None)`` is returned instead of looping forever;
    * ``except Exception`` replaces the bare ``except:`` clauses.

    NOTE(review): after a relogin the new driver is not navigated back to the
    feed page by this function, so the retry inspects whatever page
    ``login()`` left open -- confirm this is intended.

    Args:
        driver: Selenium WebDriver showing a feed page.
        url: feed page URL, passed to ``login`` when the driver is recreated.
        max_retries: number of failed attempts tolerated.

    Returns:
        tuple: ``(driver, elems, elems_suda_data)``; the two lists are
        ``None`` when the buttons could not be read.
    """
    page_load_timeout = 30
    failed_attempts = 0

    while True:
        try:
            anchors = driver.find_elements_by_tag_name('a')
        except Exception:
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(page_load_timeout)
            driver, _redirect_is = login(driver, url, 'feed')  # log in again
            failed_attempts += 1
            if failed_attempts >= max_retries:
                return driver, None, None
            continue

        elems = []
        elems_suda_data = []
        for anchor in anchors:
            try:
                if anchor.get_attribute('action-type') != 'feed_list_comment':
                    continue
                suda_data = anchor.get_attribute('suda-data')
            except Exception:
                # Element could not be read; skip it.
                continue
            elems.append(anchor)
            elems_suda_data.append(suda_data)

        return driver, elems, elems_suda_data


In [21]:
"""
arg: suda_data, elems, elems_suda_data
return: None
"""


def comment_display_is(driver):
    """Return the ``style`` attribute of the first comment panel on the page.

    The current page source is parsed and the first ``div`` whose
    ``node-type`` is ``feed_list_repeat`` is inspected.

    Args:
        driver: Selenium WebDriver showing a feed page.

    Returns:
        The panel's ``style`` attribute value (``None`` if the attribute is
        absent), or ``False`` when anything goes wrong, including the case
        where no such panel exists.
    """
    try:
        page_bytes = driver.page_source.encode('utf-8')
        page_soup = BeautifulSoup(page_bytes, "html.parser")
        comment_panels = page_soup.find_all('div', {'node-type': 'feed_list_repeat'})
        return comment_panels[0].get('style')
    except BaseException:
        # Same reach as the original bare ``except:``.
        return False


def click_comment_button(driver, suda_data, elems, elems_suda_data, action_type, max_wait=30):
    """Click a post's comment button and wait for the comment panel to open or close.

    After the click (``click_comment``), ``comment_display_is(driver)`` is
    polled every 0.2 seconds, printing ``'sleep'`` after each pause:

    * ``'open'``: wait until the style is no longer ``'display: none;'``;
    * ``'close'``: wait until the style is ``'display: none;'``.

    Any other ``action_type`` returns immediately without clicking.

    The original polling loops had no exit other than success, and the source
    itself noted that the second post got stuck in the sleep loop. Polling now
    stops after ``max_wait`` seconds with a printed notice.

    NOTE(review): ``comment_display_is`` always inspects the first comment
    panel of the page, not the one belonging to ``suda_data`` -- that looks
    like the cause of the stuck loop; confirm before relying on this wait.

    Args:
        driver: Selenium WebDriver showing the feed page.
        suda_data: ``suda-data`` value identifying the button to click.
        elems: comment button elements.
        elems_suda_data: ``suda-data`` values aligned with ``elems``.
        action_type: ``'open'`` or ``'close'``.
        max_wait: maximum number of seconds to poll.

    Returns:
        The ``driver`` that was passed in.
    """
    if action_type not in ('open', 'close'):
        return driver

    click_comment(suda_data, elems, elems_suda_data)

    want_hidden = (action_type == 'close')
    deadline = time.monotonic() + max_wait

    while True:
        is_hidden = comment_display_is(driver) == 'display: none;'
        if is_hidden == want_hidden:
            break
        if time.monotonic() >= deadline:
            print('gave up waiting for the comment panel to {}'.format(action_type))
            break
        time.sleep(0.2)
        print('sleep')

    return driver

    
def click_comment(suda_data, elems, elems_suda_data):
    """Click the comment button whose ``suda-data`` equals ``suda_data``.

    ``elems`` and ``elems_suda_data`` are matched position by position; the
    first match is clicked and the function then sleeps 5 seconds. Nothing
    happens when either list is empty or ``None``, or when no entry matches.

    The original reached the "no match" case through a ``NameError`` that a
    bare ``except: pass`` swallowed. The lookup is now explicit, and a failing
    click is reported with a printed message instead of being discarded
    silently (``KeyboardInterrupt`` is no longer swallowed).

    Args:
        suda_data: ``suda-data`` value of the button to click.
        elems: comment button elements.
        elems_suda_data: ``suda-data`` values aligned with ``elems``.

    Returns:
        None
    """
    if not elems or not elems_suda_data:
        return

    target = next(
        (elem for elem, elem_suda in zip(elems, elems_suda_data) if elem_suda == suda_data),
        None,
    )
    if target is None:
        return

    try:
        target.click()
        time.sleep(5)
    except Exception as exc:
        # Best effort: the crawl continues without this comment panel.
        print('comment button click failed: {!r}'.format(exc))


In [22]:
"""
arg: driver, url, url_type
return: driver
"""


def login(driver, url, url_type, username=None, password=None):
    """Open ``url`` and log in to Weibo through the page's login dialog.

    Steps: load ``url``; wait (30 s) for an element with class ``S_txt1``; if
    the browser ended up on ``https://www.weibo.com/jp`` report a redirect;
    otherwise click the login link for the page type, wait (5 s) for
    ``item_btn``, fill the ``username`` / ``password`` inputs, click the
    ``submitBtn`` link and sleep 10 seconds.

    On ``TimeoutException`` (including failures in the two guarded sections,
    which are converted to ``TimeoutException``) the driver is quit, replaced
    with ``set_driver()`` (page-load timeout 30 s) and the whole sequence is
    repeated. If no submit link is found the sequence is repeated with the
    same driver. The loop has no upper bound.

    Security fix: the account e-mail and password were hard-coded in this
    function. They must now be passed in or provided through the
    ``WEIBO_USERNAME`` / ``WEIBO_PASSWORD`` environment variables.

    ``except Exception`` replaces the bare ``except:`` clauses so that
    ``KeyboardInterrupt`` is not turned into another login attempt.

    Args:
        driver: Selenium WebDriver to use.
        url: page to open before logging in.
        url_type: ``'feed'`` or ``'account'``; selects the XPath of the login
            link. For any other value no login link is clicked.
        username: account name; defaults to ``$WEIBO_USERNAME``.
        password: account password; defaults to ``$WEIBO_PASSWORD``.

    Returns:
        tuple: ``(driver, redirect_is)``. ``redirect_is`` is ``True`` when the
        redirect to the Weibo home page was detected (no login is attempted
        then) and ``False`` after the submit link was clicked.

    Raises:
        ValueError: no credentials were supplied.
    """
    if username is None:
        username = os.environ.get('WEIBO_USERNAME')
    if password is None:
        password = os.environ.get('WEIBO_PASSWORD')
    if not username or not password:
        raise ValueError(
            'Weibo credentials are missing: pass username/password '
            'or set WEIBO_USERNAME and WEIBO_PASSWORD'
        )

    login_link_xpaths = {
        'feed': '//*[@id="weibo_top_public"]/div/div/div[3]/div[2]/ul/li[3]/a',
        'account': '//*[@id="pl_common_top"]/div/div/div[3]/div[2]/ul/li[3]/a',
    }
    page_load_timeout = 30

    while True:
        try:  # guards against page-load timeouts
            driver.get(url)

            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'S_txt1')))

            try:
                # Check for a redirect; per the original note, current_url
                # can only be read at this point.
                current_url = driver.current_url
                print(current_url)

                if current_url == 'https://www.weibo.com/jp':
                    return driver, True  # redirect detected

                login_xpath = login_link_xpaths.get(url_type)
                if login_xpath is not None:
                    driver.find_element_by_xpath(login_xpath).click()

            except Exception:
                raise TimeoutException

            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'item_btn')))

            try:
                for field in driver.find_elements_by_tag_name('input'):
                    try:
                        node_type = field.get_attribute('node-type')
                        if node_type == 'username':
                            field.send_keys(username)
                        elif node_type == 'password':
                            field.send_keys(password)
                    except Exception:
                        continue

                for anchor in driver.find_elements_by_tag_name('a'):
                    try:
                        if anchor.get_attribute('node-type') == 'submitBtn':
                            anchor.click()

                            time.sleep(10)

                            return driver, False  # logged-in driver, no redirect
                    except Exception:
                        continue
            except Exception:
                raise TimeoutException

        except TimeoutException:  # start over with a fresh browser
            driver.quit()
            driver = set_driver()
            driver.set_page_load_timeout(page_load_timeout)


In [29]:
"""
クローリング開始
"""

# Columns of the feed table. The crawl loop appends one value per post to
# each of these lists, so the same index refers to the same post.
(account_names, feed_times, sentences, sentences_ja, shares, comments, goods,
 ranks, locations, genders, follows, followers, weiboes, get_times, types,
 mids, post_urls) = ([] for _ in range(17))

# Columns of the comment table.
(comment_account_names, comment_sentences, comment_types, comment_locations,
 comment_genders, comment_ranks, comment_follows, comment_followers,
 comment_weiboes, comment_post_times, comment_get_times, comment_shares,
 comment_comments, comment_goods, comment_mids) = ([] for _ in range(15))

# Working data kept only while the program runs.
account_links, comment_id_list = [], []

# Start URL: Weibo search for "CUREL", restricted by the timescope parameter to 2019-03-25
start_url = 'https://s.weibo.com/weibo?q=CUREL&typeall=1&suball=1&timescope=custom:2019-03-25:2019-03-25&Refer=g'

# Initialise the driver (page loads time out after `timeout` seconds)
timeout = 30
driver = set_driver()
driver.set_page_load_timeout(timeout)

# Log in; login() may hand back a different driver instance than it was given
driver, redirect_is = login(driver, start_url, 'feed')

# Collect the links of all feed (search-result) pages
feed_links = get_feed_links(driver, start_url)
print('getting feed links was done.')
print('{} links were detected.'.format(len(feed_links)))

# Collect the data page by page. After every page both CSV files are rewritten
# with everything gathered so far (the result lists are never cleared).
for i, feed_link in enumerate(feed_links):
    print('--progress... : {} / {} feed link'.format((i+1), len(feed_links)))
    
    # Fetch the HTML of the feed page
    driver, feed_html_soup = get_feed_html_soup_newdriver(driver, feed_link)
    
    if feed_html_soup:
        # Get the list of mids (post ids)
        mid_list = get_mid_list(feed_html_soup)
        
        # Get the list of comment buttons
        driver, elems, elems_suda_data = get_comment_button_list(driver, feed_link)
    else:
        # Page could not be loaded: skip it
        continue
    
    # Reset the per-page list of account links
    account_links.clear()
        
    # Collect the comment data first
    for j, mid in enumerate(mid_list):
        print('{} th mid'.format(j))
        # Get the account link
        account_link = 'https:' + get_account_link(driver, feed_html_soup, mid)
        account_links.append(account_link)
               
        # Get the feed data
        account_name, feed_time, sentence, sentence_ja, share, comment, suda_data, good, post_url = get_feed_info(feed_html_soup, mid)
        
        # The string literal below is disabled comment-scraping code; it is
        # evaluated as an expression and discarded, so it has no effect.
        """コメントを取得する場合は以下のコメントアウトを解除
        # コメントが存在する場合、コメントを取得しに行く
        if comment != '0' and elems is not None and elems_suda_data is not None:
            
            click_comment(suda_data, elems, elems_suda_data)
            print('opened')
        
            html = driver.page_source.encode('utf-8')
            html_soup = BeautifulSoup(html, "html.parser")
            comment_id_list = get_comment_id_list(html_soup)
            print('-*-*-')
            print(comment)
            print(comment_id_list)
            print('-*-*-')
            
            click_comment(suda_data, elems, elems_suda_data)  # コメント欄を閉じておく
            print('closed')
            # 宣言
            global comment_account_names, comment_sentences, comment_types, comment_locations
            global comment_genders, comment_ranks, comment_follows, comment_followers
            global comment_weiboes, comment_post_times, comment_get_times, comment_shares
            global comment_comments, comment_goods, comment_mids
            mid = mid
            
            # コメントデータを取得、格納
            for comment_id in comment_id_list:  
                # データ取得
                comment_account_name, comment_sentence = get_comment_name_sentence(html_soup, comment_id)
                
                # データ格納
                comment_account_names.append(comment_account_name)
                comment_sentences.append(comment_sentence)
                comment_types.append('comment')
                comment_locations.append('None')
                comment_genders.append('None')
                comment_ranks.append('None')
                comment_follows.append('None')
                comment_followers.append('None')
                comment_weiboes.append('None')
                comment_post_times.append('None')
                comment_get_times.append('None')
                comment_shares.append('None')
                comment_comments.append('None')
                comment_goods.append('None')
                comment_mids.append(mid)
        """
        
        # Store the part of the feed data that comes from the feed page itself
        account_names.append(account_name)
        feed_times.append(feed_time)
        sentences.append(sentence)
        sentences_ja.append(sentence_ja)
        shares.append(share)
        comments.append(comment)
        goods.append(good)
        mids.append(mid)
        post_urls.append(post_url)
        
        # print('----storing feed data was done.')
    
    # Get the account information (one profile page per post collected above)
    for k, account_link in enumerate(account_links):
        
        print('{} th account_link'.format(k))
        
        # Fetch the HTML of the account detail page
        driver, account_html_soup = get_account_html_soup_newdriver(driver, account_link)
        # print('----getting account html_soup was done.')
        
        # Get the account data; use placeholders when the page could not be loaded
        if account_html_soup:
            rank, location, gender, follow, follower, weibo, get_time = get_account_info(account_html_soup)
            # print('----getting account info was done.')
        else:
            rank, location, gender, follow, follower, weibo, get_time = failed_account_info()
            
        # Define the row type
        type_name = 'feed'
        
        # Store the account-derived part of the feed data
        ranks.append(rank)
        locations.append(location)
        genders.append(gender)
        follows.append(follow)
        followers.append(follower)
        weiboes.append(weibo)
        get_times.append(get_time)
        types.append(type_name)
        
    # NOTE(review): absolute, machine-specific path; the working directory is changed on every page.
    os.chdir('/Users/higashi/PycharmProjects/WebScraping/venv/weibo/data')  # CSV output directory

    # Quick check that every column has the same number of rows before building the frames
    print('-*-    -*-    -*-')
    print('account_names : ', len(account_names))
    print('locations : ', len(locations))
    print('genders : ', len(genders))
    print('ranks : ', len(ranks))
    print('follows : ', len(follows))
    print('followers : ', len(followers))
    print('weiboes : ', len(weiboes))
    print('feed_times : ', len(feed_times))
    print('get_times : ', len(get_times))
    print('type : ', len(types))
    print('sentences : ', len(sentences))
    print('mids : ', len(mids))
    print('shares : ', len(shares))
    print('comments : ', len(comments))
    print('goods : ', len(goods))
    print('-*-    -*-    -*-')
	
    # One row per post collected so far
    df_feed = pd.DataFrame({
		'Account': account_names,
		'Follower': followers,
		'Share': shares,
		'FeedTime': feed_times,
		'Comment': comments,
		'Good': goods,
		'Sentence': sentences,
		'PostUrl': post_urls,
		'Sentence_Ja': sentences_ja,
		'Location': locations,
		'Gender': genders,
		'Rank': ranks,
		'Follow': follows,
		'Weibo': weiboes,
		'GetTime': get_times,
		'Type': types,
		'Mid': mids,
	})
	
    # One row per comment; these lists are only filled by the disabled code above
    df_comment = pd.DataFrame({
		'Account': comment_account_names,
		'Location': comment_locations,
		'Gender': comment_genders,
		'Rank': comment_ranks,
		'Follow': comment_follows,
		'Follower': comment_followers,
		'Weibo': comment_weiboes,
		'FeedTime': comment_post_times,
		'GetTime': comment_get_times,
		'Type': comment_types,
		'Sentence': comment_sentences,
		'ParentMid': comment_mids,
		'Share': comment_shares,
		'Comment': comment_comments,
		'Good': comment_goods
	})
	
    # df_feed.to_csv('20190319_weibo_freeplus_20190116-20190228_feed' + '.csv')
    # df_comment.to_csv('20190319_weibo_freeplus__20190116-20190228_comment' + '.csv')
	
    # Overwritten on every page, so an interrupted crawl still leaves the rows gathered so far
    df_feed.to_csv('tmp_feed' + '.csv')
    df_comment.to_csv('tmp_comment' + '.csv')
    print('to csv was done.')
	
	
# NOTE(review): no driver.quit() follows the loop in the visible cells, so the headless browser may be left running.
print('crawling was completed.')


https://s.weibo.com/weibo?q=CUREL&typeall=1&suball=1&timescope=custom:2019-03-25:2019-03-25&Refer=g


getting feed links was done.
3 links were detected.
--progress... : 1 / 3 feed link


0 th mid


1 th mid
2 th mid


3 th mid
4 th mid


5 th mid
6 th mid


7 th mid
8 th mid


9 th mid
10 th mid


11 th mid
12 th mid


13 th mid
14 th mid


15 th mid
16 th mid


17 th mid
18 th mid


19 th mid
0 th account_link


https://www.weibo.com/haitaozhangmenren?refer_flag=1001030103_


1 th account_link


2 th account_link


3 th account_link


4 th account_link


https://www.weibo.com/u/5109072250?refer_flag=1001030103_


https://www.weibo.com/u/5109072250?refer_flag=1001030103_


5 th account_link


6 th account_link


7 th account_link


https://www.weibo.com/u/3276471320?refer_flag=1001030103_


8 th account_link


9 th account_link


https://www.weibo.com/curel?refer_flag=1001030103_


10 th account_link


11 th account_link


12 th account_link


13 th account_link


14 th account_link


15 th account_link


16 th account_link


17 th account_link


18 th account_link


19 th account_link


-*-    -*-    -*-
account_names :  20
locations :  20
genders :  20
ranks :  20
follows :  20
followers :  20
weiboes :  20
feed_times :  20
get_times :  20
type :  20
sentences :  20
mids :  20
shares :  20
comments :  20
goods :  20
-*-    -*-    -*-
to csv was done.
--progress... : 2 / 3 feed link


0 th mid


1 th mid
2 th mid


3 th mid
4 th mid
5 th mid


6 th mid
7 th mid
8 th mid


9 th mid
10 th mid
11 th mid


12 th mid
13 th mid


14 th mid
15 th mid


16 th mid
17 th mid


18 th mid
19 th mid


0 th account_link


1 th account_link


2 th account_link


3 th account_link


4 th account_link


5 th account_link


6 th account_link


7 th account_link


https://www.weibo.com/u/3276471320?refer_flag=1001030103_&is_hot=1


8 th account_link


9 th account_link


10 th account_link


11 th account_link


12 th account_link


13 th account_link


14 th account_link


15 th account_link


16 th account_link


17 th account_link


18 th account_link


19 th account_link


-*-    -*-    -*-
account_names :  40
locations :  40
genders :  40
ranks :  40
follows :  40
followers :  40
weiboes :  40
feed_times :  40
get_times :  40
type :  40
sentences :  40
mids :  40
shares :  40
comments :  40
goods :  40
-*-    -*-    -*-
to csv was done.
--progress... : 3 / 3 feed link


0 th mid
1 th mid


2 th mid
3 th mid
4 th mid


5 th mid
6 th mid


7 th mid
8 th mid


9 th mid
10 th mid


11 th mid
12 th mid


13 th mid
14 th mid


15 th mid
0 th account_link


https://www.weibo.com/vip1681681688?refer_flag=1001030103_


https://www.weibo.com/vip1681681688?refer_flag=1001030103_


1 th account_link


2 th account_link


3 th account_link


4 th account_link


5 th account_link


6 th account_link


7 th account_link


8 th account_link


9 th account_link


10 th account_link


11 th account_link


12 th account_link


13 th account_link


https://www.weibo.com/u/5800559759?refer_flag=1001030103_


14 th account_link


15 th account_link


-*-    -*-    -*-
account_names :  56
locations :  56
genders :  56
ranks :  56
follows :  56
followers :  56
weiboes :  56
feed_times :  56
get_times :  56
type :  56
sentences :  56
mids :  56
shares :  56
comments :  56
goods :  56
-*-    -*-    -*-
to csv was done.
crawling was completed.


In [85]:
print(comment_id_list)

['4336742938665912', '4321546749152705', '4321546559972033', '4321381229575291', '4321381208054759', '4321264225062097', '4321248362324656', '4321214073587426', '4321209712233985', '4321163804787424', '4314560285012915', '4314366373073469', '4313094187118583', '4314675052843889', '4312159074660122', '4311027174640643', '4311019675411550', '4311013887433861', '4310986321899985', '4310978164256330', '4307359276894482', '4306188935892850']


In [24]:
"""
Check the number of collected records.
"""

# Each list printed here is later used as a DataFrame column, so the lengths
# inside a group are expected to agree; a differing count points at the list
# that is missing (or has an extra) entry.
# Every group's tuple is built right before it is printed, so a list that does
# not exist yet only stops the report at its own group.

# Per-post lists.
for _label, _values in (
    ('account_names', account_names),
    ('feed_times', feed_times),
    ('sentences', sentences),
    ('shares', shares),
    ('comments', comments),
    ('goods', goods),
    ('mids', mids),
):
    print('{} : '.format(_label), len(_values))

print('-*-    -*-    -*-')

# Per-account / bookkeeping lists. `types` is reported as well because it is
# the 'Type' column of the feed DataFrame and was previously left unchecked.
for _label, _values in (
    ('locations', locations),
    ('genders', genders),
    ('ranks', ranks),
    ('follows', follows),
    ('followers', followers),
    ('weiboes', weiboes),
    ('get_times', get_times),
    ('types', types),
):
    print('{} : '.format(_label), len(_values))

print('-*-    -*-    -*-')

# Counter lists collected for comments.
for _label, _values in (
    ('comment_shares', comment_shares),
    ('comment_comments', comment_comments),
    ('comment_goods', comment_goods),
):
    print('{} : '.format(_label), len(_values))

In [26]:
# Switch the working directory first: both CSV names below are relative, so
# the files are written into this folder.
os.chdir('/Users/higashi/PycharmProjects/WebScraping/venv/weibo/data')

# Column name -> collected list for the feed (post) table, in output order.
_feed_columns = [
    ('Account', account_names),
    ('Location', locations),
    ('Gender', genders),
    ('Rank', ranks),
    ('Follow', follows),
    ('Follower', followers),
    ('Weibo', weiboes),
    ('FeedTime', feed_times),
    ('GetTime', get_times),
    ('Type', types),
    ('Sentence', sentences),
    ('Mid', mids),
    ('Share', shares),
    ('Comment', comments),
    ('Good', goods),
]
df_feed = pd.DataFrame(dict(_feed_columns))

# Same layout for the comment table; 'ParentMid' takes the place of 'Mid'.
_comment_columns = [
    ('Account', comment_account_names),
    ('Location', comment_locations),
    ('Gender', comment_genders),
    ('Rank', comment_ranks),
    ('Follow', comment_follows),
    ('Follower', comment_followers),
    ('Weibo', comment_weiboes),
    ('FeedTime', comment_post_times),
    ('GetTime', comment_get_times),
    ('Type', comment_types),
    ('Sentence', comment_sentences),
    ('ParentMid', comment_mids),
    ('Share', comment_shares),
    ('Comment', comment_comments),
    ('Good', comment_goods),
]
df_comment = pd.DataFrame(dict(_comment_columns))

# NOTE(review): the comment file name has a doubled underscore after "CUREL",
# unlike the feed file name — confirm whether that is intentional.
_feed_csv_name = '20190319_weibo_CUREL_20190101-20190115_feed' + '.csv'
_comment_csv_name = '20190319_weibo_CUREL__20190101-20190115_comment' + '.csv'

df_feed.to_csv(_feed_csv_name)
df_comment.to_csv(_comment_csv_name)


In [2]:
print(account_names)

In [36]:
print(account_links)

In [None]:
'https://passport.weibo.com/visitor/visitor?entry=miniblog&a=enter&url=https%3A%2F%2Fweibo.com%2F87768787%3Frefer_flag%3D1001030103_&domain=.weibo.com&ua=php-sso_sdk_client-0.6.28&_rand=1552888160.1266'

In [21]:
print(account_links[0])

In [61]:
driver = set_driver()

In [21]:
account_url = 'https://www.weibo.com/shengmingshibao?refer_flag=0000015010_&from=feed&loc=nickname'

In [64]:
# Visit every URL in account_links and print what get_account_info() extracts
# from the resulting page.
# NOTE(review): the explicit driver.get() looks redundant if
# get_account_html_soup_newdriver() navigates to the URL itself, as the feed
# variant defined near the top of this notebook does — TODO confirm before
# removing it.
for account_url in account_links:
    driver.get(account_url)
    # The helper returns a driver together with the soup; rebinding `driver`
    # makes the next iteration use whichever driver it handed back.
    driver, html_soup = get_account_html_soup_newdriver(driver, account_url)
    print(get_account_info(html_soup))

In [42]:
driver.get(account_links[3])

In [46]:
driver, html_soup = get_account_html_soup_newdriver(driver, account_links[3])

In [47]:
print(get_account_info(html_soup))

In [95]:
print(html_soup)

In [62]:
driver.get('https://s.weibo.com/weibo/d%2520program?topnav=1&wvr=6&b=1')

In [63]:
driver, html_soup = get_feed_html_soup_newdriver(driver, 'https://s.weibo.com/weibo/d%2520program?topnav=1&wvr=6&b=1')

In [64]:
print(html_soup)

In [57]:
html = driver.page_source.encode('utf-8')

In [34]:
print(feed_links)

In [30]:
print(feed_times)

In [48]:
A = 1

In [23]:
# Manually append one 'dummy' placeholder to each of these lists.
# NOTE(review): presumably this realigns the list lengths after a record was
# only partially collected — confirm against the crawl that preceded it.
# The cell is not idempotent: every re-run appends another placeholder.
locations.append('dummy')
genders.append('dummy')
ranks.append('dummy')
follows.append('dummy')
followers.append('dummy')
weiboes.append('dummy')
get_times.append('dummy')
types.append('dummy')

In [1]:
print(feed_links)

In [25]:
print(account_names)

['小L专属空瓶记', '大妞日代-微信43891819', '竹内柚子膏', 'RineVa', 'Raku桑', '小懒分析', 'BoJapan播日本', '叮当爱红茶奶酥不爱铜锣烧', '我是小叶日本代购', '二陈卵子', '功课菌', '锅崽崽_JP', '刘二二人肉正品专柜日本韩国代购', '日本流行每日速报', '我是小叶日本代购', '庞老板是大胖子日代小铺', '起航转运', '要健康开心呀', '未来的温妍', '日本最潮FM-Channel', '网上小玖', '网上小玖', '大可离不开紫薯君', '今天阿曼达翻白眼了吗', '墩布挖白菜', '我是山大jane', '肥鼠日本代_购', '卍岁娘', 'LILI莉丽日本代购', '我看谁还能找到我', '乾乾有只猫', '搞去念三_', '绿野仙踪Philly', 'lllaan_', '是wasabi啊_', '卍岁娘', '金韩彬脸上的创可贴', 'Echo__sssss', '小L专属空瓶记', '019兔斯基是我本命的beauty', '日本代购各类品牌护肤品', '日本代购各类品牌护肤品', 'pummmpk1n', '苗苗苗苗苗儿呀', '于小小小菲1224', '芽绿酱', '躲進夢裡', '是蛋挞君的日代小铺阿', '小猫咪呀小猫咪你真可爱呀', 'zzZmm是不合格的铲屎官', 'itsKauteki']


In [42]:
driver = set_driver()

In [29]:
driver.get('https://www.weibo.com/jp')

In [40]:
driver = set_driver()
driver = login(driver, 'https://www.weibo.com/jp', 'account')

https://www.weibo.com/jp


In [43]:
driver, html_soup = get_account_html_soup_newdriver(driver, 'https://www.weibo.com/jp')

https://www.weibo.com/jp
redirect_is = True


In [51]:
start_url = 'https://s.weibo.com/weibo/CUREL?topnav=1&wvr=6&b=1'

In [65]:
driver = set_driver()
driver.get(start_url)

In [66]:
elements = driver.find_elements_by_tag_name('a')

In [67]:
# Click every element collected in `elements` whose `action-type` attribute is
# 'fl_unfold', printing 'clicked' once per click.
for element in elements:
    val_string = element.get_attribute('action-type')
    if val_string == 'fl_unfold':
        element.click()
        print('clicked')

clicked


clicked


clicked


clicked


clicked


In [69]:
"""
arg: driver
return: None
"""


def open_fullsentence(driver):
    """Click every ``fl_unfold`` link on the page currently loaded in ``driver``.

    All ``<a>`` elements of the page are inspected and each one whose
    ``action-type`` attribute equals ``'fl_unfold'`` is clicked; the message
    ``opened sentence`` is printed once per click. Presumably these links
    expand truncated post text so the full sentence ends up in the page
    source — confirm against the site markup.

    Args:
        driver: Selenium WebDriver already positioned on the page to process.

    Returns:
        None. The function only acts on the page held by ``driver``.
    """
    # Use the generic locator API: the find_elements_by_* helpers no longer
    # exist in Selenium 4, while find_elements(By..., ...) works on both
    # Selenium 3 and 4. `By` is imported in the notebook's first cell.
    anchors = driver.find_elements(By.TAG_NAME, 'a')

    for anchor in anchors:
        if anchor.get_attribute('action-type') == 'fl_unfold':
            anchor.click()
            print('opened sentence')


In [88]:
A = '黒柳徹子渡辺直美'

In [89]:
B = '直美'

In [90]:
print(A.split(B))

['黒柳徹子渡辺', '']


In [114]:
print(translator.translate(remove_emoji(emojis), dest='ja'))

Translated(src=zh-CN, dest=ja, text=＃日代推薦##頼符日代＃ "スポット" Curelカット啫ジェルリムーバーハニー130g、70。敏感肌の特別な妊娠中の母親もアンロード後に乳化効果が非常に良いです使用することができ、肌は完全にオイルフリーの押出は透明、白化やさわやかではない、特に肌にアレルギー性乾燥肌に適しています。, pronunciation=None, extra_data="{'translat...")


In [109]:
def remove_emoji(src_str):
    """Return ``src_str`` without the characters found in ``emoji.UNICODE_EMOJI``.

    The check is done one character at a time; every character that is not a
    member of ``emoji.UNICODE_EMOJI`` is kept in its original order.

    NOTE(review): this repeats the ``remove_emoji`` defined near the top of
    the notebook; the later definition is the one in effect once this cell
    has run.
    """
    kept_chars = []
    for ch in src_str:
        if ch in emoji.UNICODE_EMOJI:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [110]:
emojis = '#日代推荐##靠谱日代#「现货」Curel珂润啫喱卸妆蜜130g，💰70。敏感性肌肤专用孕妈妈也可以使用的乳化效果很赞✨卸完后肌肤完全不油挤出来是透明状不泛白清爽特别适合油性易起痘皮肤过敏偏干性皮肤。'

In [111]:
print(remove_emoji(emojis))

#日代推荐##靠谱日代#「现货」Curel珂润啫喱卸妆蜜130g，70。敏感性肌肤专用孕妈妈也可以使用的乳化效果很赞卸完后肌肤完全不油挤出来是透明状不泛白清爽特别适合油性易起痘皮肤过敏偏干性皮肤。


In [123]:
A = translate_ch_to_ja('#日代推荐##靠谱日代#「现货」Curel珂润啫喱卸妆蜜130g，💰70。敏感性肌肤专用孕妈妈也可以使用的乳化效果很赞✨卸完后肌肤完全不油挤出来是透明状不泛白清爽特别适合油性易起痘皮肤过敏偏干性皮肤。')

In [125]:
print(A)

＃日代推薦##頼符日代＃ "スポット" Curelカット啫ジェルリムーバーハニー130g、70。敏感肌の特別な妊娠中の母親もアンロード後に乳化効果が非常に良いです使用することができ、肌は完全にオイルフリーの押出は透明、白化やさわやかではない、特に肌にアレルギー性乾燥肌に適しています。
