# Crawling Code

In [1]:
# install GOT3 package
!pip install GetOldTweets3



In [1]:
# module import
import GetOldTweets3 as got
from bs4 import BeautifulSoup
import sys
import urllib
import json
import datetime
import time
import os
from random import uniform
from tqdm import tqdm
import csv

In [9]:
# GOT static method (patched in place of TweetManager.getJsonResponse)
def getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, useragent=None, debug=False):
    """
    Invoke an HTTP query to Twitter.
    Should not be used as an API function. A static method.

    Args:
        tweetCriteria: Search criteria object. ``topTweets`` is always read;
            ``querySearch``, ``excludeWords``, ``username``, ``near``/``lat``/
            ``lon``/``within``, ``since``, ``until``, ``minReplies``,
            ``minFaves``, ``minRetweets`` and ``lang`` are used when present.
            Note that ``username`` is rewritten in place to a set of
            lower-cased names without the leading ``@``.
        refreshCursor: Pagination cursor sent as ``max_position``.
        cookieJar: Cookie jar handed to ``HTTPCookieProcessor``.
        proxy: Proxy address used for both http and https, or a falsy value.
        useragent: ``User-Agent`` header; defaults to the first entry of
            ``got.manager.TweetManager.user_agents``.
        debug: When true, print the request URL.

    Returns:
        The parsed JSON payload. If the request still fails after one retry,
        or the body cannot be decoded or parsed, an empty page
        (``{'items_html': '', 'min_position': refreshCursor}``) is returned
        instead of raising.
    """
    # `import urllib` alone does not guarantee these submodules are loaded.
    import urllib.parse
    import urllib.request

    # Fallback result for unrecoverable failures.
    # NOTE(review): GetOldTweets3's getTweets loop is expected to stop paging
    # when 'items_html' is empty, keeping the tweets collected so far --
    # confirm against the installed GetOldTweets3 version.
    emptyPage = {'items_html': '', 'min_position': refreshCursor}

    url = "https://twitter.com/i/search/timeline?"

    if not tweetCriteria.topTweets:
        url += "f=tweets&"

    url += ("vertical=news&q=%s&src=typd&%s"
            "&include_available_features=1&include_entities=1&max_position=%s"
            "&reset_error_state=false")

    urlGetData = ''

    if hasattr(tweetCriteria, 'querySearch'):
        urlGetData += tweetCriteria.querySearch

    if hasattr(tweetCriteria, 'excludeWords'):
        urlGetData += ' -'.join([''] + tweetCriteria.excludeWords)

    if hasattr(tweetCriteria, 'username'):
        # A plain string is iterable too, so it must be wrapped explicitly;
        # otherwise it would be split into single characters below.
        if isinstance(tweetCriteria.username, str) or not hasattr(tweetCriteria.username, '__iter__'):
            tweetCriteria.username = [tweetCriteria.username]

        usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u]
        tweetCriteria.username = {u.lower() for u in usernames_ if u}

        usernames = [' from:'+u for u in sorted(tweetCriteria.username)]
        if usernames:
            urlGetData += ' OR'.join(usernames)

    if hasattr(tweetCriteria, 'within'):
        if hasattr(tweetCriteria, 'near'):
            urlGetData += ' near:"%s" within:%s' % (tweetCriteria.near, tweetCriteria.within)
        elif hasattr(tweetCriteria, 'lat') and hasattr(tweetCriteria, 'lon'):
            urlGetData += ' geocode:%f,%f,%s' % (tweetCriteria.lat, tweetCriteria.lon, tweetCriteria.within)

    if hasattr(tweetCriteria, 'since'):
        urlGetData += ' since:' + tweetCriteria.since

    if hasattr(tweetCriteria, 'until'):
        urlGetData += ' until:' + tweetCriteria.until

    # str() so that integer thresholds do not raise TypeError on concatenation.
    if hasattr(tweetCriteria, 'minReplies'):
        urlGetData += ' min_replies:' + str(tweetCriteria.minReplies)

    if hasattr(tweetCriteria, 'minFaves'):
        urlGetData += ' min_faves:' + str(tweetCriteria.minFaves)

    if hasattr(tweetCriteria, 'minRetweets'):
        urlGetData += ' min_retweets:' + str(tweetCriteria.minRetweets)

    if hasattr(tweetCriteria, 'lang'):
        urlLang = 'l=' + tweetCriteria.lang + '&'
    else:
        urlLang = ''
    url = url % (urllib.parse.quote(urlGetData.strip()), urlLang, urllib.parse.quote(refreshCursor))
    # This function lives at module level, so TweetManager must be reached
    # through the imported package.
    useragent = useragent or got.manager.TweetManager.user_agents[0]

    headers = [
        ('Host', "twitter.com"),
        ('User-Agent', useragent),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "en-US,en;q=0.5"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', url),
        ('Connection', "keep-alive")
    ]

    if proxy:
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy, 'https': proxy}), urllib.request.HTTPCookieProcessor(cookieJar))
    else:
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
    opener.addheaders = headers

    # Debug option: show the final request URL.
    if debug:
        print(url)

    # Throttle every request to avoid HTTP 429 (Too Many Requests).
    time.sleep(3)

    # Error handling: failures are reported and skipped rather than ending
    # the process with sys.exit(). The request is tried twice, with a
    # 30-second pause after the first failure.
    jsonResponse = None
    for attempt in (1, 2):
        try:
            with opener.open(url) as response:
                jsonResponse = response.read()
            break
        except TimeoutError:
            print("Timeout error" if attempt == 1 else "Timeout Error again.")
        except Exception as e:
            print("An error occured during an HTTP request:", str(e))
            print("Error URL:", url)
            print("Try to open in browser: https://twitter.com/search?q=%s&src=typd" % urllib.parse.quote(urlGetData))

        if attempt == 1:
            print("sleep 30")
            time.sleep(30)
        else:
            print("Pass Data")

    if jsonResponse is None:
        return emptyPage

    try:
        s_json = jsonResponse.decode()
    except UnicodeDecodeError:
        print("Invalid response from Twitter")
        print("Error URL:", url)
        return emptyPage

    try:
        return json.loads(s_json)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print("Error parsing JSON: %s" % s_json)
        print("Error URL:", url)
        return emptyPage

In [10]:
# Custom error
class NotValidEndDateError(Exception):
    """Signals an unusable end date for the crawl window.

    Takes no arguments; the instance always carries the same fixed Korean
    message, which asks the user to set the last search date again.
    """

    def __init__(self):
        message = '마지막 검색 날짜를 다시 설정하십시오.'
        super().__init__(message)

In [11]:
# setUntil: the last date is excluded
def set_crawl_date(start_date, end_date):
    """Turn ``YYYYMMDD`` dates into the ``YYYY-MM-DD`` strings used for crawling.

    The end date is moved back by one day before it is validated and returned.

    Args:
        start_date: First date, as an int or str in ``YYYYMMDD`` form.
        end_date: Last date, as an int or str in ``YYYYMMDD`` form.

    Returns:
        A ``(start, end)`` tuple of ``YYYY-MM-DD`` strings, where ``end`` is
        one day earlier than the ``end_date`` passed in.

    Raises:
        NotValidEndDateError: If the shifted end date is not strictly after
            the start date (the window would be empty or inverted).
        ValueError: If either value does not parse as ``YYYYMMDD``.
    """
    start = datetime.datetime.strptime(str(start_date), "%Y%m%d")
    end = datetime.datetime.strptime(str(end_date), "%Y%m%d") - datetime.timedelta(days=1)

    # `<=` rather than `==`: an end date earlier than the start date must be
    # rejected too, not silently turned into an inverted window.
    if end <= start:
        raise NotValidEndDateError

    print("트윗 수집 날짜 설정: {0}부터 {1}까지".format(start, end))
    return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")

In [16]:
# Crawl, then return the got tweet objects
def crawl_tweets(start_date, end_date, query='Elon Musk', lang='en', debug=False):
    """Collect tweets matching ``query`` between ``start_date`` and ``end_date``.

    Installs the module-level ``getJsonResponse`` on
    ``got.manager.TweetManager`` first, then runs the search and prints the
    elapsed time and the number of tweets collected.

    Args:
        start_date: Value passed to ``setSince``.
        end_date: Value passed to ``setUntil``.
        query: Search text passed to ``setQuerySearch``.
        lang: Language code passed to ``setLang``.
        debug: Forwarded to ``TweetManager.getTweets``.

    Returns:
        Whatever ``TweetManager.getTweets`` returns for the criteria.
    """
    manager = got.manager
    manager.TweetManager.getJsonResponse = getJsonResponse

    print(f"========== 트윗 수집 시작: {start_date} ~ {end_date} ==========")
    began_at = time.time()

    # Build the criteria one setter at a time instead of one long chain.
    criteria = manager.TweetCriteria()
    criteria = criteria.setQuerySearch(query)
    criteria = criteria.setSince(start_date)
    criteria = criteria.setUntil(end_date)
    criteria = criteria.setLang(lang)
    collected = manager.TweetManager.getTweets(criteria, debug=debug)

    duration = time.time() - began_at
    clock = time.strftime("%H:%M:%S", time.gmtime(duration))
    print(f"수집 완료 : {clock}")
    print(f"총 수집 트윗 개수 : {len(collected)}")

    return collected

In [17]:
# Extract results from the got tweet objects
def get_results(tweet_data):
    """Convert tweet objects into plain dictionaries.

    Args:
        tweet_data: Iterable of objects exposing ``permalink``, ``date``,
            ``text``, ``username``, ``mentions``, ``retweets``, ``favorites``
            and ``hashtags`` attributes.

    Returns:
        A list with one dict per tweet, keyed ``url``, ``date``, ``text``,
        ``user``, ``mentions``, ``retweets``, ``favorites``, ``hashtags``
        (in that order). Progress is shown through ``tqdm``.
    """
    return [
        {
            'url': item.permalink,
            'date': item.date,
            'text': item.text,
            'user': item.username,
            'mentions': item.mentions,
            'retweets': item.retweets,
            'favorites': item.favorites,
            'hashtags': item.hashtags,
        }
        for item in tqdm(tweet_data)
    ]

In [18]:
# Save the extracted results
def save_tweets(tweet_lists, base_file_dir="tweets"):
    """Append tweet records to a CSV file under ``base_file_dir``.

    The target is ``<base_file_dir>/tweets_<crawl_start>_<crawl_end>.csv``.
    ``crawl_start`` and ``crawl_end`` are NOT parameters: they are read from
    module-level globals and must be defined before this function is called.

    Args:
        tweet_lists: Iterable of dicts containing the keys ``url``, ``date``,
            ``text``, ``user``, ``mentions``, ``retweets``, ``favorites`` and
            ``hashtags``.
        base_file_dir: Output directory; created if it does not exist.

    Returns:
        None.

    Raises:
        KeyError: If a record lacks one of the expected keys.
    """
    columns = ['url', 'date', 'text', 'user', 'mentions', 'retweets', 'favorites', 'hashtags']

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(base_file_dir, exist_ok=True)
    file_path = os.path.join(base_file_dir, f"tweets_{crawl_start}_{crawl_end}.csv")

    # The file is opened in append mode, so only emit the header when the
    # file is new or empty; otherwise every re-run adds another header row.
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

    # newline="" is required by the csv module; without it, platforms that
    # translate line endings produce a blank line after every row.
    with open(file_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(columns)
        for tweet in tqdm(tweet_lists):
            # Select by key so each value lands under its own header column,
            # independent of the dict's insertion order.
            writer.writerow([tweet[column] for column in columns])

    return

In [19]:
# Run the crawl
# set_crawl_date moves the end date back one day, so passing 20170603 yields
# an end of 2017-06-02. crawl_start / crawl_end must remain module-level
# names: save_tweets reads them as globals to build its output file name.
crawl_start, crawl_end = set_crawl_date(20170601, 20170603)
# Fetch the tweets for that window (default query and language).
tweet_results = crawl_tweets(crawl_start, crawl_end)
# Flatten the tweet objects into dicts, then write them to CSV.
tweet_results_lists = get_results(tweet_results)
save_tweets(tweet_results_lists)

트윗 수집 날짜 설정: 2017-06-01 00:00:00부터 2017-06-02 00:00:00까지


100%|██████████████████████████████████████████████████████████████████████████████████████| 4329/4329 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4329/4329 [00:00<00:00, 276343.38it/s]

수집 완료 : 00:10:57
총 수집 트윗 개수 : 4329



