In [None]:
%matplotlib inline
import twitter, json, operator
import math
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from pytz import timezone
from datetime import datetime

# Korean font: register NanumBarunGothic as the default family for plot text
path = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
fontprop = fm.FontProperties(fname=path, size=18)
matplotlib.rc('font', family=fontprop.get_name(), size=14)

# Plot defaults shared by every figure in this notebook
plt.rcParams.update({
    "figure.figsize": (15,8),
    'lines.linewidth': 4,
    'axes.grid': True,
})

### Twitter API credentials (left empty here; they must be filled in before running)
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''

# Build the API client that the search cell uses
auth = twitter.oauth.OAuth(
    OAUTH_TOKEN,
    OAUTH_TOKEN_SECRET,
    CONSUMER_KEY,
    CONSUMER_SECRET,
)
twitter_api = twitter.Twitter(auth=auth)
print(twitter_api)

In [None]:
def get_date_distribution_from_list(list1, list2, n, option):
    """Count two collections of dates into ``n`` shared, evenly spaced bins.

    Args:
        list1, list2: Non-empty lists of mutually comparable values that
            support subtraction and scaling of the difference (the notebook
            passes timezone-aware datetimes).
        n: Number of points on the x axis (must be >= 1).
        option: 1 spans the union of both periods (earliest start to latest
            end); 2 spans their intersection (latest start to earliest end).

    Returns:
        ``(x, y1, y2)`` where ``x`` holds the ``n`` bin start values and
        ``y1`` / ``y2`` hold the per-bin counts for ``list1`` / ``list2``.
        Bin ``i`` counts values with ``x[i] <= value < x[i+1]``; the last bin
        counts every value ``>= x[n-1]``. Values below ``x[0]`` (possible with
        option 2) are not counted.

    Raises:
        ValueError: if ``option`` is not 1 or 2, or ``n`` is smaller than 1.
    """
    # Fail early: previously an unknown option left the bounds at integer 0 and
    # surfaced later as an unrelated comparison error.
    if option not in (1, 2):
        raise ValueError("option must be 1 (union) or 2 (intersection), got %r" % (option,))
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))

    list1_newest = max(list1)
    list1_oldest = min(list1)
    list2_newest = max(list2)
    list2_oldest = min(list2)

    if option == 1:
        # Union: cover everything either list covers.
        newest_y = max(list1_newest, list2_newest)
        oldest_y = min(list1_oldest, list2_oldest)
    else:
        # Intersection: cover only the period both lists share.
        newest_y = min(list1_newest, list2_newest)
        oldest_y = max(list1_oldest, list2_oldest)

    # Build the x axis: n points spaced x_delta apart, starting at oldest_y.
    if n == 1:
        # A single point has no spacing; avoids dividing by n-1 == 0.
        x = [oldest_y]
    else:
        x_delta = (newest_y - oldest_y) / (n-1)
        x = [oldest_y + x_delta*i for i in range(n)]

    def _tally(values):
        # Count values per bin using the rule described in the docstring.
        counts = [0]*n
        for value in values:
            if value >= x[n-1]:
                counts[n-1] += 1
                continue
            for i in range(n-1):
                if x[i] <= value < x[i+1]:
                    counts[i] += 1
                    break
        return counts

    return x, _tally(list1), _tally(list2)

def get_number_distribution_from_list(list1, list2, n, option):
    """Count two collections of numbers into ``n`` shared, evenly spaced bins.

    Args:
        list1, list2: Non-empty lists of numbers.
        n: Number of points on the x axis (must be >= 1).
        option: 2 bins ``log10`` of every value (a value of 0 is mapped to 0,
            i.e. it lands where a value of 1 does); any other value bins the
            raw numbers.

    Returns:
        ``(x, y1, y2)`` where ``x`` holds the ``n`` bin start values between
        the overall minimum and maximum, and ``y1`` / ``y2`` hold the per-bin
        counts for ``list1`` / ``list2``. Bin ``i`` counts values with
        ``x[i] <= value < x[i+1]``; the last bin counts values ``>= x[n-1]``.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))

    if option == 2:
        # log10(0) is undefined, so zeros are pinned to 0 on the log axis.
        list1 = [math.log10(v) if v != 0 else 0 for v in list1]
        list2 = [math.log10(v) if v != 0 else 0 for v in list2]

    # Overall value range across both lists.
    max_y = max(max(list1), max(list2))
    min_y = min(min(list1), min(list2))

    # Build the x axis: n points spaced x_delta apart, starting at min_y.
    if n == 1:
        # A single point has no spacing; avoids dividing by n-1 == 0.
        x = [min_y]
    else:
        x_delta = (max_y - min_y) / (n-1)
        x = [min_y + x_delta*i for i in range(n)]

    def _tally(values):
        # Count values per bin using the rule described in the docstring.
        counts = [0]*n
        for value in values:
            if value >= x[n-1]:
                counts[n-1] += 1
                continue
            for i in range(n-1):
                if x[i] <= value < x[i+1]:
                    counts[i] += 1
                    break
        return counts

    return x, _tally(list1), _tally(list2)

def utc_to_local(utc_dt):
    """Return ``utc_dt`` expressed in the Asia/Seoul timezone.

    The input's tzinfo is first replaced with UTC (whatever it was before),
    then the result is converted to Asia/Seoul.
    """
    as_utc = utc_dt.replace(tzinfo=timezone('UTC'))
    seoul = timezone('Asia/Seoul')
    return as_utc.astimezone(tz=seoul)

def remove_duplication(lists, key):
    """Remove, in place, every item whose ``item[key]`` was already seen.

    The first item for each key value is kept and the relative order of the
    kept items is preserved.

    Args:
        lists: List of indexable items (dicts or tuples in this notebook);
            it is modified in place.
        key: Index or dict key identifying the value to de-duplicate on.
            The looked-up values must be hashable.

    Returns:
        None.
    """
    seen = set()
    kept = []
    for item in lists:
        marker = item[key]
        if marker in seen:
            continue
        seen.add(marker)
        kept.append(item)
    # Slice assignment keeps the caller's list object. The previous
    # lists.remove(v) deleted the first element *equal* to v, which could be
    # the earlier kept occurrence and therefore reordered the list.
    lists[:] = kept

In [None]:
### Collect recent statuses for each search keyword
keyword1 = '선거'
keyword2 = '동물의숲'

count = 100

def search_recent_statuses(keyword, count=100, extra_pages=9, label='statuses'):
    """Search recent tweets for ``keyword`` and follow the result pagination.

    Args:
        keyword: Search query string.
        count: Number of tweets requested on the first page.
        extra_pages: Maximum number of follow-up pages to request.
        label: Name used in the progress printout.

    Returns:
        List of status dicts from the first page plus every follow-up page.
    """
    search_results = twitter_api.search.tweets(q=keyword, count=count, result_type='recent')
    statuses = list(search_results['statuses'])

    for _ in range(extra_pages):
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError: # No more results when next_results doesn't exist
            print(search_results['search_metadata'])
            break
        # next_results is a query string; drop its first character
        # (presumably the leading '?') and turn the rest into keyword arguments.
        # maxsplit=1 keeps a value intact even if it contains '='.
        kwargs = dict(kv.split('=', 1) for kv in next_results[1:].split("&"))
        # Use the original keyword instead of the q value taken from next_results.
        kwargs['q'] = keyword
        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']
        print("Length of " + label, len(statuses))

    return statuses

# First page of 100 tweets plus up to 9 follow-up pages per keyword
statuses1 = search_recent_statuses(keyword1, count=count, label='statuses1')
statuses2 = search_recent_statuses(keyword2, count=count, label='statuses2')

In [None]:
### Extract the author of every status
users1 = list(map(operator.itemgetter('user'), statuses1))
users2 = list(map(operator.itemgetter('user'), statuses2))

### Keep one entry per user id
for user_list in (users1, users2):
    remove_duplication(user_list, 'id')

### Extract the entities of every status
entities1 = list(map(operator.itemgetter('entities'), statuses1))
entities2 = list(map(operator.itemgetter('entities'), statuses2))

# Extract the hashtag lists, skipping statuses without hashtags
entities_hashtags1 = [ tags for tags in map(operator.itemgetter('hashtags'), entities1) if tags ]
entities_hashtags2 = [ tags for tags in map(operator.itemgetter('hashtags'), entities2) if tags ]

### Extract retweets
# Each row is (retweet count, original author, tweet text, original tweet id)
retweets1 = []
for status in statuses1:
    if 'retweeted_status' not in status:
        continue
    original = status['retweeted_status']
    retweets1.append((status['retweet_count'], original['user'], status['text'], original['id']))

retweets2 = []
for status in statuses2:
    if 'retweeted_status' not in status:
        continue
    original = status['retweeted_status']
    retweets2.append((status['retweet_count'], original['user'], status['text'], original['id']))

# Sort by retweet count, highest first
top_retweets1 = sorted(retweets1, key=lambda row: row[0], reverse=True)
top_retweets2 = sorted(retweets2, key=lambda row: row[0], reverse=True)

# Keep one row per original tweet id
for ranked in (top_retweets1, top_retweets2):
    remove_duplication(ranked, 3)

In [None]:
# When were the tweets posted?
created_at_format = '%a %b %d %H:%M:%S %z %Y'
statuses_created_at1 = [
    utc_to_local(datetime.strptime(tweet['created_at'], created_at_format))
    for tweet in statuses1
]
statuses_created_at2 = [
    utc_to_local(datetime.strptime(tweet['created_at'], created_at_format))
    for tweet in statuses2
]

# 10 points over the period covered by both keywords (option 2)
x, y1, y2 = get_date_distribution_from_list(statuses_created_at1, statuses_created_at2, 10, 2)

plt.figure(1)
plt.title("특정 키워드가 포함된 트윗의 시간대별 생성 수")
for series, line_color, series_label in ((y1, 'orange', keyword1), (y2, 'lightgrey', keyword2)):
    plt.plot(x, series, 'o-', color=line_color, label=series_label)
plt.xlabel('Datetime')
plt.ylabel('# of Tweets')

# Write each count slightly above its marker
for series in (y1, y2):
    for x_pos, tally in zip(x, series):
        plt.annotate(tally, (x_pos, tally+2), ha='center')

plt.legend(prop={'family':fontprop.get_name()}, loc='upper right')
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
# When were the authors' accounts created?
created_at_format = '%a %b %d %H:%M:%S %z %Y'
users_created1 = [
    utc_to_local(datetime.strptime(author['created_at'], created_at_format))
    for author in users1
]
users_created2 = [
    utc_to_local(datetime.strptime(author['created_at'], created_at_format))
    for author in users2
]

# 10 points over the period covered by either keyword (option 1)
x, y1, y2 = get_date_distribution_from_list(users_created1, users_created2, 10, 1)

# Turn the per-bin counts into running totals (cumulative distribution)
for i in range(1, len(y1)):
    y1[i] += y1[i-1]
    y2[i] += y2[i-1]

plt.figure(2)
plt.title("키워드가 포함된 트윗 작성자의 계정 생성 시기 누적 분포")
for series, line_color, series_label in ((y1, 'orange', keyword1), (y2, 'lightgrey', keyword2)):
    plt.plot(x, series, 'o-', color=line_color, label=series_label)

# Label every point; the second series is lifted by 10
for series, lift in ((y1, 0), (y2, 10)):
    for x_pos, tally in zip(x, series):
        plt.annotate(tally, (x_pos, tally+lift), ha='center')

plt.xlabel('Account created at [year]')
plt.ylabel('# of Users')
plt.legend(prop={'family':fontprop.get_name()}, loc='upper left')
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
# How many tweets has each author posted?
users_statuses_count1 = [ author['statuses_count'] for author in users1 ]
users_statuses_count2 = [ author['statuses_count'] for author in users2 ]

# 10 bins on a log10 scale (option 2)
x, y1, y2 = get_number_distribution_from_list(users_statuses_count1, users_statuses_count2, 10, 2)

# The first keyword's bars sit 0.2 to the left of the second keyword's bars
shifted_x = [xx-0.2 for xx in x]

plt.figure(3)
plt.title("키워드가 포함된 트윗 작성자의 전체 트윗 수 분포")
plt.bar(shifted_x, y1, width=0.2, color='orange', align='center', label=keyword1)
plt.bar(x, y2, width=0.2, color='lightgrey', align='center', label=keyword2)

# Put each count on top of its bar
for bar_positions, series in ((shifted_x, y1), (x, y2)):
    for x_pos, tally in zip(bar_positions, series):
        plt.annotate(tally, (x_pos, tally), ha='center')

plt.xlabel('# of tweets [log]')
plt.ylabel('# of users')
plt.legend(prop={'family':fontprop.get_name()}, loc='upper right')
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
# How many followers does each author have?
users_followers_count1 = [ author['followers_count'] for author in users1 ]
users_followers_count2 = [ author['followers_count'] for author in users2 ]

# 10 bins on a log10 scale (option 2)
x, y1, y2 = get_number_distribution_from_list(users_followers_count1, users_followers_count2, 10, 2)

# The first keyword's bars sit 0.2 to the left of the second keyword's bars
shifted_x = [xx-0.2 for xx in x]

plt.figure(4)
plt.title("키워드가 포함된 트윗 작성자의 팔로워 수 분포")
plt.bar(shifted_x, y1, width=0.2, color='orange', align='center', label=keyword1)
plt.bar(x, y2, width=0.2, color='lightgrey', align='center', label=keyword2)

# Put each count on top of its bar
for bar_positions, series in ((shifted_x, y1), (x, y2)):
    for x_pos, tally in zip(bar_positions, series):
        plt.annotate(tally, (x_pos, tally), ha='center')

plt.xlabel('# of followers [log]')
plt.ylabel('# of users')
plt.legend(prop={'family':fontprop.get_name()}, loc='upper right')
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
# Which hashtags were used most often?
# Count the occurrences of every hashtag text, per keyword
hashtags1 = dict()
hashtags2 = dict()

for tag_counts, tag_lists in ((hashtags1, entities_hashtags1), (hashtags2, entities_hashtags2)):
    for tag in tag_lists:
        for t in tag:
            tag_counts[t['text']] = tag_counts.get(t['text'], 0) + 1

# Sort by count, highest first, then split into label/count lists for plotting
hashtags1 = dict(sorted(hashtags1.items(), key=lambda item: item[1], reverse=True))
hashtags2 = dict(sorted(hashtags2.items(), key=lambda item: item[1], reverse=True))

x1 = list(hashtags1.keys())
y1 = list(hashtags1.values())

x2 = list(hashtags2.keys())
y2 = list(hashtags2.values())

# Plot TOP N
N = 10

for figure_no, plot_keyword, tag_labels, tag_totals, bar_color in (
    (5, keyword1, x1, y1, 'orange'),
    (6, keyword2, x2, y2, 'lightgrey'),
):
    top_labels = tag_labels[:N]
    top_totals = tag_totals[:N]
    # Size the positions from the data: with fewer than N distinct hashtags a
    # fixed range(N) no longer matches the number of bars and tick labels.
    y_pos = range(len(top_totals))

    plt.figure(figure_no)
    plt.title("키워드 ["+plot_keyword+"] 가 포함된 트윗에 쓰인 해시태그 TOP" + str(N))
    plt.bar(y_pos, top_totals, color=bar_color)
    plt.xticks(y_pos, top_labels)
    plt.gcf().autofmt_xdate()
    plt.show()

In [None]:
# Top 10 retweets per keyword
top_retweets1 = top_retweets1[:10]
top_retweets2 = top_retweets2[:10]

# Print rank, retweet count and original author, followed by the tweet text
separator = "==================================================="
for ranked in (top_retweets1, top_retweets2):
    i = 0
    for v in ranked:
        i += 1
        retweet_total, original_author, tweet_text = v[0], v[1], v[2]
        print("#",i,":",retweet_total," @",original_author['screen_name'])
        print(tweet_text)
        print(separator)

In [None]:
# Account-creation distribution of the original authors of the top 10 retweets
### Extract the original author from every retweet row (this rebinds users1/users2)
users1 = list(map(operator.itemgetter(1), top_retweets1))
users2 = list(map(operator.itemgetter(1), top_retweets2))

# Keep one entry per user id
for user_list in (users1, users2):
    remove_duplication(user_list, 'id')

# When were these accounts created?
created_at_format = '%a %b %d %H:%M:%S %z %Y'
users_created1 = [
    utc_to_local(datetime.strptime(author['created_at'], created_at_format))
    for author in users1
]
users_created2 = [
    utc_to_local(datetime.strptime(author['created_at'], created_at_format))
    for author in users2
]

# 10 points over the period covered by either keyword (option 1)
x, y1, y2 = get_date_distribution_from_list(users_created1, users_created2, 10, 1)

# Turn the per-bin counts into running totals (cumulative distribution)
for i in range(1, len(y1)):
    y1[i] += y1[i-1]
    y2[i] += y2[i-1]

plt.figure(7)
plt.title("TOP10 리트윗 원작성자의 계정 생성 시기 누적 분포")
for series, line_color, series_label in ((y1, 'orange', keyword1), (y2, 'lightgrey', keyword2)):
    plt.plot(x, series, 'o-', color=line_color, label=series_label)

# Label every point with its running total
for series in (y1, y2):
    for x_pos, tally in zip(x, series):
        plt.annotate(tally, (x_pos, tally), ha='center')

plt.xlabel('Account created at [year]')
plt.ylabel('# of Users')
plt.legend(prop={'family':fontprop.get_name()}, loc='upper left')
plt.gcf().autofmt_xdate()
plt.show()