# dataMapの作成

In [1]:
import requests
import bz2
from tqdm.notebook import tqdm_notebook as tqdm
import pandas as pd

In [2]:
url = "http://www.db.info.gifu-u.ac.jp/data/tweets_open.csv.bz2"
filename = "./data/twitter/tweets_open.csv"

In [3]:
chunk_size = 32 * 1024
r = requests.get(url, stream=True, headers={'User-agent': 'Mozilla/5.0'})
decom = bz2.BZ2Decompressor()
with open(filename, 'wb') as f:
    for data in r.iter_content(chunk_size):
        f.write(decom.decompress(data))
r.close()

In [3]:
df = pd.read_csv(filename, 'r', header=None)
df.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,0
0,100251000052240771809136640000110
1,100261000052240776800359219200100
2,100271000052240801864262860900110
3,100281000052240839487167283200010
4,100291000052240845477892915300010


In [4]:
data_list = [item.split(",") for item in df[0].to_list()]
data_list[:3]

[['10025', '10000', '522407718091366400', '0', '0', '1', '1', '0'],
 ['10026', '10000', '522407768003592192', '0', '0', '1', '0', '0'],
 ['10027', '10000', '522408018642628609', '0', '0', '1', '1', '0']]

In [6]:
df = pd.DataFrame(data_list)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,10025,10000,522407718091366400,0,0,1,1,0
1,10026,10000,522407768003592192,0,0,1,0,0
2,10027,10000,522408018642628609,0,0,1,1,0
3,10028,10000,522408394871672832,0,0,0,1,0
4,10029,10000,522408454778929153,0,0,0,1,0


In [7]:
df[df.isna().any(axis=1)]

# df = df.fillna(0).astype('int')
# df.head()

Unnamed: 0,0,1,2,3,4,5,6,7


In [8]:
df.to_csv(filename, header=None, index=None)

# Tweetの取得

In [9]:
import sys
import json
import time
import pickle
import itertools
from requests_oauthlib import OAuth1Session

In [3]:
from dotenv import load_dotenv
load_dotenv(".env")

# 環境変数を参照
import os
CONSUMER_KEY = os.getenv('CONSUMER_KEY')
CONSUMER_SECRET = os.getenv('CONSUMER_SECRET')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')

In [10]:
session = OAuth1Session(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

In [11]:
# Twitetr api v1の場合
url = 'https://api.twitter.com/1.1/statuses/lookup.json'

def get_tweets(tweet_ids):
    tweet_ids = ','.join(tweet_ids)
    res = session.get(url, params = {'id':tweet_ids})  # get tweets

    if res.status_code != 200:
        print ("Twitter API Error: %d" % res.status_code)
        sys.exit(1)

    try:
        res_text = json.loads(res.text)
        data = {rt['id']:rt['text'] for rt in res_text['data']}
    except:
        data = False
    data = {rt['id']:rt['text'] for rt in res_text}
    return data

In [12]:
# Twitetr api v2の場合
url = 'https://api.twitter.com/2/tweets'

def get_tweets_v2(tweet_ids):
    data = False
    tweet_ids = ','.join(tweet_ids)
    res = session.get(url, params = {'ids':tweet_ids})  # get tweets

    if res.status_code != 200:
        print ("Twitter API Error: %d" % res.status_code)
        sys.exit(1)
    try:
        res_text = json.loads(res.text)
        data = {rt['id']:rt['text'] for rt in res_text['data']}
    except:
        pass
    return data

In [13]:
chunk_size = 100

dataset = [line.strip().split(',') for line in open(filename)]
alldata = []

try:
    with tqdm(total=len(dataset)) as pbar:
        for batch in itertools.zip_longest(*[iter(dataset)]*chunk_size):        
            
            batch = [b for b in batch if b is not None]
            tweets = get_tweets_v2([line[2] for line in batch])
            if not tweets:
                continue

            for line in batch:
                data = {'id':int(line[0]),
                        'topic':int(line[1]),
                        'status':int(line[2]),
                        'label':list(map(int, line[3:])),
                        'text':tweets.get(line[2], '')
                        }
                alldata.append(data)
        
            pbar.update(len(batch))
            time.sleep(3)
            
            # if len(alldata) >= 1000:
            #     break

except:
    print("Connection error...")
    pass

pickle.dump(alldata, open('./data/twitter/twitterJSA_data.pickle','wb'))

  0%|          | 0/534963 [00:00<?, ?it/s]

Connection error...


In [14]:
df = pd.DataFrame.from_dict(pd.read_pickle('./data/twitter/twitterJSA_data.pickle'))
df[df['text']!='']

Unnamed: 0,id,topic,status,label,text
0,10025,10000,522407718091366400,"[0, 0, 1, 1, 0]",エクスペリアのGPS南北が逆になるのはデフォだったのか。
1,10026,10000,522407768003592192,"[0, 0, 1, 0, 0]",xperiaでスクフェス糞\n反応遅いんだよ糞が
2,10027,10000,522408018642628609,"[0, 0, 1, 1, 0]",夏春都が持ってたエクスペリアも今使うには辛い
7,10032,10000,522409063154339840,"[0, 0, 0, 1, 0]",少し時間空いちゃいましたが、Xperia Z3のカメラ機能について、ちょっとだけですけどまと...
8,10033,10000,522409073057091584,"[0, 0, 0, 0, 1]",日向「研磨おたおめー。これプレゼント!!」\n孤爪「こ、これは」\n日向「ビビった?」\n孤...
...,...,...,...,...,...
534945,2723562,10021,702909240386584576,"[0, 0, 0, 0, 1]",今さっきカプセルホテルでパスコードとかしてないiPhone6を落としたんだ。\n色々詰んだわ...
534947,2723564,10021,702906549962805248,"[0, 0, 0, 1, 0]",KORG Gadget 、iPhone 6s Plusでじゅうぶん動く。KORG Gadge...
534949,2723932,10021,703558619824926720,"[0, 0, 0, 1, 0]",あ～ケータイが飛んでる～　あれ？ラッキーの顔がiPhone6だ～まあ私のケータイAndroi...
534954,2723937,10021,703557929929015297,"[0, 0, 0, 1, 1]",お風呂上がってぼーっと冷蔵庫の前で\n刑事ドラマの過激なシーンに見とれて\nカバーの付いてな...


In [15]:
# remove no text
df = df.loc[df.text!=''].reset_index(drop=True)
print(f"レコード数: {len(df)}件")

レコード数: 292556件


In [17]:
multi_label_num = 0
for labels in df.label.to_list()[:3]:
    if sum(labels) > 1:
        multi_label_num += 1


[0, 0, 1, 1, 0]
2
[0, 0, 1, 0, 0]
1
[0, 0, 1, 1, 0]
2
