In [27]:
import os
import re
import csv
import pickle
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
def read_table(path):
    print('read', path)
    headers = None
    table = []

    fp = open(path, encoding='utf8', errors='ignore')

    for line in tqdm(fp):
        line = line.strip()
        if line:
            if headers is None:
                headers = line.split(',')
                continue

            row = line.split(',')
            if len(row) == len(headers):
                table.append(row)
#     df = pd.DataFrame(table[:20], columns=headers)
#     emotion = []
#     for x in tqdm(table):
#         text = x[6]
#         m = re.findall(r'\[[\u4e00-\u9fff]{1,3}\]', text)
#         if m:
#             emotion += m
#     sample = [
#         x
#         for x in sorted(list(Counter(emotion).items()), key=lambda x: x[1], reverse=True)
#         if x[1] >= 500
#     ]
#     print(len(sample))
#     print(sample[:10])
    return table

In [3]:
def get_labels():
    labels_dict = {
        'happy': [
            '哈哈',
            '偷笑',
            '嘻嘻',
            '耶',
            '太开心',
            '太開心',
            '(^_^)',
            '(*^__^*)',
            '(^o^)',
            '(^.^)',
            'O(∩_∩)O',
        ],
        'sad': [
            '[泪]',
            '[涙]',
            '[傷心]',
            '[悲傷]',
            '[失望]',
            '[伤心]',
            '(T_T)',
            '(T.T)',
            '(T^T)',
            '(ᅲ.ᅲ)',
            '(╯_╰)',
        ],
        'angry': [
            '[怒]',
            '[抓狂]',
            '[哼]',
            '[左哼哼]',
            '[右哼哼]',
            '[怒骂]',
            '[鄙视]',
            '(╰_╯)',
        ],
        'disgust': [
            '[汗]',
            '[晕]',
            '[暈]',
            '[鄙视]', 
            '[鄙視]', 
            '[黑線]', 
            '[囧]', 
            '[黑线]', 
            '[吐]',
        ],
        'fear': [
            '[可怜]', 
            '[生病]', 
            '[委屈]',
        ],
        'suprise': [
            '[吃惊]', 
            '[吃驚]',
            'OMG',
            '(0.o)',
            '(O_o)',
            '(@_@)',
        ],
        'neutral': [
            '[酷]', 
            '[疑问]', 
            '[握手]', 
            '[疑問]', 
            '[可爱]', 
            '[可愛]', 
            '[微风]', 
            '[浮云]', 
            '[浮雲]', 
            '[兔子]', 
            '[思考]', 
            '[困]',
        ],
    }
    labels = {}
    for k, v in labels_dict.items():
        for vv in v:
            labels[vv] = k
    
    return labels

In [4]:
LABELS = get_labels()

In [5]:
print(LABELS)

{'[囧]': 'disgust', '[汗]': 'disgust', '[生病]': 'fear', '[鄙视]': 'angry', '[可愛]': 'neutral', '[思考]': 'neutral', '[酷]': 'neutral', '[失望]': 'sad', '[黑線]': 'disgust', '(T.T)': 'sad', '太開心': 'happy', '[涙]': 'sad', '[怒]': 'angry', '(O_o)': 'suprise', '[黑线]': 'disgust', '(ᅲ.ᅲ)': 'sad', '[吐]': 'disgust', '[泪]': 'sad', '[兔子]': 'neutral', '[怒骂]': 'angry', '(T_T)': 'sad', '[困]': 'neutral', '[鄙視]': 'disgust', '[可爱]': 'neutral', '太开心': 'happy', '(0.o)': 'suprise', '[伤心]': 'sad', '[微风]': 'neutral', '(╯_╰)': 'sad', '[浮雲]': 'neutral', '偷笑': 'happy', '[疑問]': 'neutral', '[晕]': 'disgust', '[傷心]': 'sad', 'O(∩_∩)O': 'happy', '[右哼哼]': 'angry', '[悲傷]': 'sad', '耶': 'happy', '(*^__^*)': 'happy', '[吃惊]': 'suprise', '[浮云]': 'neutral', 'OMG': 'suprise', '哈哈': 'happy', '(T^T)': 'sad', '(^.^)': 'happy', '[委屈]': 'fear', '[抓狂]': 'angry', '[吃驚]': 'suprise', '[左哼哼]': 'angry', '(@_@)': 'suprise', '[可怜]': 'fear', '[疑问]': 'neutral', '[哼]': 'angry', '(^o^)': 'happy', '嘻嘻': 'happy', '[暈]': 'disgust', '[握手]': 'neutral', '(^_^)'

In [19]:
MIN_LENGTH = 10

def extract_emotion(table):
    emotion_from_table = []
    for x in tqdm(table):
        if len(x) <= 6:
            continue
        text = x[6]
        # 转发取最后一个
        if '//' in text:
            text = text.split('//')[-1]
        # remove at someone
        text = re.sub(r'@[a-zA-Z_0-9\u4e00-\u9fff]+[:：]*(\s+|$)', '', text)
        # remove hash
        text = re.sub(r'#[^#]+#\s*', '', text)
        # without face
        text_without_face = re.sub(r'\s*\[[^\]]+\]\s*', '', text)

        m = re.findall(r'[\u4e00-\u9fff]+', text_without_face)

        if len(m) < MIN_LENGTH:
            continue

        count = 0
        signal_in_text = []
        tag_in_text = None
        for signal, tag in LABELS.items():
            if signal in text:
                if tag == tag_in_text:
                    signal_in_text.append(signal)
                    continue
                elif count >= 1:
                    signal_in_text = None
                    tag_in_text = None
                    break
                else:
                    count += 1
                    signal_in_text.append(signal)
                    tag_in_text = tag

        if tag_in_text is not None:
            clean_text = text_without_face
            for s in signal_in_text:
                clean_text = clean_text.replace(s, '')
            emotion_from_table.append((
                clean_text, text, tag_in_text, signal_in_text))
    return emotion_from_table

In [23]:
root_dir = '/media/qhduan/Seagate Expansion Drive/DATASETS/weibo/'

paths = []
for dirpath, _, filenames in os.walk(root_dir):
    paths += [
        os.path.join(dirpath, x)
        for x in filenames
        if x.endswith('.csv') and 'week' in x
    ]
paths = sorted(paths)

In [24]:
print(len(paths))

52


In [25]:
data = []
for path in paths:
    table = read_table(path)
    emotion_from_table = extract_emotion(table)
    print(len(emotion_from_table))
    data += emotion_from_table

45398it [00:00, 453915.50it/s]

read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week1.csv


4790111it [00:16, 296170.25it/s]
100%|██████████| 4633268/4633268 [00:26<00:00, 172901.54it/s]
0it [00:00, ?it/s]

19083
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week10.csv


3882568it [00:13, 286975.40it/s]
100%|██████████| 3752958/3752958 [00:21<00:00, 174759.86it/s]
0it [00:00, ?it/s]

18040
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week11.csv


4082840it [00:13, 296958.04it/s]
100%|██████████| 3950667/3950667 [00:23<00:00, 164860.23it/s]
0it [00:00, ?it/s]

18371
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week12.csv


5057574it [00:15, 327645.45it/s]
100%|██████████| 4923979/4923979 [00:26<00:00, 183318.86it/s]
0it [00:00, ?it/s]

19950
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week13.csv


3612673it [00:12, 281723.10it/s]
100%|██████████| 3488840/3488840 [00:20<00:00, 173314.03it/s]
37065it [00:00, 370602.64it/s]

17577
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week14.csv


2883248it [00:09, 307001.50it/s]
100%|██████████| 2780423/2780423 [00:16<00:00, 167529.09it/s]
28671it [00:00, 286673.37it/s]

13956
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week15.csv


3018789it [00:08, 340510.87it/s]
100%|██████████| 2908859/2908859 [00:16<00:00, 177807.98it/s]
0it [00:00, ?it/s]

15011
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week16.csv


2716455it [00:08, 315055.91it/s]
100%|██████████| 2611911/2611911 [00:14<00:00, 175612.56it/s]
40608it [00:00, 405933.28it/s]

14542
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week17.csv


3636668it [00:13, 274428.45it/s]
100%|██████████| 3515449/3515449 [00:18<00:00, 188037.43it/s]
0it [00:00, ?it/s]

15444
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week18.csv


3626054it [00:12, 299339.10it/s]
100%|██████████| 3507605/3507605 [00:18<00:00, 192915.52it/s]
0it [00:00, ?it/s]

15069
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week19.csv


4277841it [00:13, 323146.51it/s]
100%|██████████| 4146731/4146731 [00:20<00:00, 198083.99it/s]
0it [00:00, ?it/s]

16283
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week2.csv


4868594it [00:14, 334576.61it/s]
100%|██████████| 4712455/4712455 [00:24<00:00, 189527.81it/s]
0it [00:00, ?it/s]

19521
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week20.csv


3865475it [00:12, 304420.15it/s]
100%|██████████| 3742022/3742022 [00:19<00:00, 192505.09it/s]
0it [00:00, ?it/s]

16022
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week21.csv


4969460it [00:14, 334555.92it/s]
100%|██████████| 4825683/4825683 [00:24<00:00, 196904.15it/s]
0it [00:00, ?it/s]

17908
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week22.csv


6639070it [00:21, 302766.81it/s]
100%|██████████| 6469177/6469177 [00:31<00:00, 203832.60it/s]
39692it [00:00, 396878.74it/s]

21973
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week23.csv


5067321it [00:18, 274792.73it/s]
100%|██████████| 4925471/4925471 [00:27<00:00, 180458.92it/s]
0it [00:00, ?it/s]

21075
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week24.csv


6491779it [00:19, 335013.48it/s]
100%|██████████| 6318912/6318912 [00:34<00:00, 182305.57it/s]
27829it [00:00, 278251.79it/s]

20733
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week25.csv


5710391it [00:22, 248317.10it/s]
100%|██████████| 5549420/5549420 [00:29<00:00, 185414.25it/s]
42895it [00:00, 428710.21it/s]

20637
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week26.csv


6032573it [00:14, 408994.00it/s]
100%|██████████| 5875379/5875379 [00:32<00:00, 180625.63it/s]
48643it [00:00, 486363.21it/s]

19928
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week27.csv


6319898it [00:19, 322707.19it/s]
100%|██████████| 6156266/6156266 [00:32<00:00, 187788.80it/s]
0it [00:00, ?it/s]

24622
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week28.csv


5541848it [00:18, 304606.98it/s]
100%|██████████| 5384880/5384880 [00:28<00:00, 187230.33it/s]
0it [00:00, ?it/s]

21068
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week29.csv


4499735it [00:14, 308800.11it/s]
100%|██████████| 4369676/4369676 [00:24<00:00, 179288.37it/s]
0it [00:00, ?it/s]

17388
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week3.csv


3892054it [00:12, 305094.82it/s]
100%|██████████| 3759701/3759701 [00:22<00:00, 168561.83it/s]
8392it [00:00, 83906.68it/s]

18015
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week30.csv


4825459it [00:15, 309895.94it/s]
100%|██████████| 4700212/4700212 [00:27<00:00, 173489.48it/s]
0it [00:00, ?it/s]

17038
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week31.csv


4382999it [00:16, 273414.01it/s]
100%|██████████| 4244847/4244847 [00:25<00:00, 163577.57it/s]
0it [00:00, ?it/s]

17972
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week32.csv


3774902it [00:10, 345085.29it/s]
100%|██████████| 3663870/3663870 [00:20<00:00, 176871.01it/s]
43821it [00:00, 438085.08it/s]

18932
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week33.csv


5001462it [00:19, 255211.40it/s]
100%|██████████| 4795119/4795119 [00:27<00:00, 173147.00it/s]
0it [00:00, ?it/s]

21364
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week34.csv


5802357it [00:18, 306239.19it/s]
100%|██████████| 5624962/5624962 [00:33<00:00, 168602.76it/s]
0it [00:00, ?it/s]

22984
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week35.csv


3961741it [00:12, 304762.24it/s]
100%|██████████| 3835442/3835442 [00:26<00:00, 145559.97it/s]
0it [00:00, ?it/s]

16832
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week36.csv


3263539it [00:11, 282473.92it/s]
100%|██████████| 3149346/3149346 [00:19<00:00, 159338.99it/s]
40544it [00:00, 405373.70it/s]

16230
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week37.csv


3331673it [00:12, 274466.53it/s]
100%|██████████| 3215910/3215910 [00:20<00:00, 158383.02it/s]
583it [00:00, 4845.24it/s]

15942
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week38.csv


3296945it [00:13, 245388.70it/s]
100%|██████████| 3188497/3188497 [00:19<00:00, 161911.60it/s]
5874it [00:00, 58722.56it/s]

15019
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week39.csv


3168355it [00:10, 301035.31it/s]
100%|██████████| 3063956/3063956 [00:19<00:00, 160608.75it/s]
0it [00:00, ?it/s]

16226
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week4.csv


4005728it [00:13, 297304.86it/s]
100%|██████████| 3814910/3814910 [00:20<00:00, 183332.41it/s]
0it [00:00, ?it/s]

20578
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week40.csv


2541893it [00:07, 334264.39it/s]
100%|██████████| 2450431/2450431 [00:16<00:00, 151133.61it/s]
33396it [00:00, 333918.92it/s]

12196
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week41.csv


3355310it [00:13, 247679.96it/s]
100%|██████████| 3233386/3233386 [00:20<00:00, 160951.02it/s]
0it [00:00, ?it/s]

16627
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week42.csv


3477200it [00:09, 360214.08it/s]
100%|██████████| 3358632/3358632 [00:21<00:00, 156521.50it/s]
44518it [00:00, 445120.99it/s]

16822
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week43.csv


3705655it [00:12, 285712.33it/s]
100%|██████████| 3570565/3570565 [00:22<00:00, 161113.03it/s]
0it [00:00, ?it/s]

17842
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week44.csv


3790660it [00:14, 259708.09it/s]
100%|██████████| 3671284/3671284 [00:22<00:00, 164774.67it/s]
15289it [00:00, 152873.38it/s]

18411
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week45.csv


4249807it [00:11, 375144.17it/s]
100%|██████████| 4115190/4115190 [00:24<00:00, 170283.33it/s]
27049it [00:00, 270431.58it/s]

20317
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week46.csv


4063314it [00:12, 317996.77it/s]
100%|██████████| 3940681/3940681 [00:24<00:00, 163862.41it/s]
27098it [00:00, 270944.08it/s]

21067
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week47.csv


4748598it [00:18, 262804.55it/s]
100%|██████████| 4604138/4604138 [00:27<00:00, 168796.74it/s]
0it [00:00, ?it/s]

22995
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week48.csv


4960386it [00:16, 306011.97it/s]
100%|██████████| 4808294/4808294 [00:27<00:00, 172897.67it/s]
0it [00:00, ?it/s]

24569
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week49.csv


5045252it [00:13, 384751.01it/s]
100%|██████████| 4891398/4891398 [00:30<00:00, 160523.31it/s]
0it [00:00, ?it/s]

24413
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week5.csv


3493571it [00:12, 289313.40it/s]
100%|██████████| 3368836/3368836 [00:18<00:00, 177608.49it/s]
45726it [00:00, 457201.57it/s]

18228
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week50.csv


5423924it [00:19, 280264.83it/s]
100%|██████████| 5258672/5258672 [00:31<00:00, 168037.38it/s]
46692it [00:00, 466863.68it/s]

26373
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week51.csv


5296045it [00:13, 394276.03it/s]
100%|██████████| 5123574/5123574 [00:32<00:00, 159925.85it/s]
0it [00:00, ?it/s]

27686
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week52.csv


4906327it [00:15, 318663.82it/s]
100%|██████████| 4751433/4751433 [00:32<00:00, 147793.23it/s]
0it [00:00, ?it/s]

26531
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week6.csv


4037123it [00:12, 334161.31it/s]
100%|██████████| 3904823/3904823 [00:24<00:00, 162088.65it/s]
15717it [00:00, 150553.66it/s]

18161
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week7.csv


4697486it [00:15, 296346.17it/s]
100%|██████████| 4556239/4556239 [00:27<00:00, 162989.39it/s]
0it [00:00, ?it/s]

19516
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week8.csv


4739982it [00:19, 239152.73it/s]
100%|██████████| 4595408/4595408 [00:26<00:00, 174398.24it/s]
0it [00:00, ?it/s]

18540
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week9.csv


4010777it [00:12, 315467.09it/s]
100%|██████████| 3878452/3878452 [00:23<00:00, 163804.89it/s]

17527





In [26]:
print(len(data))

989154


In [28]:
with open('data.pkl', 'wb') as fp:
    pickle.dump(data, fp)