In [1]:
import os
import re
import csv
import pickle
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
def read_table(path):
    """Read a comma-separated text file into a list of rows.

    The first non-blank line is taken as the header. Every later non-blank
    line is split on ',' and kept only when it yields exactly as many pieces
    as the header; all other lines are silently dropped. No CSV quoting is
    interpreted, so a record whose fields contain commas will not match the
    header width and is discarded.

    Args:
        path: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A list of rows, each a list of strings. The header row is not
        included.
    """
    print('read', path)
    headers = None
    table = []

    # The context manager guarantees the handle is closed, even when
    # iteration raises part-way through a multi-million-line file.
    with open(path, encoding='utf8', errors='ignore') as fp:
        for line in tqdm(fp):
            line = line.strip()
            if not line:
                continue

            row = line.split(',')
            if headers is None:
                # The first non-blank line defines the expected column count.
                headers = row
                continue

            if len(row) == len(headers):
                table.append(row)
    return table

In [3]:
def get_labels():
    """Return a flat lookup from emoticon/keyword signal to emotion tag.

    Each emotion tag is paired with the signals that indicate it; the result
    maps every individual signal string to its tag.

    Returns:
        dict mapping signal string -> tag string.
    """
    signals_by_tag = (
        ('happy', (
            '哈哈', '偷笑', '嘻嘻', '耶', '太开心', '太開心',
            '(^_^)', '(*^__^*)', '(^o^)', '(^.^)', 'O(∩_∩)O',
        )),
        ('sad', (
            '[泪]', '[涙]', '[傷心]', '[悲傷]', '[失望]', '[伤心]',
            '(T_T)', '(T.T)', '(T^T)', '(ᅲ.ᅲ)', '(╯_╰)',
        )),
        # NOTE(review): '[鄙视]' is listed under both 'angry' and 'disgust'.
        # Later pairs overwrite earlier ones, so it ends up tagged 'disgust'.
        ('angry', (
            '[怒]', '[抓狂]', '[哼]', '[左哼哼]', '[右哼哼]', '[怒骂]',
            '[鄙视]', '(╰_╯)',
        )),
        ('disgust', (
            '[汗]', '[晕]', '[暈]', '[鄙视]', '[鄙視]', '[黑線]',
            '[囧]', '[黑线]', '[吐]',
        )),
        ('fear', (
            '[可怜]', '[生病]', '[委屈]',
        )),
        ('suprise', (
            '[吃惊]', '[吃驚]', 'OMG', '(0.o)', '(O_o)', '(@_@)',
        )),
        # Disabled category, kept for reference:
        # ('neutral', (
        #     '[酷]', '[疑问]', '[握手]', '[疑問]', '[可爱]', '[可愛]',
        #     '[微风]', '[浮云]', '[浮雲]', '[兔子]', '[思考]', '[困]',
        # )),
    )

    # Flatten tag -> signals into signal -> tag; on a duplicate signal the
    # last tag wins while the key keeps its first insertion position.
    return {
        signal: tag
        for tag, signals in signals_by_tag
        for signal in signals
    }

In [4]:
# Build the signal -> tag lookup once; extract_emotion reads this module-level name.
LABELS = get_labels()

In [5]:
# Show the complete signal -> tag mapping as a sanity check.
print(LABELS)

{'(^.^)': 'happy', '(T_T)': 'sad', '[生病]': 'fear', '耶': 'happy', '哈哈': 'happy', '[鄙视]': 'disgust', '(^o^)': 'happy', '(0.o)': 'suprise', '(╰_╯)': 'angry', '[鄙視]': 'disgust', '[晕]': 'disgust', '[吃驚]': 'suprise', '(T.T)': 'sad', 'O(∩_∩)O': 'happy', '太開心': 'happy', '[怒骂]': 'angry', '[抓狂]': 'angry', '[涙]': 'sad', '[悲傷]': 'sad', '太开心': 'happy', '[黑線]': 'disgust', '(T^T)': 'sad', '(ᅲ.ᅲ)': 'sad', '[伤心]': 'sad', '[囧]': 'disgust', '[委屈]': 'fear', '[泪]': 'sad', '[傷心]': 'sad', '(*^__^*)': 'happy', '(^_^)': 'happy', '[哼]': 'angry', '[吃惊]': 'suprise', '嘻嘻': 'happy', '[左哼哼]': 'angry', '[可怜]': 'fear', '(╯_╰)': 'sad', '[黑线]': 'disgust', '偷笑': 'happy', '[汗]': 'disgust', '(@_@)': 'suprise', '[暈]': 'disgust', '[吐]': 'disgust', '[失望]': 'sad', '[怒]': 'angry', '[右哼哼]': 'angry', 'OMG': 'suprise', '(O_o)': 'suprise'}


In [6]:
# Threshold used by extract_emotion: a post is skipped when, after bracketed
# emoticons are removed, it contains fewer than this many contiguous runs of
# characters in U+4E00-U+9FFF. Note it counts runs, not individual characters.
MIN_LENGTH = 10

def extract_emotion(table):
    """Pick out rows whose text carries signals of exactly one emotion tag.

    For each row with more than six columns, column 6 is treated as the post
    text. The text is reduced to its last '//'-separated segment, @-mentions
    and #hashtag# spans are stripped, and a second copy has bracketed
    emoticons removed. Posts with fewer than MIN_LENGTH runs of characters in
    U+4E00-U+9FFF in that copy are skipped. A post is kept only when every
    LABELS signal found in the text maps to the same tag.

    Args:
        table: Iterable of rows (sequences of strings).

    Returns:
        A list of tuples (clean_text, text, tag, signals):
        clean_text is the emoticon-free text with the matched signals removed,
        text is the post after mention/hashtag stripping, tag is the single
        emotion tag, and signals lists the matched signals in LABELS order.
    """
    results = []
    for row in tqdm(table):
        # Rows without a column 6 carry no text to inspect.
        if len(row) < 7:
            continue

        # For reposts, keep only the last segment.
        # NOTE(review): any '//' triggers this split, including the one in
        # 'http://' URLs - confirm that is acceptable.
        text = row[6].split('//')[-1]
        # Remove @-mentions.
        text = re.sub(r'@[a-zA-Z_0-9\u4e00-\u9fff]+[:：]*(\s+|$)', '', text)
        # Remove #hashtag# spans.
        text = re.sub(r'#[^#]+#\s*', '', text)
        # Copy of the text with bracketed emoticons removed.
        faceless = re.sub(r'\s*\[[^\]]+\]\s*', '', text)

        # NOTE(review): this counts runs of consecutive characters, not
        # characters - confirm that is the intended meaning of MIN_LENGTH.
        if len(re.findall(r'[\u4e00-\u9fff]+', faceless)) < MIN_LENGTH:
            continue

        # Every signal present in the text, together with its tag.
        hits = [(signal, tag) for signal, tag in LABELS.items() if signal in text]
        distinct_tags = {tag for _, tag in hits}
        # Skip posts with no signal at all or with conflicting emotions.
        if len(distinct_tags) != 1:
            continue
        (emotion,) = distinct_tags
        signals = [signal for signal, _ in hits]

        cleaned = faceless
        for signal in signals:
            cleaned = cleaned.replace(signal, '')
        results.append((cleaned, text, emotion, signals))
    return results

In [7]:
# Directory that is searched recursively for the input CSV files.
root_dir = '/media/qhduan/Seagate Expansion Drive/DATASETS/weibo/'

# Every '.csv' file below root_dir whose name contains 'week', in plain
# string order of the full path (so 'week10.csv' sorts before 'week2.csv').
paths = sorted(
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(root_dir)
    for name in names
    if name.endswith('.csv') and 'week' in name
)

In [8]:
# Number of matching CSV files found under root_dir.
print(len(paths))

52


In [9]:
# Process the files one at a time, printing how many samples each file
# contributes and pooling them all into a single list.
data = []
for path in paths:
    samples = extract_emotion(read_table(path))
    print(len(samples))
    data.extend(samples)

45896it [00:00, 458890.42it/s]

read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week1.csv


4790111it [00:15, 302407.98it/s]
100%|██████████| 4633268/4633268 [00:24<00:00, 190433.69it/s]
0it [00:00, ?it/s]

17482
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week10.csv


3882568it [00:11, 325004.26it/s]
100%|██████████| 3752958/3752958 [00:21<00:00, 172780.77it/s]
45870it [00:00, 458643.58it/s]

16008
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week11.csv


4082840it [00:12, 316133.23it/s]
100%|██████████| 3950667/3950667 [00:21<00:00, 186155.02it/s]
0it [00:00, ?it/s]

16201
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week12.csv


5057574it [00:16, 306688.71it/s]
100%|██████████| 4923979/4923979 [00:24<00:00, 197452.35it/s]
48297it [00:00, 482851.89it/s]

17547
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week13.csv


3612673it [00:11, 323174.86it/s]
100%|██████████| 3488840/3488840 [00:18<00:00, 185105.97it/s]
42629it [00:00, 426187.78it/s]

15443
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week14.csv


2883248it [00:08, 327762.39it/s]
100%|██████████| 2780423/2780423 [00:15<00:00, 179102.54it/s]
46168it [00:00, 461513.19it/s]

12315
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week15.csv


3018789it [00:08, 351407.28it/s]
100%|██████████| 2908859/2908859 [00:16<00:00, 177600.93it/s]
50004it [00:00, 499795.01it/s]

13068
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week16.csv


2716455it [00:10, 268418.56it/s]
100%|██████████| 2611911/2611911 [00:16<00:00, 161756.23it/s]
45197it [00:00, 451872.39it/s]

12696
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week17.csv


3636668it [00:10, 350877.16it/s]
100%|██████████| 3515449/3515449 [00:19<00:00, 178298.75it/s]
0it [00:00, ?it/s]

13698
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week18.csv


3626054it [00:12, 295744.24it/s]
100%|██████████| 3507605/3507605 [00:18<00:00, 185197.23it/s]
45144it [00:00, 451338.20it/s]

13555
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week19.csv


4277841it [00:13, 312626.59it/s]
100%|██████████| 4146731/4146731 [00:22<00:00, 186149.71it/s]
37999it [00:00, 379863.57it/s]

14443
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week2.csv


4868594it [00:16, 292069.25it/s]
100%|██████████| 4712455/4712455 [00:26<00:00, 177973.53it/s]
40050it [00:00, 400445.01it/s]

17503
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week20.csv


3865475it [00:12, 298781.69it/s]
100%|██████████| 3742022/3742022 [00:21<00:00, 173780.83it/s]
34116it [00:00, 341116.41it/s]

14459
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week21.csv


4969460it [00:17, 278160.87it/s]
100%|██████████| 4825683/4825683 [00:25<00:00, 185883.54it/s]
50643it [00:00, 506359.25it/s]

15809
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week22.csv


6639070it [00:22, 293133.05it/s]
100%|██████████| 6469177/6469177 [00:33<00:00, 192733.11it/s]
36601it [00:00, 365502.31it/s]

19763
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week23.csv


5067321it [00:16, 305172.07it/s]
100%|██████████| 4925471/4925471 [00:26<00:00, 187893.66it/s]
0it [00:00, ?it/s]

16891
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week24.csv


6491779it [00:18, 344679.64it/s]
100%|██████████| 6318912/6318912 [00:32<00:00, 191955.33it/s]
45118it [00:00, 451125.58it/s]

18792
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week25.csv


5710391it [00:19, 297072.28it/s]
100%|██████████| 5549420/5549420 [00:32<00:00, 171711.66it/s]
37825it [00:00, 375356.19it/s]

18530
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week26.csv


6032573it [00:18, 322824.26it/s]
100%|██████████| 5875379/5875379 [00:33<00:00, 174204.23it/s]
44378it [00:00, 443719.06it/s]

17388
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week27.csv


6319898it [00:23, 272025.46it/s]
100%|██████████| 6156266/6156266 [00:35<00:00, 173493.20it/s]
40152it [00:00, 401462.00it/s]

22312
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week28.csv


5541848it [00:18, 297182.53it/s]
100%|██████████| 5384880/5384880 [00:30<00:00, 179440.83it/s]
39952it [00:00, 399468.95it/s]

16830
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week29.csv


4499735it [00:16, 277923.82it/s]
100%|██████████| 4369676/4369676 [00:26<00:00, 167407.03it/s]
41324it [00:00, 413173.41it/s]

15489
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week3.csv


3892054it [00:12, 303066.48it/s]
100%|██████████| 3759701/3759701 [00:21<00:00, 173269.06it/s]
35513it [00:00, 355088.01it/s]

16202
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week30.csv


4825459it [00:15, 321228.01it/s]
100%|██████████| 4700212/4700212 [00:26<00:00, 175265.29it/s]
42362it [00:00, 423554.76it/s]

15249
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week31.csv


4382999it [00:17, 250541.94it/s]
100%|██████████| 4244847/4244847 [00:23<00:00, 180414.19it/s]
40922it [00:00, 409168.69it/s]

16147
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week32.csv


3774902it [00:10, 368354.72it/s]
100%|██████████| 3663870/3663870 [00:21<00:00, 171112.47it/s]
37209it [00:00, 371940.49it/s]

17634
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week33.csv


5001462it [00:19, 254281.87it/s]
100%|██████████| 4795119/4795119 [00:26<00:00, 177637.65it/s]
42965it [00:00, 429584.86it/s]

19821
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week34.csv


5802357it [00:18, 305453.99it/s]
100%|██████████| 5624962/5624962 [00:31<00:00, 180577.18it/s]
50349it [00:00, 503269.70it/s]

21429
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week35.csv


3961741it [00:13, 303174.94it/s]
100%|██████████| 3835442/3835442 [00:22<00:00, 167143.53it/s]
47488it [00:00, 472854.56it/s]

15135
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week36.csv


3263539it [00:10, 299528.91it/s]
100%|██████████| 3149346/3149346 [00:19<00:00, 164419.99it/s]
47861it [00:00, 478548.85it/s]

14494
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week37.csv


3331673it [00:10, 324645.80it/s]
100%|██████████| 3215910/3215910 [00:21<00:00, 151978.14it/s]
36972it [00:00, 368339.84it/s]

14181
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week38.csv


3296945it [00:10, 318349.83it/s]
100%|██████████| 3188497/3188497 [00:18<00:00, 169392.85it/s]
46609it [00:00, 466006.01it/s]

13240
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week39.csv


3168355it [00:10, 316210.37it/s]
100%|██████████| 3063956/3063956 [00:20<00:00, 151655.12it/s]
42727it [00:00, 427220.50it/s]

14497
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week4.csv


4005728it [00:13, 299169.76it/s]
100%|██████████| 3814910/3814910 [00:21<00:00, 175787.89it/s]
41975it [00:00, 419686.36it/s]

19521
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week40.csv


2541893it [00:08, 283611.46it/s]
100%|██████████| 2450431/2450431 [00:14<00:00, 164953.14it/s]
0it [00:00, ?it/s]

11004
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week41.csv


3355310it [00:10, 307454.39it/s]
100%|██████████| 3233386/3233386 [00:19<00:00, 167008.14it/s]
46526it [00:00, 465136.24it/s]

14723
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week42.csv


3477200it [00:11, 312685.28it/s]
100%|██████████| 3358632/3358632 [00:20<00:00, 165593.08it/s]
45882it [00:00, 458760.28it/s]

14891
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week43.csv


3705655it [00:11, 326754.44it/s]
100%|██████████| 3570565/3570565 [00:21<00:00, 168889.03it/s]
48000it [00:00, 479201.85it/s]

15948
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week44.csv


3790660it [00:11, 326091.33it/s]
100%|██████████| 3671284/3671284 [00:20<00:00, 176983.36it/s]
0it [00:00, ?it/s]

16267
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week45.csv


4249807it [00:12, 327599.45it/s]
100%|██████████| 4115190/4115190 [00:22<00:00, 186022.95it/s]
47205it [00:00, 471988.56it/s]

17953
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week46.csv


4063314it [00:11, 343592.72it/s]
100%|██████████| 3940681/3940681 [00:21<00:00, 181230.74it/s]
51067it [00:00, 510604.75it/s]

18618
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week47.csv


4748598it [00:15, 311986.95it/s]
100%|██████████| 4604138/4604138 [00:25<00:00, 182460.38it/s]
50056it [00:00, 500494.85it/s]

20089
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week48.csv


4960386it [00:17, 289807.09it/s]
100%|██████████| 4808294/4808294 [00:26<00:00, 183720.02it/s]
50062it [00:00, 500251.93it/s]

21180
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week49.csv


5045252it [00:15, 324299.20it/s]
100%|██████████| 4891398/4891398 [00:27<00:00, 181017.61it/s]
43992it [00:00, 439810.32it/s]

21674
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week5.csv


3493571it [00:09, 384853.58it/s]
100%|██████████| 3368836/3368836 [00:17<00:00, 192026.92it/s]
28957it [00:00, 289489.52it/s]

16438
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week50.csv


5423924it [00:17, 308217.25it/s]
100%|██████████| 5258672/5258672 [00:31<00:00, 167933.39it/s]
47139it [00:00, 471255.62it/s]

23461
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week51.csv


5296045it [00:16, 312014.99it/s]
100%|██████████| 5123574/5123574 [00:28<00:00, 177396.27it/s]
48736it [00:00, 487088.72it/s]

24951
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week52.csv


4906327it [00:15, 313205.69it/s]
100%|██████████| 4751433/4751433 [00:26<00:00, 179673.49it/s]
39825it [00:00, 398166.84it/s]

23859
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week6.csv


4037123it [00:09, 404507.64it/s]
100%|██████████| 3904823/3904823 [00:20<00:00, 194808.23it/s]
45483it [00:00, 453948.36it/s]

16248
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week7.csv


4697486it [00:15, 303122.30it/s]
100%|██████████| 4556239/4556239 [00:22<00:00, 199028.64it/s]
50526it [00:00, 505203.87it/s]

17680
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week8.csv


4739982it [00:11, 430643.67it/s]
100%|██████████| 4595408/4595408 [00:22<00:00, 200984.90it/s]
0it [00:00, ?it/s]

16496
read /media/qhduan/Seagate Expansion Drive/DATASETS/weibo/week9.csv


4010777it [00:14, 269891.76it/s]
100%|██████████| 3878452/3878452 [00:21<00:00, 182475.12it/s]

15708





In [10]:
# Total number of samples collected across all files.
print(len(data))

880960


In [11]:
# Persist the collected samples to 'data.pkl' in the working directory.
with open('data.pkl', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)