In [None]:
# This notebook includes several rounds of data cleaning for Weibo data

# Preliminary data cleaning

In [None]:
import pandas as pd

In [None]:
# Load the raw scraped posts.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle('../all_tweets.pkl')

In [None]:
# Show column names, dtypes and non-null counts of the raw frame
df.info()

In [None]:
# Clean up duplicated posts
# Find every user_id that occurs in more than one row

# Number of rows per 'user_id'
user_id_counts = df['user_id'].value_counts()
print(user_id_counts)

# user_ids that occur more than once
posted_more_than_once = user_id_counts.gt(1)
duplicate_user_ids = user_id_counts.loc[posted_more_than_once].index

# number of duplicated user_ids
print('number of duplicated user_ids:', len(duplicate_user_ids))

In [None]:
# All rows that belong to a user_id occurring more than once
from_duplicated_user = df['user_id'].isin(duplicate_user_ids)
df_dup_ids = df.loc[from_duplicated_user]
print(df_dup_ids.info())
# df_dup_ids.to_excel('../tweets_with_same_userid.xlsx')

In [None]:
def longest_content(group):
    """Return the row of `group` whose 'content_long' string is the longest.

    `group` is a DataFrame with a 'content_long' text column (here: all rows
    sharing one user_id). On ties the first row with the maximum length wins.
    """
    content_lengths = group['content_long'].str.len()
    longest_label = content_lengths.idxmax()
    return group.loc[longest_label]

# Keep one post per user_id: the row with the longest 'content_long'
# (same selection rule as longest_content, first row wins on ties).
# Done with groupby(...).idxmax() instead of groupby('user_id').apply(...): apply on a frame that
# still contains the grouping column is deprecated in pandas >= 2.2, and newer pandas excludes the
# grouping column from the result, which would lose 'user_id' that later cells filter on.
posts = df.reset_index(drop=True)  # unique positional labels so .loc picks exactly one row per user
content_lengths = posts['content_long'].str.len()
longest_per_user = content_lengths.groupby(posts['user_id']).idxmax()
df_unique = posts.loc[longest_per_user].reset_index(drop=True)
df_unique.info()
# df_unique.to_excel('../all_tweets_unique_userid.xlsx')

In [None]:
# Rows of the raw frame whose mblogid was not retained in df_unique
retained_ids = df_unique['mblogid']
df_duplicate = df.loc[~df['mblogid'].isin(retained_ids)]
df_duplicate.info()
# df_duplicate.to_excel('../all_tweets_duplicate_userid.xlsx')

In [None]:
# Drop tweets that are likely to be fiction: text mentioning "书名" or "小说"
fiction_keywords = '书名|小说'
fiction = df_unique['content_long'].str.contains(fiction_keywords)
print(fiction.value_counts())
# df_unique[fiction].to_excel('../fiction_tweets.xlsx')

# Everything that did not match a fiction keyword starts the cleaned frame
df_clean = df_unique.loc[~fiction]
print(df_clean.info())

In [None]:
# Drop accounts whose nickname contains a police-related keyword
# (平安 / 公安局 / 公安 / 网警); matching is done on 'user_nickname', not on user_id
police_keywords = '平安|公安局|公安|网警'
police_account = df_clean['user_nickname'].str.contains(police_keywords)
print(police_account.value_counts())
# df_clean[police_account].to_excel('../police_tweets.xlsx')
df_clean = df_clean.loc[~police_account]

In [None]:
# test: how many remaining nicknames contain "律师", and what those accounts posted
nickname_has_lawyer = df_clean['user_nickname'].str.contains('律师')
print(nickname_has_lawyer.value_counts())
df_clean.loc[nickname_has_lawyer, 'content_long']

In [None]:
# Accounts identified by manually reading tweets and judging them irrelevant.
# The entries are compared against df_clean['user_id'] when rows are dropped, so they are
# user ids stored as strings.
# Every element needs its own trailing comma: adjacent string literals are concatenated
# silently, which previously merged '1647486362' and '1677318422' into one bogus id.
id_drop_all = [
    '1296492494', '1081663190', '1394657835', '1564834725', '1606667657', '1644467702', '1647486362',
    '1677318422', '1686117203', '1690286072', '1722262045', '1751193582', '1751845874', '1752221123',
    '1770600902', '1772857380', '1784038920', '1804994885', '1811670545', '1821383387', '1844741851',
    '1871994611', '1877094373', '1893162821', '1899956213', '1912661142', '1923453581', '1934457620',
    '1952359211', '1974576991', '1974890273', '2035996144', '2053886377', '2059048010', '2099262421',
    '2132599977', '2150642134', '2165230551', '2257231834', '2299211261', '2309793804', '2310663307',
    '2337853855', '2338945183', '2339380177', '2365322025', '2499841932', '2523245097', '2530370612',
    '2612002252', '2633812090', '2679119973', '2683684161', '2702325155', '2716784935', '2725689431',
    '2786481185', '2839442860', '2885668544', '3442980450', '3504031483', '3525333977', '3043598067',
    '3084859935', '3205272115', '3215052832',
    '3627702155',
]

In [None]:
# Ids recorded as ones to keep during the manual read-through (stored as strings).
# NOTE(review): this list is not referenced by any cell visible in this notebook.
id_keep_all = [
    '1222221682', '1286131412', '1609734732', '1642385340', '1717833412', '1728892794',
    '1738111281', '1791447807', '1845864154', '1856446532', '1887344341', '1903046517',
    '1926079932', '1961718870', '1974567457', '1977460817', '1989527362', '2106671735',
    '2377492125', '2729757644', '3483877775', '3151530492', '7841362555',
]

In [None]:
# Remove every row whose user_id is listed in id_drop_all
from_dropped_account = df_clean['user_id'].isin(id_drop_all)
df_clean = df_clean.loc[~from_dropped_account]

In [None]:
# Display the raw post text column of the remaining rows
df_clean['content_long']

In [None]:
# Noise removed by `pattern`: "//@XXX:" repost prefixes, "http://t.cn/XXX" short links,
# newlines and "[XX]" bracketed tokens. "@XXX " mentions are handled by clean_mention;
# "#XX#" hashtags are not matched by this pattern and stay in the text.
import re
pattern = r'//@\S+:|http://t\.cn/\S+|\n|\[[^\]]+\]'

def clean_noises(text):
    """Return `text` with every match of the module-level `pattern` removed."""
    noise_re = re.compile(pattern)
    return noise_re.sub('', text)
# df_clean was produced by boolean filtering, so writing a new column into it directly can raise
# SettingWithCopyWarning; assign() returns a fresh frame with the added 'content_clean' column.
df_clean = df_clean.assign(content_clean=df_clean['content_long'].apply(clean_noises))

def clean_mention(text):
    """Remove every "@name " mention from `text`.

    A mention is '@' followed by non-whitespace characters and one trailing
    space; an '@name' with no space after it is left untouched.
    """
    mention_re = re.compile(r'@\S+ ')
    return mention_re.sub('', text)
# Strip "@name " mentions from the already de-noised text. assign() builds a new frame rather than
# writing into df_clean in place, which avoids SettingWithCopyWarning on a filtered frame.
df_clean = df_clean.assign(content_clean=df_clean['content_clean'].apply(clean_mention))

In [None]:
# Keywords that mark a tweet as irrelevant; joined into one regex alternation that is
# matched against df_clean['content_clean'] when irrelevant tweets are dropped.
irrelevant_keywords = '|'.join([
    '哈哈', '嘻嘻', '嘿嘿', '粉丝', '影视', '韩剧', '黑粉', '剧组', '艺人', '网络黑社会', '说一个真事', '三建',
    '雇佣军', '雇佣兵', '俄罗斯', '日本', '韩国', '南韩', '美国', '香港', '印度', '墨西哥', '泰国', '美國', '荷兰',
    '狂飙', '渣男', '扫黑除恶专项斗争', '扫黑除恶斗争', '电影', '主演', 'N/A', '指导意见', '皇后', '失眠',
])

In [None]:
# test: inspect which cleaned tweets contain a single candidate keyword
# Lift the display limits so every matching row and its full text are shown
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

test = '三建'
contains_test_keyword = df_clean['content_clean'].str.contains(test)
print(contains_test_keyword.value_counts())
df_clean.loc[contains_test_keyword, ['content_clean', 'user_id']]

In [None]:
# Drop tweets whose cleaned text matches any of the irrelevant keywords
has_irrelevant_keyword = df_clean['content_clean'].str.contains(irrelevant_keywords)
df_clean = df_clean.loc[~has_irrelevant_keyword]
print(df_clean.info())

In [None]:
# Character count of each cleaned tweet, followed by its summary statistics
df_clean['length'] = df_clean['content_clean'].str.len()
df_clean['length'].describe()

In [None]:
# test
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Principle behind the cutoff: drop as much noise as possible, accepting that a few useful tweets may be lost.
cutoff = 35
shorter_than_cutoff = df_clean['length'].lt(cutoff)
short_df = df_clean.loc[shorter_than_cutoff]
print(len(short_df))
print(short_df['content_clean'])


In [None]:
# Remove the tweets collected in short_df (matched on mblogid)
is_short_post = df_clean['mblogid'].isin(short_df['mblogid'])
df_clean = df_clean.loc[~is_short_post]
df_clean.info()

In [None]:
# Export the rule-cleaned tweets to an Excel file for review
df_clean.to_excel('../for_review.xlsx')

# Integrate GPT-cleaned results

In [None]:
import json
from pathlib import Path

# Load the GPT-produced labels. The encoding is given explicitly because the platform default
# (e.g. a legacy code page on Windows) can fail or mis-decode a JSON file containing Chinese text.
# NOTE(review): keys are presumably mblogid values (the next cell renames the index to 'mblogid') — confirm.
fpath = Path('gpt_cleaned_data.json')
result = json.loads(fpath.read_text(encoding='utf-8'))
result

In [None]:
# One row per key of `result`; the key becomes the 'mblogid' column, the values fill 'reason' and 'relevance'
gpt_df = (
    pd.DataFrame.from_dict(result, orient='index', columns=['reason', 'relevance'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)
gpt_df.info()
gpt_df['relevance'].value_counts()

In [None]:
# Left-join the GPT labels onto the cleaned tweets by mblogid (every row of df_clean is kept)
df_gpt_cleaned = df_clean.merge(gpt_df, how='left', on='mblogid')
df_gpt_cleaned.info()

In [None]:
# Build a frame of hand-coded labels: one 'mblogid' per row with its label in 'relevance_y'.
# NOTE(review): `id_list` and `result_list` are not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel — define or load them before this cell.
df_hand_result = pd.DataFrame({
    'mblogid': id_list,
    'relevance_y': result_list
})
df_hand_result

In [None]:
# Attach the hand-coded labels ('relevance_y') to the GPT-labelled frame; left join on mblogid
df_clean_final = df_gpt_cleaned.merge(df_hand_result, how='left', on='mblogid')
df_clean_final.info()

In [None]:
# Fill missing "relevance" values with the hand-coded "relevance_y", then drop the helper column.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series, which warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
df_clean_final['relevance'] = df_clean_final['relevance'].fillna(df_clean_final['relevance_y'])
df_clean_final = df_clean_final.drop(columns='relevance_y')
df_clean_final.info()

In [None]:
# Persist the labelled frame; it is read back from this path in the "Clean up repetitive data" section
df_clean_final.to_pickle('../all_tweets_cleaned.pkl')

In [None]:
# Distribution of the combined relevance labels
df_clean_final['relevance'].value_counts()

In [None]:
# Display the 'created_at' column of the labelled frame
df_clean_final['created_at']

# Clean up repetitive data

In [None]:
# Reload the labelled tweets saved above. Note that this rebinds `df_clean`, which earlier cells
# used for the rule-cleaned frame.
df_clean = pd.read_pickle('../all_tweets_cleaned.pkl')

In [None]:
# Column names, dtypes and non-null counts of the reloaded frame
df_clean.info()

In [None]:
# Keep only the tweets labelled as relevant (relevance equal to 1)
df_rel = df_clean.loc[df_clean['relevance'].eq(1)]

In [None]:
! pip install levenshtein

In [None]:
# Three sample posts for trying out the Levenshtein measures: text2 is text1 with a hashtag prefix,
# text3 is a different post.
# NOTE(review): these literals appear to be real posts that name individuals — consider redacting before sharing.
text1 = "久拖不解！村霸书记吴惠芳雇佣黑恶势力长期监视跟踪！今8月30日上午我去做核酸，这个蒙面人骑的电瓶车，还有步行的，贺青松开汽车的，他把我拍的视频用高科技对着我的手机删除，这是黑社会在犯法犯罪！这视频变短了很多，为什么敢做不敢当呀？真不要脸！"
text2 = "#安徽车超冤案# 久拖不解！村霸书记吴惠芳雇佣黑恶势力长期监视跟踪！今8月30日上午我去做核酸，这个蒙面人骑的电瓶车，还有步行的，贺青松开汽车的，他把我拍的视频用高科技对着我的手机删除，这是黑社会在犯法犯罪！这视频变短了很多，为什么敢做不敢当呀？真不要脸！"
text3 = "扬州是宜居城市养老胜地但不是让你们这些垃圾官员来养老享福的十几轮核酸了还有新增天天有人求助就医、物资等各种问题政府人员小区可以随意调整风险等级还雇佣黑社会当志愿者打人明天不知道又有什么新惊喜等着扬州市民呢"

from Levenshtein import distance, ratio

# Edit distance of text1 against text2, then against text3
for other_text in (text2, text3):
    print(distance(text1, other_text))

# Similarity ratio for the same two pairs
for other_text in (text2, text3):
    print(ratio(text1, other_text))

In [None]:
from tqdm import tqdm
from Levenshtein import distance
from Levenshtein import ratio  # used below; previously only in scope via an earlier demo cell
import numpy as np

def levenshtein_distance_matrix(strings):
    """Return an n x n matrix of pairwise `Levenshtein.ratio` scores for `strings`.

    Despite the name, the entries are similarity ratios, not edit distances.
    Only the strict upper triangle (j > i) is filled; the diagonal and the
    lower triangle stay 0. The nested loop makes n*(n-1)/2 calls to `ratio`.

    Parameters
    ----------
    strings : sequence of str
        Texts to compare; must support len() and integer indexing.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n, n).
    """
    n = len(strings)
    # Initialize an n x n matrix with zeros
    matrix = np.zeros((n, n), dtype=np.float64)

    for i in tqdm(range(n)):
        for j in range(i + 1, n):
            matrix[i, j] = ratio(strings[i], strings[j])

    return matrix

# Cleaned text of the relevant tweets as a plain Python list (position i matches df_rel.iloc[i])
column_list = df_rel['content_clean'].to_list()

# Pairwise similarity scores for every pair of texts
distance_matrix = levenshtein_distance_matrix(column_list)

In [None]:
# Number of texts that were compared
len(column_list)

In [None]:
import matplotlib.pyplot as plt

# Keep only the pairwise scores above 0.2 (boolean indexing returns them as a 1-D array in row-major order)
scores = distance_matrix[distance_matrix > 0.2]

# plot histogram of scores
plt.hist(scores)


In [None]:
# Number of pairs whose score is above 0.2
len(scores)

In [None]:
# Find the (row, col) positions whose score is greater than 0.4; these pairs are treated as repeats
rows, cols = np.where(distance_matrix > 0.4)

# Start with every position kept; for each flagged pair still fully kept, drop the shorter text
# (the second one when the lengths are equal)
kept = set(range(len(df_rel)))
for first, second in zip(rows, cols):
    if first not in kept or second not in kept:
        continue
    first_is_shorter = len(column_list[first]) < len(column_list[second])
    kept.remove(first if first_is_shorter else second)

df_kept = df_rel.iloc[sorted(kept)]

In [None]:
# Summary of the frame left after removing repeated tweets
df_kept.info()

In [None]:
# Total number of entries in the pairwise matrix
distance_matrix.size

In [None]:
# Manual arithmetic to compare with distance_matrix.size; 1737 is presumably len(column_list) — TODO confirm
1737*1737

In [None]:
# Print two texts for a manual side-by-side read.
# NOTE(review): positions 42 and 524 are hard-coded; presumably a pair flagged as similar — verify.
print(column_list[42])
print(column_list[524])

In [None]:
# Dump every flagged pair (positions, score and both texts) to a text file for manual review.
# UTF-8 is set explicitly: the texts are Chinese, and the platform default encoding may fail on them.
with open("output.txt", "w", encoding="utf-8") as f:
    for row, col in zip(rows, cols):
        f.write(f'Index: ({row}, {col}), Value: {distance_matrix[row, col]}\n')
        f.write(f'{column_list[row]}\n')
        f.write(f'{column_list[col]}\n')

In [None]:
# Save the de-duplicated relevant tweets as the final output of this notebook
df_kept.to_pickle('../all_tweets_cleaned_final.pkl')