In [None]:
# This notebook cleans up data from WQW, RFA, and CLB

In [None]:
import pandas as pd
import numpy as np

# WQW

In [None]:
# Load the WQW posts from the parquet file into `df`
df = pd.read_parquet("../WQW/all_posts_wqw.parquet")

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the loaded frame
df.info()

In [None]:
# Select the posts whose title contains any of the '|'-separated markers below.
# Later cells reduce these posts to the paragraphs that mention a keyword.
keywords = '维权网|良心犯|中国维权动态'
# na=False: a missing title counts as "no match" instead of producing a NaN mask
# that cannot be used for boolean indexing.
# .copy(): later cells add a column to df_filter, so it must own its data.
df_filter = df[df['title'].str.contains(keywords, na=False)].copy()

In [None]:
# Lift pandas' row and column-width display limits, then list the selected titles
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
df_filter['title']

In [None]:
# Print a summary of the title-filtered posts
df_filter.info()

In [None]:
# Get paragraphs that contain keywords
import re

# Keywords: a paragraph is kept when it contains at least one of them.
# NOTE(review): the RFA section of this notebook uses '社会闲散人员' where this list
# has '闲散人员' — confirm the difference is intended.
keywords = ['黑社会', '打手', '小混混', '闲散人员', '地痞流氓']

def get_paragraphs(text, keywords, sep='\n'):
    """Return the paragraphs of ``text`` that mention at least one keyword.

    Parameters
    ----------
    text : str
        Text to split. Any non-string value (e.g. ``None`` or ``NaN`` from a
        missing cell) yields an empty list instead of raising.
    keywords : iterable of str
        Substrings to look for; a paragraph is kept when it contains any of them.
    sep : str, default '\\n'
        Separator used to split ``text`` into paragraphs.

    Returns
    -------
    list of str
        The matching paragraphs, in their original order.
    """
    # Missing content has no paragraphs to offer
    if not isinstance(text, str):
        return []
    return [
        para
        for para in text.split(sep)
        if any(keyword in para for keyword in keywords)
    ]

# Extract the keyword-bearing paragraphs of every selected post into 'result'
# and keep only the identifying columns.
# `assign` builds a new frame rather than writing into the filtered slice, which
# avoids pandas' SettingWithCopyWarning.
# Note: 'content' is dropped here, so re-running this cell requires re-running
# the title-filter cell first.
df_filter = df_filter.assign(
    result=df_filter['content'].apply(lambda text: get_paragraphs(text, keywords))
)[['blogid', 'posted_date', 'result']]

In [None]:
# Expand each list in 'result' so that every list element gets its own row
all_exploded_df = df_filter.explode('result')

In [None]:
# Keep only the first row for each distinct value in the 'result' column
unique_df = all_exploded_df.drop_duplicates(subset=['result'])
unique_df.info()

In [None]:
# drop repetitive content/names using Levenshtein distance
from tqdm import tqdm
from Levenshtein import distance
from Levenshtein import ratio

def levenshtein_distance_matrix(strings):
    """Build a square matrix of pairwise ``ratio`` scores for ``strings``.

    Only the strict upper triangle (row < column) is filled; the diagonal and
    the lower triangle stay at zero. Despite the function name, the stored
    values come from ``ratio``, not ``distance``.
    """
    size = len(strings)
    similarity = np.zeros((size, size), dtype=np.float64)

    # tqdm reports progress over the outer loop
    for row in tqdm(range(size)):
        left = strings[row]
        for col in range(row + 1, size):
            similarity[row, col] = ratio(left, strings[col])

    return similarity

# Get the column as a list (positions match the row order of unique_df)
column_list = unique_df['result'].tolist()

# Pairwise scores; only the strict upper triangle of the matrix is filled
distance_matrix = levenshtein_distance_matrix(column_list)

# Take only the strict upper triangle. Flattening the whole matrix would also
# feed the untouched zeros of the diagonal and lower triangle into the
# histogram and inflate the lowest bin.
scores = distance_matrix[np.triu_indices_from(distance_matrix, k=1)]

# plot histogram of scores
import matplotlib.pyplot as plt
plt.hist(scores)
plt.xlabel('ratio')
plt.ylabel('number of pairs');

In [None]:
# Number of matrix entries (i.e. pairs) whose ratio is strictly above 0.8
count = np.sum(distance_matrix > 0.8)
count

In [None]:
# Row/column positions of the pairs whose ratio is strictly above 0.8
rows, cols = np.where(distance_matrix > 0.8)

# Start from every position. For each similar pair that is still fully present,
# drop the shorter text; on equal length the second member of the pair is dropped.
kept = set(range(len(column_list)))
for i, j in zip(rows, cols):
    if not (i in kept and j in kept):
        continue
    shorter = i if len(column_list[i]) < len(column_list[j]) else j
    kept.remove(shorter)

# Positions in column_list match the row order of unique_df
df_kept = unique_df.iloc[sorted(kept)]

In [None]:
# Print a summary of the rows that survived near-duplicate removal
df_kept.info()

In [None]:
# Remaining posts: rows of df whose blogid does not appear in df_filter
mask_df = df['blogid'].isin(df_filter['blogid'])
df_rest = df[~mask_df]

In [None]:
# Print a summary of the posts not selected by the title filter
df_rest.info()

In [None]:
# Print a summary of df_filter for comparison with df_rest
df_filter.info()

In [None]:
# Lift pandas' row and column-width display limits, then list the remaining posts
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
df_rest[['title', 'blogid']]

In [None]:
# Rename the paragraph column to 'content' so it lines up with df_rest's
# 'content' column, then stack the two frames.
# Reassigning (instead of inplace=True) avoids mutating an iloc-derived frame
# and makes the cell safe to re-run: a second rename is a no-op.
df_kept = df_kept.rename(columns={"result": "content"})
df_all = pd.concat([df_rest, df_kept])
df_all.info()

In [None]:
# Write the combined WQW frame to Excel (the index is written too, as pandas does by default)
df_all.to_excel('../WQW/cleaned_data_wqw.xlsx')

# RFA

In [None]:
# Load the RFA results from Excel into `rfa_df`
rfa_df = pd.read_excel("../rfa/google_results_rfa.xlsx")

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the RFA frame
rfa_df.info()

In [None]:
# Lift pandas' column-width and row display limits, then list every RFA article
for option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option, None)
rfa_df[['blogid', 'title']]

In [None]:
# Drop "pinglun" articles: rows whose blogid contains the URL fragment below.
keyword = 'www.rfa.org/mandarin/pinglun'
# regex=False: match the fragment literally ('.' would otherwise match any character).
# na=False: a missing blogid counts as "no match", so the mask stays boolean and
# can be negated with ~.
df_drop = rfa_df['blogid'].str.contains(keyword, regex=False, na=False)
rfa_df = rfa_df[~df_drop]

In [None]:
# Select the RFA rows whose title contains either of the '|'-separated markers.
keyword = '简要新闻|选读来信'
# na=False: a missing title counts as "no match" instead of producing a NaN mask.
# .copy(): a later cell adds a column to rfa_df_filter, so it must own its data.
rfa_df_filter = rfa_df[rfa_df['title'].str.contains(keyword, na=False)].copy()

In [None]:
# Display the 'content' column of the title-filtered RFA rows
rfa_df_filter['content']

In [None]:
# Get paragraphs that contain keywords
import re

# Keywords: a paragraph is kept when it contains at least one of them.
# NOTE(review): the WQW section of this notebook uses '闲散人员' where this list
# has '社会闲散人员' — confirm the difference is intended.
keywords = ['黑社会', '打手', '小混混', '社会闲散人员', '地痞流氓']

def get_paragraphs(text, keywords, sep='\n\n\n\n\n'):
    """Return the paragraphs of ``text`` that mention at least one keyword.

    This definition replaces the earlier ``get_paragraphs`` in the notebook; it
    differs in the default separator (five consecutive newlines).

    Parameters
    ----------
    text : str
        Text to split. Any non-string value (e.g. ``None`` or ``NaN`` from a
        missing cell) yields an empty list instead of raising.
    keywords : iterable of str
        Substrings to look for; a paragraph is kept when it contains any of them.
    sep : str, default five newlines
        Separator used to split ``text`` into paragraphs.

    Returns
    -------
    list of str
        The matching paragraphs, in their original order.
    """
    # Missing content has no paragraphs to offer
    if not isinstance(text, str):
        return []
    return [
        para
        for para in text.split(sep)
        if any(keyword in para for keyword in keywords)
    ]

# Extract the keyword-bearing paragraphs of every selected RFA row into 'result'
# and keep only the identifying columns.
# `assign` builds a new frame rather than writing into the filtered slice, which
# avoids pandas' SettingWithCopyWarning.
# Note: 'content' is dropped here, so re-running this cell requires re-running
# the title-filter cell first.
rfa_df_filter = rfa_df_filter.assign(
    result=rfa_df_filter['content'].apply(lambda text: get_paragraphs(text, keywords))
)[['blogid', 'posted_date', 'result']]

In [None]:
# Expand each list in 'result' so that every list element gets its own row, then display
rfa_exploded_df = rfa_df_filter.explode('result')
rfa_exploded_df

In [None]:
# Keep only the first row for each distinct value in the 'result' column, then display
rfa_unique_df = rfa_exploded_df.drop_duplicates(subset=['result'])
rfa_unique_df

In [None]:
# Keep the rows at positions 1, 3, 4, 5 and 7 of rfa_unique_df.
# NOTE(review): these positions are hard-coded — presumably chosen by reading the
# output of the previous cell; they select different rows if the input data changes.
rfa_selected = rfa_unique_df.iloc[[1, 3, 4, 5, 7]]

In [None]:
# zhuanlan articles: rows whose blogid contains 'zhuanlan'
# NOTE(review): this cell originally filtered `rfa_to_clean`, a name that is not
# defined anywhere in the visible notebook (NameError on a fresh kernel).
# `rfa_df` is used as the closest frame defined at this point — confirm it is
# the intended source.
keyword = 'zhuanlan'
rfa_review = rfa_df[rfa_df['blogid'].str.contains(keyword, na=False)]

In [None]:
# Lift pandas' column-width and row display limits, then list the zhuanlan titles
for option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option, None)
rfa_review['title']

In [None]:
# Remaining RFA rows: rows of rfa_df whose blogid does not appear in rfa_df_filter
mask_df = rfa_df['blogid'].isin(rfa_df_filter['blogid'])
rfa_df_rest = rfa_df[~mask_df]

In [None]:
# Print a summary of the RFA rows not selected by the title filter
rfa_df_rest.info()

In [None]:
# Lift pandas' column-width and row display limits, then list the remaining RFA rows
for option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option, None)
rfa_df_rest[['blogid', 'title']]

In [None]:
# Rename the paragraph column of rfa_selected to 'content', then stack it
# under rfa_df_rest.
# Reassigning (instead of inplace=True) avoids mutating an iloc-derived frame
# and makes the cell safe to re-run: a second rename is a no-op.
rfa_selected = rfa_selected.rename(columns={"result": "content"})
rfa_df_all = pd.concat([rfa_df_rest, rfa_selected])
rfa_df_all.info()

In [None]:
# Write the combined RFA frame to Excel (the index is written too, as pandas does by default)
rfa_df_all.to_excel('../rfa/cleaned_data_with_google_results_rfa.xlsx')

# CLB

In [None]:
# Load all CLB posts, and the part-1 file (the relevance filter further down reads its 'relevance' column)
clb_all = pd.read_excel("../CLB/all_posts_clb.xlsx")
clb_df = pd.read_excel("../CLB/all_posts_clb_labeled_part1.xlsx")

In [None]:
from pathlib import Path

# Posts of clb_all whose blogid is not already present in the part-1 file
mask = clb_all['blogid'].isin(clb_df['blogid'])
clb_to_clean = clb_all[~mask]

# Export them as part 2 only if the file does not exist yet.
# NOTE(review): later cells read this same file back and filter on a 'relevance'
# column, which suggests the file is labeled by hand after export. Writing
# unconditionally would overwrite those labels on every re-run — confirm.
part2_path = Path("../CLB/all_posts_clb_labeled_part2.xlsx")
if part2_path.exists():
    print(f"{part2_path} already exists; not overwriting")
else:
    clb_to_clean.to_excel(part2_path)

In [None]:
# Read the part-2 file back from disk.
# NOTE(review): the next cell filters on a 'relevance' column, so this file is
# presumably labeled outside the notebook after being exported — confirm.
clb_df2 = pd.read_excel("../CLB/all_posts_clb_labeled_part2.xlsx")

In [None]:
# Keep, from each part, only the rows whose 'relevance' equals 1
clb_df1_rel = clb_df.loc[clb_df['relevance'].eq(1)]
clb_df2_rel = clb_df2.loc[clb_df2['relevance'].eq(1)]

In [None]:
# Stack the relevance-1 rows of both parts and print a summary
rel_df = pd.concat([clb_df1_rel, clb_df2_rel])
rel_df.info()

In [None]:
# Drop the 'Unnamed: 0' and 'relevance' columns.
# Note: rel_df is overwritten, so running this cell twice raises KeyError.
rel_df = rel_df.drop(['Unnamed: 0', 'relevance'], axis = 1)

In [None]:
# Write the relevant CLB rows to Excel (the index is written too, as pandas does by default)
rel_df.to_excel("../CLB/cleaned_data_clb.xlsx")

# Combine three datasets

In [None]:
import pandas as pd
# Reload the three cleaned files written by the sections above.
# Note: `rfa_df` and `clb_df` are rebound here and no longer refer to the
# frames loaded in the RFA and CLB sections.
wqw_df = pd.read_excel('../WQW/cleaned_data_wqw.xlsx')
rfa_df = pd.read_excel('../rfa/cleaned_data_with_google_results_rfa.xlsx')
clb_df = pd.read_excel("../CLB/cleaned_data_clb.xlsx")

In [None]:
# Tag every row with the dataset it was loaded from
for frame, label in ((wqw_df, 'wqw'), (rfa_df, 'rfa'), (clb_df, 'clb')):
    frame['source'] = label

In [None]:
# Stack the three datasets; each frame keeps its own index values, so the combined index may repeat
all_df = pd.concat([wqw_df, rfa_df, clb_df])

In [None]:
# Drop the 'Unnamed: 0' column — presumably the index column written by the earlier to_excel calls.
# Note: all_df is overwritten, so running this cell twice raises KeyError.
all_df = all_df.drop(['Unnamed: 0'], axis = 1)

In [None]:
# Print a summary of the combined frame
all_df.info()

In [None]:
# Display the rows whose 'content' is missing
all_df[all_df['content'].isna()] 

In [None]:
# Remove the rows whose 'content' is missing
all_df = all_df.dropna(subset = ['content'])

In [None]:
# Write the combined multi-source dataset to Excel (the index is written too, as pandas does by default)
all_df.to_excel('../supplementary_data/multiple_source_data.xlsx')