In [None]:
# This notebook integrates the GPT-4o results into the Weibo dataset and compares the GPT-4o results with some hand-labeled results

In [None]:
import pandas as pd
# Load the cleaned tweet table from its pickle and print a summary of its columns.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
df = pd.read_pickle('../all_tweets_cleaned.pkl')
df.info()

In [None]:
# Keep only the rows whose "relevance" flag equals 1, then summarize the subset.
df_rel = df.loc[df['relevance'].eq(1)]
df_rel.info()

# 1st layer of classification

In [None]:
# IDs of the hand-coded tweets, pasted one per line.
hand_coded_id = """
MdvkbCJlY
LxAxIg7uA
Dr7MuDuXE
Hq42B2za8
KkgX5vcXZ
A6ZFrxjdU
EdwYXCJJM
xn31RAtLB
GEwLyFq2R
GbFbhgoTw
Gvuc5gblR
LxBjPzsdN
GEtA5imLb
Ewj5tiyUx
LxyIECqzH
JAvEEyqX2
GyzIV0kgR
Ob1wQ6BVJ
y7j2kEblj
KCvKzbrnj
Kl2tTETrV
yxgqP28gG
I3Betj6Pq
yw2KixPnb
zpbxZFCEl
KtZU4AyIR
KnEtgz81L
NaNEk5kHX
LxvHH3Qwe
M2rq2qPAH
JgXGMFK0G"""

# Turn the pasted block into a list with one whitespace-trimmed ID per entry.
id_list = list(map(str.strip, hand_coded_id.strip().splitlines()))

# Show the parsed IDs for a quick visual check.
print(id_list)

In [None]:
# Hand-coded values of the "dispute_area" variable, one per line.
# The code 99 marks a tweet that is irrelevant.
result_hand = """
2
-1
-1
6
2
2
4
1
1
-1
2
99
1
1
99
1
1
1
2
99
6
3
6
4
2
-1
99
7
1
4
2"""

# Parse every line straight into an integer (int() tolerates surrounding whitespace).
result_list = [int(row) for row in result_hand.strip().splitlines()]

# Show the parsed labels for a quick visual check.
print(result_list)

In [None]:
# Boolean mask: True for relevant tweets whose mblogid is in the hand-coded ID list.
coded_data = df_rel['mblogid'].isin(id_list)
# Everything not hand-coded is written out as the input for GPT classification.
df_rel.loc[~coded_data].to_pickle('../for_gpt_classification.pkl')

In [None]:
# Summarize the relevant tweets that were not hand-coded (the rows exported for GPT).
df_rel[~coded_data].info()

In [None]:
import json
from pathlib import Path
# Parse the first-layer classification JSON file into a Python object and display it.
fpath = Path('classification_1st_layer.json')
with fpath.open() as handle:
    result = json.load(handle)
result

In [None]:
# Build a table from the parsed JSON: its keys form the index, which is then
# moved into a regular column named 'mblogid'.
gpt_df = (
    pd.DataFrame.from_dict(result, orient='index', columns=['reason','dispute_area'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)
gpt_df.info()
# Frequency of each dispute_area label in the GPT output.
gpt_df['dispute_area'].value_counts()

In [None]:
# Left-join the GPT labels onto the relevant tweets by tweet id (mblogid);
# tweets without a GPT row keep missing values in the GPT columns.
df_gpt_classified = df_rel.merge(gpt_df, how='left', on='mblogid')
df_gpt_classified.info()

In [None]:
# Pair each hand-coded tweet id with its hand-coded dispute area (matched by position).
# The label column is named 'dispute_area_y' so it does not clash with the GPT column.
df_hand_result = pd.DataFrame(dict(mblogid=id_list, dispute_area_y=result_list))
df_hand_result

In [None]:
# Attach the hand-coded labels (column 'dispute_area_y') to the GPT-classified table by mblogid.
df_classified = df_gpt_classified.merge(df_hand_result, on='mblogid', how='left')
df_classified.info()

In [None]:
# Fill missing "dispute_area" values with the hand-coded "dispute_area_y" values;
# rows that already have a value keep it.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form mutates an intermediate object, is not
# guaranteed to update df_classified, and does nothing under pandas Copy-on-Write.
df_classified['dispute_area'] = df_classified['dispute_area'].fillna(df_classified['dispute_area_y'])
# The helper column is no longer needed once its values have been folded in.
df_classified = df_classified.drop('dispute_area_y', axis=1)
df_classified.info()

In [None]:
# Persist the table with the combined first-layer "dispute_area" labels.
df_classified.to_pickle('../all_tweets_cleaned_classified_1st_layer.pkl')

In [None]:
# List the distinct dispute_area labels (ordered by frequency) to inspect their values and types.
df_classified['dispute_area'].value_counts().index

In [None]:
# Convert the string label '3' to the number 3.0 so that numeric comparisons
# (e.g. == 3) also select those rows.
# Assigned back instead of replace(inplace=True) on the selected column, which
# is not guaranteed to update df_classified (no effect under pandas Copy-on-Write).
# NOTE(review): '../all_tweets_cleaned_classified_1st_layer.pkl' is written before
# this normalization runs, so that file still holds the string form — confirm
# whether it should be re-saved.
df_classified['dispute_area'] = df_classified['dispute_area'].replace('3', 3.0)
df_classified['dispute_area'].value_counts().sort_index(ascending=True)

In [None]:
# Remove pandas' limits on displayed rows and column width so the text is not truncated.
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
# Print the cleaned text of every tweet labelled dispute_area == 1.
print(df_classified.loc[df_classified['dispute_area'] == 1, 'content_clean'])

# 2nd layer of classification

In [None]:
# Subset of tweets whose first-layer label is dispute_area == 1.
df_class1 = df_classified.loc[df_classified['dispute_area'].eq(1)]

In [None]:
# Export the dispute_area == 1 subset to an Excel workbook.
df_class1.to_excel('../all_tweets_class1.xlsx')

In [None]:
# IDs of the tweets hand-coded for the second layer, pasted one per line.
hand_coded_id_2 = """
xn31RAtLB
GEwLyFq2R
GEtA5imLb
Ewj5tiyUx
JAvEEyqX2
GyzIV0kgR
Ob1wQ6BVJ
LxvHH3Qwe
EaoMKn21r
KwUtTjffK
HB9lwy4Ch
FcGGtc0pe
Ibsw98vhs
zD55Doll1"""
# Turn the pasted block into a list with one whitespace-trimmed ID per entry.
id_list_2 = list(map(str.strip, hand_coded_id_2.strip().splitlines()))

# Show the parsed IDs for a quick visual check.
print(id_list_2)

In [None]:
# Hand-coded integer labels, one per line; they are later paired by position
# with the second hand-coded ID list as the 'specific_issue_y' column.
issues_hand = """1
1
1
1
2
3
1
1
2
2
1
4
2
1"""

# Parse every line straight into an integer (int() tolerates surrounding whitespace).
issues_list = [int(row) for row in issues_hand.strip().splitlines()]

# Show the parsed labels for a quick visual check.
print(issues_list)

In [None]:
# Hand-coded integer labels, one per line; they are later paired by position
# with the second hand-coded ID list as the 'perpetrator_y' column.
perpetrator_hand = """4
-1
2
1
3
2
1
2
3
2
4
5
2
1"""

# Parse every line straight into an integer (int() tolerates surrounding whitespace).
perp_list = [int(row) for row in perpetrator_hand.strip().splitlines()]

# Show the parsed labels for a quick visual check.
print(perp_list)

In [None]:
# Boolean mask: True for class-1 tweets whose mblogid is in the second hand-coded ID list.
coded_data_2 = df_class1['mblogid'].isin(id_list_2)
# Write out the class-1 tweets that were not hand-coded, then summarize that subset.
df_class1.loc[~coded_data_2].to_pickle('../for_gpt_class1.pkl')
df_class1.loc[~coded_data_2].info()

In [None]:
import json
from pathlib import Path
# Load the second-layer GPT results for dispute area 1.
# presumably a JSON object keyed by mblogid with [reason, specific_issue, perpetrator] values — confirm against the file
fpath = Path('class_1_issues.json')
result = json.loads(fpath.read_text())

# JSON keys become the index, which is then moved into an 'mblogid' column.
gpt_class1 = (
    pd.DataFrame.from_dict(result, orient='index', columns=['reason','specific_issue', 'perpetrator'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)

# Left-join the GPT labels onto the class-1 tweets by tweet id.
df_gpt_class_1 = pd.merge(df_class1, gpt_class1, how='left', on='mblogid')

# Hand-coded labels, paired with their tweet ids by position. The '_y' suffix
# keeps them apart from the GPT columns of the same base name.
df_hand_class1 = pd.DataFrame({
    'mblogid': id_list_2,
    'specific_issue_y': issues_list,
    'perpetrator_y': perp_list
})

df_gpt_class_1 = pd.merge(df_gpt_class_1, df_hand_class1, on='mblogid', how='left')

# Fill missing "specific_issue" / "perpetrator" values with the hand-coded ones.
# Results are assigned back rather than using fillna(inplace=True) on a selected
# column, which is not guaranteed to update the frame (no effect under pandas
# Copy-on-Write).
for column in ('specific_issue', 'perpetrator'):
    df_gpt_class_1[column] = df_gpt_class_1[column].fillna(df_gpt_class_1[f'{column}_y'])

# The helper columns are no longer needed once their values have been folded in.
df_gpt_class_1 = df_gpt_class_1.drop(columns=['specific_issue_y', 'perpetrator_y'])

df_gpt_class_1.info()

In [None]:
# Frequency of each specific_issue label among the class-1 tweets.
df_gpt_class_1['specific_issue'].value_counts()

In [None]:
# Subset of tweets whose first-layer label is dispute_area == 2, exported as both Excel and pickle.
df_class2 = df_classified.loc[df_classified['dispute_area'].eq(2)]
df_class2.to_excel('../all_tweets_class2.xlsx')
df_class2.to_pickle('../all_tweets_class2.pkl')

In [None]:
import json
from pathlib import Path
# Load the second-layer GPT results for dispute area 2.
fpath = Path('class_2_issues.json')
result = json.loads(fpath.read_text())

# JSON keys become the index, which is then moved into an 'mblogid' column.
gpt_class2 = (
    pd.DataFrame.from_dict(result, orient='index', columns=['reason','specific_issue', 'perpetrator'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)

# Left-join the GPT labels onto the class-2 tweets by tweet id.
df_gpt_class_2 = df_class2.merge(gpt_class2, how='left', on='mblogid')

df_gpt_class_2.info()

In [None]:
# Frequency of each specific_issue label among the class-2 tweets.
df_gpt_class_2['specific_issue'].value_counts()

In [None]:
# Frequency of each perpetrator label among the class-2 tweets.
df_gpt_class_2['perpetrator'].value_counts()

In [None]:
# Subset of tweets whose first-layer label is dispute_area == 3.
df_class3 = df_classified.loc[df_classified['dispute_area'].eq(3)]
df_class3.info()

In [None]:
# Remove pandas' limits on displayed rows and column width so the text is not truncated.
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
# Display the cleaned text of the class-3 tweets.
df_class3['content_clean']

In [None]:
# Subset of tweets whose first-layer label is dispute_area == 4.
df_class4 = df_classified.loc[df_classified['dispute_area'].eq(4)]
df_class4.info()

In [None]:
# Export the dispute_area == 4 subset as both an Excel workbook and a pickle.
df_class4.to_excel('../all_tweets_class4.xlsx')
df_class4.to_pickle('../all_tweets_class4.pkl')

In [None]:
import json
from pathlib import Path
# Load the second-layer GPT results for dispute area 4.
fpath = Path('class_4_issues.json')
result = json.loads(fpath.read_text())

# JSON keys become the index, which is then moved into an 'mblogid' column.
gpt_class4 = (
    pd.DataFrame.from_dict(result, orient='index', columns=['reason','specific_issue', 'perpetrator'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)

# Left-join the GPT labels onto the class-4 tweets by tweet id.
df_gpt_class_4 = df_class4.merge(gpt_class4, how='left', on='mblogid')

df_gpt_class_4.info()

In [None]:
# Print the label frequencies of both second-layer variables for the class-4 tweets.
for column in ('specific_issue', 'perpetrator'):
    print(df_gpt_class_4[column].value_counts())

In [None]:
# Subset of tweets whose first-layer label is dispute_area == 5.
df_class5 = df_classified.loc[df_classified['dispute_area'].eq(5)]
df_class5.info()

In [None]:
# Remove pandas' limits on displayed rows and column width so the text is not truncated.
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
# Display the cleaned text of the class-5 tweets.
df_class5['content_clean']

In [None]:
# Subset of tweets whose first-layer label is dispute_area == 6.
df_class6 = df_classified.loc[df_classified['dispute_area'].eq(6)]
df_class6.info()

In [None]:
# Remove pandas' limits on displayed rows and column width so the text is not truncated.
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
# Display the cleaned text of the class-6 tweets.
df_class6['content_clean']

In [None]:
# Load the final cleaned tweet table that the classification labels will be mapped onto.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
df_clean_final = pd.read_pickle('../all_tweets_cleaned_final.pkl')

In [None]:
# Summarize the final cleaned table before any labels are added.
df_clean_final.info()

In [None]:
# Summarize the classified table for comparison with the final cleaned table.
df_classified.info()

In [None]:
# Look up each tweet's dispute_area in df_classified by mblogid and store it on
# df_clean_final; ids absent from df_classified get a missing value.
dispute_area_by_id = df_classified.set_index('mblogid')['dispute_area']
df_clean_final['dispute_area'] = df_clean_final['mblogid'].map(dispute_area_by_id)

df_clean_final.info()

In [None]:
# Write the table, now carrying 'dispute_area', back to the same pickle it was loaded from.
df_clean_final.to_pickle('../all_tweets_cleaned_final.pkl')

# Combine all GPT results together

In [None]:
import json
from pathlib import Path
# Load the GPT results holding a region and a types-of-violence value per tweet.
fpath = Path('region&violencetypes.json')
result = json.loads(fpath.read_text())

# JSON keys become the index, which is then moved into an 'mblogid' column.
region_type = (
    pd.DataFrame.from_dict(result, orient='index', columns=['reason','region', 'types_of_violence'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)

# Left-join these columns onto the final cleaned table by tweet id.
df_clean_final = df_clean_final.merge(region_type, how='left', on='mblogid')

df_clean_final.info()

In [None]:
# Stack the second-layer tables for dispute areas 1, 2 and 4 into one frame with a fresh row index.
layer2_frames = [df_gpt_class_1, df_gpt_class_2, df_gpt_class_4]
df_gpt_class_all = pd.concat(layer2_frames, ignore_index=True)
df_gpt_class_all.info()

In [None]:
# Copy the second-layer labels from the combined class 1/2/4 table onto
# df_clean_final, matching rows by mblogid; unmatched ids get a missing value.
layer2_by_id = df_gpt_class_all.set_index('mblogid')
for column in ('specific_issue', 'perpetrator'):
    df_clean_final[column] = df_clean_final['mblogid'].map(layer2_by_id[column])

In [None]:
# Summarize the final table after the second-layer labels were mapped on.
df_clean_final.info()

In [None]:
# Frequency of each dispute_area label in the final table.
df_clean_final['dispute_area'].value_counts()

In [None]:
# Persist the fully labelled table as the analysis dataset.
df_clean_final.to_pickle('../data_for_analysis.pkl')

In [None]:
import pandas as pd
# Reload the analysis dataset from disk; note this rebinds the name `df`.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
df = pd.read_pickle('../data_for_analysis.pkl')

In [None]:
# Summarize the reloaded analysis dataset.
df.info()

In [None]:
import json
from pathlib import Path
# Load the per-tweet province values from JSON.
fpath = Path('province.json')
result = json.loads(fpath.read_text())

# JSON keys become the index, which is then moved into an 'mblogid' column.
province = (
    pd.DataFrame.from_dict(result, orient='index', columns=['province'])
    .reset_index()
    .rename(columns={'index': 'mblogid'})
)

# Left-join the province onto the analysis data by tweet id.
# '../data_for_analysis.pkl' is later overwritten with a 'province' column, so a
# reloaded df may already contain one. Drop it first: otherwise the merge would
# produce 'province_x'/'province_y' and the later df['province'] lookups would fail.
df = pd.merge(df.drop(columns='province', errors='ignore'), province, how='left', on='mblogid')

df.info()

In [None]:
# List the distinct raw province strings (ordered by frequency) to see which spellings need normalizing.
df['province'].value_counts().index

In [None]:
# Mapping from the raw province spellings to one canonical short name each;
# the string '-1' is mapped to 'NA'.
mapping_dict = {
    '-1': 'NA',
    '上海': '上海', '上海市': '上海',
    '云南': '云南', '云南省': '云南',
    '内蒙古': '内蒙古', '内蒙古自治区': '内蒙古',
    '北京': '北京', '北京市': '北京',
    '吉林省': '吉林',
    '四川': '四川', '四川省': '四川',
    '天津': '天津', '天津市': '天津',
    '宁夏': '宁夏',
    '安徽': '安徽', '安徽省': '安徽',
    '山东': '山东', '山东省': '山东',
    '山西': '山西', '山西省': '山西',
    '广东': '广东', '广东省': '广东',
    '广西': '广西', '广西省': '广西',
    '江苏': '江苏', '江苏省': '江苏',
    '江西': '江西', '江西省': '江西',
    '河北': '河北', '河北省': '河北',
    '河南': '河南', '河南省': '河南',
    '浙江': '浙江', '浙江省': '浙江',
    '海南省': '海南',
    '湖北': '湖北', '湖北省': '湖北',
    '湖南': '湖南', '湖南省': '湖南',
    '甘肃': '甘肃', '甘肃省': '甘肃',
    '福建': '福建', '福建省': '福建',
    '贵州': '贵州', '贵州省': '贵州',
    '辽宁': '辽宁', '辽宁省': '辽宁',
    '重庆': '重庆', '重庆市': '重庆',
    '陕西': '陕西', '陕西省': '陕西',
    '青海': '青海', '青海省': '青海',
    '黑龙江': '黑龙江', '黑龙江省': '黑龙江'
}

# Every canonical name also maps to itself. Without this, re-running the cell on
# already-normalized data would turn names that are outputs but not keys
# ('吉林', '海南', 'NA') into NaN.
province_lookup = {name: name for name in mapping_dict.values()}
province_lookup.update(mapping_dict)

# Apply mapping. Series.map with a dict yields NaN for any value that is not a key.
raw_province = df['province']
df['province'] = raw_province.map(province_lookup)

# Report values that were present but not recognized, so they are not lost silently.
unmapped = sorted(raw_province[raw_province.notna() & df['province'].isna()].astype(str).unique())
if unmapped:
    print(f'Unmapped province values set to NaN: {unmapped}')

# Check results
print(df['province'].value_counts().sort_index(ascending=True))


In [None]:
# Overwrite the analysis dataset on disk with the version that includes the normalized province column.
df.to_pickle('../data_for_analysis.pkl')