In [None]:
# This notebook compares the results generated by GPT-4 with hand-labeled results

In [None]:
# Sample 30 datapoints from each theme.
# A fixed random_state makes the draw reproducible across kernel restarts, and
# GroupBy.sample keeps the original flat row index, so 'new_theme' stays an
# ordinary column instead of also becoming an index level.
# NOTE(review): `df` is not loaded in the visible part of the notebook before
# this cell — confirm where it is expected to come from.
sampled_df = df.groupby('new_theme').sample(n=30, replace=False, random_state=42)
hand_code_df = sampled_df[['id', 'tid', 'fulltext', 'gpt_sentiment', 'new_theme']]

In [None]:
# The first round of hand coding did not restrict the data to the years 2015-2023, so in this notebook I re-select 5*8 data points for a second round of hand coding.

In [None]:
# Read in the 1st round of hand coding result
import pandas as pd
code_df = pd.read_excel('../processed_data/hand_code_df_1st_round.xlsx')

In [None]:
# Match year back to the hand code dataset
id_to_year_map = df.set_index('id')['year']

# Map the 'id' in code_df to their corresponding 'years' using the map created
code_df['year'] = code_df['id'].map(id_to_year_map)

# Keep only rows that were actually hand coded (theme_hand present) and whose
# year falls in 2015-2023
code_df = code_df[(code_df['year']>2014) & (code_df['year']<2024) & ~code_df['theme_hand'].isna()]

# Randomly sample 25 datapoints from each theme.
# random_state pins the draw so a re-run selects the same rows; GroupBy.sample
# keeps the flat row index, so 'new_theme' remains a plain column rather than
# being both an index level and a column.
code_df = code_df.groupby('new_theme').sample(n=25, replace=False, random_state=42)

In [None]:
code_df.info()

In [None]:
# index=False: after the filtering/sampling above the index only carries row
# bookkeeping, and writing it would add extra index columns to the workbook
# that come back as stray columns when the file is read again.
# NOTE(review): this overwrites the same workbook that code_df was read from.
code_df.to_excel('../processed_data/hand_code_df_1st_round.xlsx', index=False)

# --------The following steps re-select new data for 2nd round of hand coding-----


In [None]:
# Clean data with year requirements
df = pd.read_parquet('../processed_data/with_gpt_results_2nd_round.parquet')

In [None]:
df['year'].describe()

In [None]:
# Keep only rows whose year is strictly after 2014 and strictly before 2024
df = df[df['year'].gt(2014) & df['year'].lt(2024)]

In [None]:
df.to_parquet('../processed_data/final_data_2015-23')

In [None]:
# Randomly sample 5 data points from each theme group.
# random_state makes the selection reproducible; GroupBy.sample keeps the flat
# row index, so 'new_theme' is written to the workbook once, as a normal column.
sampled_df = df.groupby('new_theme').sample(n=5, replace=False, random_state=42)
hand_code_df_2 = sampled_df[['id', 'tid', 'fulltext', 'gpt_sentiment', 'new_theme', 'year']]
# NOTE(review): the next cell reads this same path back and later cells use
# hand-coded columns that are not written here, so presumably the workbook is
# edited by hand afterwards — re-running this cell would overwrite that work.
hand_code_df_2.to_excel('../processed_data/hand_code_df_2nd_round.xlsx')

In [None]:
# Concatenate 1st round and 2nd round hand coding results.
code_df_1 = pd.read_excel('../processed_data/hand_code_df_1st_round.xlsx')
code_df_2 = pd.read_excel('../processed_data/hand_code_df_2nd_round.xlsx')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index; otherwise both
# rounds contribute their own 0-based labels and the index contains duplicates.
hand_code_all = pd.concat([code_df_1, code_df_2], axis=0, ignore_index=True)

In [None]:
hand_code_all.info()

In [None]:
hand_code_all['year'].describe()

In [None]:
# Give the GPT-4 columns the same <measure>_<source> naming as theme_hand / sentiment_hand
hand_code_all = hand_code_all.rename(
    columns={'new_theme': 'theme_gpt', 'gpt_sentiment': "sentiment_gpt"}
)
hand_code_all.info()

# Topic classification results evaluation

In [None]:
# Rows where the GPT-4 theme and the hand-coded theme are not equal
bad_result = hand_code_all[~(hand_code_all['theme_gpt'] == hand_code_all['theme_hand'])]
# Disagreement rate over all hand-coded rows
print(bad_result.shape[0] / hand_code_all.shape[0])
# Disagreements broken down by the theme GPT-4 assigned
print(bad_result['theme_gpt'].value_counts())

In [None]:
# Confusion matrix for -1 and others
def map_values(x):
    """Collapse a theme code into a binary relevance flag.

    Returns 0 when the theme is -1, 1 when it lies in the closed range 1..7,
    and None for any other value (including NaN, which fails both checks).
    """
    if x == -1:
        return 0
    return 1 if 1 <= x <= 7 else None
# Derive the binary relevance flag (0 for theme -1, 1 for themes 1-7) for both
# the GPT-4 theme and the hand-coded theme
hand_code_all['relevance_gpt'] = hand_code_all['theme_gpt'].apply(map_values)
hand_code_all['relevance_hand'] = hand_code_all['theme_hand'].apply(map_values)

In [None]:
hand_code_all['relevance_gpt'].value_counts()

In [None]:
hand_code_all['relevance_hand'].value_counts()

In [None]:
# Rows where the GPT-4 and hand-coded relevance flags are not equal
bad_relevance = hand_code_all[~(hand_code_all['relevance_gpt'] == hand_code_all['relevance_hand'])]
# Share of all hand-coded rows on which the two relevance flags disagree
bad_relevance.shape[0] / hand_code_all.shape[0]

In [None]:
# Share of rows on which GPT-4 and the hand coder agree about relevance.
# Computed from the mismatch set built in the previous cell rather than from
# counts typed in by hand ((192+18)/240), which go stale whenever the data or
# the sample changes.
1 - len(bad_relevance)/len(hand_code_all)

In [None]:
# Confusion matrix for relevance
from sklearn.metrics import confusion_matrix
import numpy as np

# Hand-coded relevance is the reference; GPT-4 relevance is the prediction
y_true = hand_code_all['relevance_hand']
y_pred = hand_code_all['relevance_gpt']

# Generate the confusion matrix
# (rows follow y_true, columns follow y_pred, labels in sorted order)
cm = confusion_matrix(y_true, y_pred)

print("Confusion Matrix:")
print(cm)

In [None]:
# Confusion matrix for theme_gpt and theme_hand

# Keep only rows that both the hand coder and GPT-4 flagged as relevant (themes 1-7)
relevant_data = hand_code_all[(hand_code_all['relevance_hand']==1) & (hand_code_all['relevance_gpt']==1)]
theme_true = relevant_data['theme_hand']
theme_pred = relevant_data['theme_gpt']

# Generate the confusion matrix (rows: hand-coded theme, columns: GPT-4 theme).
# Passing labels explicitly pins the result to a 7x7 matrix in theme order 1..7
# even when a theme is absent from the sample; with inferred labels the matrix
# would shrink and no longer line up with the fixed 1..7 tick labels used when
# it is plotted.
cm_theme = confusion_matrix(theme_true, theme_pred, labels=list(range(1, 8)))

print("Confusion Matrix:")
print(cm_theme)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
sns.heatmap(cm_theme, annot=True, fmt="d", cmap='Blues', xticklabels=range(1, 8), yticklabels=range(1, 8))
# cm_theme was built as confusion_matrix(theme_true, theme_pred): its rows are
# the hand-coded (true) themes and its columns the GPT-4 (predicted) themes.
# heatmap draws rows along the y-axis and columns along the x-axis, so the
# GPT-4 label belongs on x and the manual label on y.
plt.xlabel('GPT-4 Classified Labels')
plt.ylabel('Manually Coded Labels')
plt.title('Confusion Matrix')
plt.savefig('../Graphs/confusion_matrix.png')
plt.show()

In [None]:
# Among rows both sides consider relevant, find those whose themes are not equal
false_class = relevant_data[~(relevant_data['theme_gpt'] == relevant_data['theme_hand'])]
# Number of theme disagreements, number of jointly relevant rows, and their ratio
print(false_class.shape[0])
print(relevant_data.shape[0])
print(false_class.shape[0] / relevant_data.shape[0])

In [None]:
# Theme agreement rate among jointly relevant rows, derived from the mismatch
# set of the previous cell instead of a number copied from its printed output
1 - len(false_class) / len(relevant_data)

# Sentiment analysis result evaluation

In [None]:
df_all = pd.read_parquet('../processed_data/final_data_2015-23')

In [None]:
# Attach the 'sentiment' score from df_all to every hand-coded row, matched on 'tid'
df_sent = df_all[['tid', 'sentiment']]
merged_sent = pd.merge(hand_code_all, df_sent, on='tid')

# The default inner merge silently drops rows whose tid is missing from df_all
# and duplicates rows whose tid occurs more than once there. Either case would
# skew every ratio computed from hand_code_all afterwards, so fail loudly and
# leave hand_code_all untouched instead.
if len(merged_sent) != len(hand_code_all):
    raise ValueError(
        f"merge on 'tid' changed the number of hand-coded rows "
        f"from {len(hand_code_all)} to {len(merged_sent)}"
    )
hand_code_all = merged_sent

In [None]:
# Name the merged-in score sentiment_nlp so it sits alongside sentiment_gpt / sentiment_hand
hand_code_all = hand_code_all.rename(columns={'sentiment': 'sentiment_nlp'})
hand_code_all.info()

In [None]:
# Re-code sentiment_gpt and sentiment_hand from the 1-7 scale to
# negative (-1) / neutral (0) / positive (1):
# scores 1-3 map to -1, 4 maps to 0, and 5-7 map to 1
sen_map_1 = {score: (score > 4) - (score < 4) for score in range(1, 8)}

# Values outside the mapping (including NaN) become NaN in the new columns
hand_code_all['sen_gpt_new'] = hand_code_all['sentiment_gpt'].map(sen_map_1)
hand_code_all['sen_hand_new'] = hand_code_all['sentiment_hand'].map(sen_map_1)

In [None]:
# Confusion matrix for sentiment_hand (reference) and sentiment_gpt (prediction)

# The hand-coded labels are the ground truth and GPT-4 is the classifier being
# evaluated, the same orientation as the relevance and theme matrices earlier
# in the notebook (rows: hand-coded, columns: GPT-4).
sen_true = hand_code_all['sen_hand_new']
sen_pred = hand_code_all['sen_gpt_new']

# Generate the confusion matrix
cm_sen_gpt = confusion_matrix(sen_true, sen_pred)

print("Confusion Matrix:")
print(cm_sen_gpt)

In [None]:
# Rows where the recoded GPT-4 and hand-coded sentiment classes are not equal
bad_sen_class = hand_code_all[~(sen_true == sen_pred)]
# Share of all rows on which the two sentiment classes disagree
bad_sen_class.shape[0] / hand_code_all.shape[0]

In [None]:
# GPT-4 vs hand-coded sentiment agreement rate, derived from the mismatch set
# of the previous cell instead of a rounded number copied from its output
1 - len(bad_sen_class)/len(hand_code_all)

In [None]:
hand_code_all['sentiment_nlp'].describe()

In [None]:
# Re-calculate sentiment_nlp as positive/neutral/negative
def map_sen(x):
    """Bucket a continuous sentiment score into three classes.

    Returns -1 for scores below 0.4, 1 for scores above 0.6, and 0 otherwise.
    The boundaries 0.4 and 0.6 themselves fall in the 0 bucket, as does NaN,
    because both comparisons are False for it.
    """
    if x < 0.4:
        return -1
    return 1 if x > 0.6 else 0

# Bucket the continuous NLP score into -1 / 0 / 1.
# Note: a NaN score fails both comparisons in map_sen and is therefore labelled 0.
hand_code_all['sen_nlp_new'] = hand_code_all['sentiment_nlp'].apply(map_sen)

In [None]:
hand_code_all['sen_nlp_new'].value_counts()

In [None]:
# NOTE(review): hard-coded ratio; presumably 163 is a count read off the
# value_counts() output above and 240 the number of hand-coded rows — confirm,
# and consider computing it from the data so it cannot go stale.
163/240

In [None]:
# Confusion matrix for sentiment_gpt and sentiment_nlp
# NOTE(review): the reference ("true") labels here are the recoded GPT-4
# sentiments, not the hand-coded ones, so this measures NLP-vs-GPT-4 agreement.
# If the NLP score is meant to be evaluated against the manual labels, the
# reference should be 'sen_hand_new' — confirm the intent.

sen_nlp_true = hand_code_all['sen_gpt_new']
sen_nlp_pred = hand_code_all['sen_nlp_new']

# Generate the confusion matrix
cm_sen_nlp = confusion_matrix(sen_nlp_true, sen_nlp_pred)

print("Confusion Matrix:")
print(cm_sen_nlp)

In [None]:
# Rows where the NLP sentiment class is not equal to the reference class
bad_sen_nlp_class = hand_code_all[~(sen_nlp_true == sen_nlp_pred)]
# Share of all rows on which the two classes disagree
bad_sen_nlp_class.shape[0] / hand_code_all.shape[0]

In [None]:
# NLP sentiment agreement rate, derived from the mismatch set of the previous
# cell instead of a rounded number copied from its output
1 - len(bad_sen_nlp_class)/len(hand_code_all)