In [108]:
import csv
import json
import math

import altair as alt
import numpy as np
import pandas as pd
import scipy.stats

In [22]:
# Load the per-month theme counts; results_summary.csv is written by gather_results.py.
results = (
    pd.read_csv('results_summary.csv')
    .head(13)  # keep only the first 13 rows of the summary
    .drop(columns=['Days Completed', 'Message Count', 'Num Jobs'])
)
# Show the table keyed by month; `results` itself keeps 'Year-Month' as a regular column.
results.set_index('Year-Month')

Unnamed: 0_level_0,access.info,adv.impacts,community,other,conspiracy,alt.remedies,misinfo,vaccine.comp
Year-Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-05,23,41,42,87,30,19,51,7
2021-06,78,147,62,211,159,34,121,31
2021-07,224,443,431,1575,829,105,298,136
2021-08,273,435,498,1155,577,110,347,85
2021-09,296,709,811,1426,648,159,496,81
2021-10,186,503,570,1300,744,168,349,151
2021-11,130,355,266,664,364,38,221,51
2021-12,148,413,294,640,450,57,348,41
2022-01,120,267,223,434,319,40,191,50
2022-02,79,153,265,227,215,174,124,27


In [18]:
# Grand total of theme counts over every month and every theme column.
theme_counts = results.drop(columns='Year-Month')
total_messages = theme_counts.sum().sum()
total_messages

np.int64(29638)

In [16]:
# Messages summed by category: one total per theme column, over all months.
results.drop(columns=['Year-Month']).sum(axis=0)

access.info      1756
adv.impacts      3586
community        3633
other           10013
conspiracy       4954
alt.remedies     2341
misinfo          2685
vaccine.comp      670
dtype: int64

In [19]:
# Proportion of all counted messages that falls in each category.
results.drop(columns=['Year-Month']).sum(axis=0).div(total_messages)

access.info     0.059248
adv.impacts     0.120993
community       0.122579
other           0.337843
conspiracy      0.167150
alt.remedies    0.078986
misinfo         0.090593
vaccine.comp    0.022606
dtype: float64

In [146]:
# Per-month theme counts plus a `total` column holding the sum across themes for
# that month. The month-indexed frame is built once and reused; the original cell
# also evaluated the row sums on a line of their own, whose result was discarded.
monthly_counts = results.set_index('Year-Month')
monthly_counts.assign(total=monthly_counts.sum(axis=1))

Unnamed: 0_level_0,access.info,adv.impacts,community,other,conspiracy,alt.remedies,misinfo,vaccine.comp,total
Year-Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-05,23,41,42,87,30,19,51,7,300
2021-06,78,147,62,211,159,34,121,31,843
2021-07,224,443,431,1575,829,105,298,136,4041
2021-08,273,435,498,1155,577,110,347,85,3480
2021-09,296,709,811,1426,648,159,496,81,4626
2021-10,186,503,570,1300,744,168,349,151,3971
2021-11,130,355,266,664,364,38,221,51,2089
2021-12,148,413,294,640,450,57,348,41,2391
2022-01,120,267,223,434,319,40,191,50,1644
2022-02,79,153,265,227,215,174,124,27,1264


In [70]:
# Unique posters per month; unique_users.csv is written by the count_user_ids.py script.
user_data = pd.read_csv(filepath_or_buffer='unique_users.csv')
# Display keyed by month; `user_data` itself keeps 'Year-Month' as a regular column.
user_data.set_index(keys='Year-Month')

Unnamed: 0_level_0,User_Count
Year-Month,Unnamed: 1_level_1
2021-05,44
2021-06,103
2021-07,441
2021-08,544
2021-09,760
2021-10,597
2021-11,434
2021-12,437
2022-01,305
2022-02,386


In [141]:
# Place the unique-user count next to the theme counts, aligning rows on month.
users_by_month = user_data.set_index('Year-Month')
themes_by_month = results.set_index('Year-Month')
results_with_users = pd.concat([users_by_month, themes_by_month], axis='columns')
results_with_users

Unnamed: 0_level_0,User_Count,access.info,adv.impacts,community,other,conspiracy,alt.remedies,misinfo,vaccine.comp
Year-Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-05,44,23,41,42,87,30,19,51,7
2021-06,103,78,147,62,211,159,34,121,31
2021-07,441,224,443,431,1575,829,105,298,136
2021-08,544,273,435,498,1155,577,110,347,85
2021-09,760,296,709,811,1426,648,159,496,81
2021-10,597,186,503,570,1300,744,168,349,151
2021-11,434,130,355,266,664,364,38,221,51
2021-12,437,148,413,294,640,450,57,348,41
2022-01,305,120,267,223,434,319,40,191,50
2022-02,386,79,153,265,227,215,174,124,27


In [53]:
# Long format: one row per (month, theme), with the message count in `value`.
# Theme columns are selected by name, i.e. every column except 'Year-Month'.
# The previous positional slice `results.columns[2:]` also skipped the first
# theme column (access.info), so that theme was missing from every chart.
value_vars = results.columns.drop('Year-Month')
melted_results = pd.melt(results, id_vars=['Year-Month'], value_vars=value_vars)

# User counts repeated once per theme. Kept under its original name, but sized
# from the number of themes instead of a hard-coded 7.
multiple_user_data = pd.concat([user_data.set_index('Year-Month')] * len(value_vars))

# Attach each month's unique-user count to every theme row of that month.
# A keyed merge does not depend on row order or on the number of themes;
# `validate` raises if unique_users.csv lists a month more than once.
melted_results_with_users = melted_results.merge(
    user_data, on='Year-Month', how='left', validate='many_to_one')

In [166]:
# Stacked bars of theme counts per month, overlaid with the unique-user count (red line).
base = alt.Chart(melted_results_with_users, width=500).encode(
    x=alt.X('Year-Month', title='Months sampled'),
)
messages = base.mark_bar().encode(
    y=alt.Y('sum(value)', title='Count of messages matching theme'),
    color=alt.Color('variable', title='Theme'),
)
users = base.mark_line(color='red').encode(
    y=alt.Y('User_Count', title='Unique users posting messages'),
)
# The two layers are given independent y scales, so each gets its own axis.
(messages + users).resolve_scale(y='independent')

In [165]:
# Stacked bar chart: summed message counts per month, coloured by theme.
month_axis = alt.X('Year-Month', title='Months sampled')
count_axis = alt.Y('sum(value)', title='Count of messages matching theme')
theme_color = alt.Color('variable', title='Theme')
alt.Chart(melted_results, width=500).mark_bar().encode(x=month_axis, y=count_axis, color=theme_color)

In [30]:
# Same bars with the stack normalised, so each month shows its theme mix as shares.
share_axis = alt.Y('sum(value)', title='Percentages of messages matching theme').stack('normalize')
alt.Chart(melted_results, width=600).mark_bar().encode(
    x=alt.X('Year-Month', title='Months sampled'),
    y=share_axis,
    color=alt.Color('variable', title='Theme'),
)

### Message counts
Including spam and non-spam messages

In [123]:
# Load the message counts and keep all of 2021 plus January-May 2022.
message_count_data = pd.read_csv('message_counts.csv')
in_2021 = message_count_data['Year'] == 2021
early_2022 = (message_count_data['Year'] == 2022) & (message_count_data['Month'] <= 5)
message_count_data = message_count_data[in_2021 | early_2022]

# Build the 'YYYY-MM' label straight from the two columns instead of a row-wise
# DataFrame.apply; this also behaves sensibly when the filter selects no rows.
year_month_labels = [
    f'{int(year)}-{int(month):02d}'
    for year, month in zip(message_count_data['Year'], message_count_data['Month'])
]
# Insert the label as the first column.
message_count_data.insert(0, 'Year-Month', value=year_month_labels)
message_count_data

Unnamed: 0,Year-Month,Year,Month,Total messages,Non-spam messages,Non-spam %
0,2021-05,2021,5,524,521,99.43
1,2021-06,2021,6,1465,1463,99.86
2,2021-07,2021,7,6323,6267,99.11
3,2021-08,2021,8,5767,5731,99.38
4,2021-09,2021,9,7735,7676,99.24
5,2021-10,2021,10,6000,5957,99.28
6,2021-11,2021,11,3348,3332,99.52
7,2021-12,2021,12,3714,3701,99.65
8,2022-01,2022,1,2717,2713,99.85
9,2022-02,2022,2,1971,1969,99.9


In [125]:
# Total of the 'Non-spam messages' column over the selected months.
non_spam_counts = message_count_data['Non-spam messages']
non_spam_counts.sum()

np.int64(45154)

### Compute the RMSE between predictions from ChatGPT and Human

In [78]:
# Theme counts for 2021-05 only, used as the ChatGPT side of the comparison.
is_may_2021 = results['Year-Month'] == '2021-05'
chatgpt = results.loc[is_may_2021].drop(columns='Year-Month')

In [79]:
# Label the single row 'ChatGPT' so it stays identifiable once stacked with other rows.
chatgpt = chatgpt.set_axis(pd.Index(['ChatGPT']), axis='index')
chatgpt

Unnamed: 0,access.info,adv.impacts,community,other,conspiracy,alt.remedies,misinfo,vaccine.comp
ChatGPT,23,41,42,87,30,19,51,7


In [80]:
# Hard-coded counts for the 'Human' row, one per theme in `chatgpt`'s column order.
human_counts = np.array([19, 43, 38, 94, 29, 19, 45, 13]).reshape(1, -1)
human = pd.DataFrame(data=human_counts, index=['Human'], columns=chatgpt.columns)
human

Unnamed: 0,access.info,adv.impacts,community,other,conspiracy,alt.remedies,misinfo,vaccine.comp
Human,19,43,38,94,29,19,45,13


In [115]:
# Stack the two rows, then flip to one row per theme for plotting.
chatgpt_vs_human = pd.concat([chatgpt, human])
# After transpose + reset_index the theme name is in the 'index' column.
per_theme = chatgpt_vs_human.transpose().reset_index()

# Scatter of Human vs ChatGPT counts, one point per theme.
chart = (
    alt.Chart(per_theme)
    .mark_point()
    .encode(
        x=alt.X('ChatGPT', title='Classification by ChatGPT'),
        y=alt.Y('Human', title='Classification by Human'),
        color=alt.Color('index', title='Theme', legend=None),
        tooltip='index',
    )
)
# Theme labels drawn just to the right of each point.
text = chart.mark_text(align='left', baseline='middle', dx=7).encode(text='index')
# Semi-transparent regression line of Human on ChatGPT.
regression_line = chart.transform_regression('ChatGPT', 'Human').mark_line(opacity=0.5)

chart + text + regression_line

In [104]:
# Root mean squared error between the first row (ChatGPT) and second row (Human).
per_theme_diff = chatgpt_vs_human.iloc[0] - chatgpt_vs_human.iloc[1]
distance = (per_theme_diff ** 2).sum()  # sum of squared per-theme differences
math.sqrt(distance / len(per_theme_diff))

4.444097208657794

In [111]:
# Correlation coefficient and r^2 from a linear regression of row 1 (Human)
# on row 0 (ChatGPT) of the comparison table.
np_matrix = chatgpt_vs_human.to_numpy()
fit = scipy.stats.linregress(x=np_matrix[0], y=np_matrix[1])
slope, intercept, r_value, p_value, std_err = fit
r_squared = r_value ** 2
(r_squared, r_value, p_value)

(np.float64(0.966881914918944),
 np.float64(0.9833015381453158),
 np.float64(1.1495144278186045e-05))

### Convert spam messages to tabular format

In [76]:
# Convert spammy_messages.jsonl (one JSON object per line, read for its "count"
# and "message" keys) into spam.csv with a `count,message` header row.
#
# - `with` closes both files even if a line fails to parse.
# - csv.writer quotes messages that contain commas or quotes, which the previous
#   hand-built f-string row did not, so such messages no longer corrupt the file.
# - The previous f-string had a backslash inside its expression, which is a
#   SyntaxError on Python versions before 3.12.
# - UTF-8 is set explicitly so the result does not depend on the platform locale.
with open('spammy_messages.jsonl', encoding='utf-8') as spam_messages_file, \
        open('spam.csv', 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.writer(output_file, lineterminator='\n')
    writer.writerow(['count', 'message'])
    for line in spam_messages_file:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL input
        msg_info = json.loads(line)
        # Newlines inside a message are flattened to spaces, as before.
        writer.writerow([msg_info['count'], msg_info['message'].replace('\n', ' ')])

### Flow of top users from month to month

In [152]:
# Inputs for this section, top_users.csv and top_user_messages_month.csv,
# are both written by the find_user_flows.py script.
top_user_data = pd.read_csv(filepath_or_buffer='top_users.csv')
top_user_data

Unnamed: 0,year-month,user_id,count
0,2021-05,user_3,59
1,2021-05,user_31,25
2,2021-05,user_4,23
3,2021-05,user_6,21
4,2021-05,user_8,17
...,...,...,...
60,2022-05,user_7480,146
61,2022-05,user_7684,88
62,2022-05,user_7707,49
63,2022-05,user_7046,7


In [158]:
# Per-month table indexed by 'year-month'; only `top_user_percentage` is used here.
top_user_contribution = pd.read_csv('top_user_messages_month.csv', index_col='year-month')
month_percentage = top_user_contribution['top_user_percentage']
# Look up each row's month in that table and store the value next to the user's
# own count. A label-based .loc lookup raises KeyError for a month that is
# missing from the table.
row_months = top_user_data['year-month'].tolist()
top_user_data = top_user_data.assign(
    top_user_contribution=month_percentage.loc[row_months].to_numpy()
)


In [163]:
# Stacked bars of message counts per month, coloured by user, overlaid with the
# `top_user_contribution` value for that month (red line).
base = alt.Chart(top_user_data, width=400).encode(
    x=alt.X('year-month', title='Months sampled'),
)
user_color = alt.Color(
    'user_id',
    title='Users that are top 5 posters',
    legend=alt.Legend(symbolLimit=20),
    sort='-y',
).scale(scheme="category20")
top_user_chart = base.mark_bar().encode(
    y=alt.Y('sum(count)', title='Count of messages posted'),
    color=user_color,
    tooltip='user_id',
)
user_contribution_chart = base.mark_line(color='red').encode(
    y=alt.Y('top_user_contribution', title='% of messages by top 5 posters'),
    tooltip='top_user_contribution',
)
# The two layers are given independent y scales, so each gets its own axis.
alt.layer(top_user_chart, user_contribution_chart).resolve_scale(y='independent')