# Downloading data

1. r/domesticviolence
2. r/MeToo
3. r/SexualHarassment
4. r/sexualassault

Reference: ConvoKit example notebook `corpus_from_pandas.ipynb` — https://github.com/CornellNLP/ConvoKit/blob/master/examples/corpus_from_pandas.ipynb

In [1]:
! pip install convokit

In [2]:
import convokit

In [1]:
import numpy as np
import pandas as pd
from convokit import Corpus, download

In [4]:
# Download the r/domesticviolence subreddit dataset, then load it as a Corpus.
# The value returned by download() is handed straight to the Corpus constructor.
corpus_location = download('subreddit-domesticviolence')
corpus = Corpus(corpus_location)

Downloading subreddit-domesticviolence to /Users/mansisaxena/.convokit/downloads/subreddit-domesticviolence
Downloading subreddit-domesticviolence from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/dogecoin4hipsters~-~donaldtrump/domesticviolence.corpus.zip (6.7MB)... Done


In [5]:
# Print corpus-level counts: speakers, utterances and conversations.
corpus.print_summary_stats()

Number of Speakers: 4908
Number of Utterances: 22654
Number of Conversations: 4943


In [6]:
# Export the corpus as three DataFrames (utterances, conversations, speakers).
# The `vectors` column is dropped from each because this notebook never uses it.
utt_df = corpus.get_utterances_dataframe().drop("vectors", axis="columns")
convo_df = corpus.get_conversations_dataframe().drop("vectors", axis="columns")
speaker_df = corpus.get_speakers_dataframe().drop("vectors", axis="columns")

In [7]:
# Speaker IDs, read from the index of the speakers frame (one entry per row).
speakers = speaker_df.index.tolist()

In [8]:
# Peek at the first conversation row to see which metadata columns are available.
convo_df.head(1)

Unnamed: 0_level_0,meta.title,meta.num_comments,meta.domain,meta.timestamp,meta.subreddit,meta.gilded,meta.gildings,meta.stickied,meta.author_flair_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
o0pbq,"My coworker is in an abusive relationship, and...",7,self.domesticviolence,1325565642,domesticviolence,-1,,False,


In [9]:
# Column dtypes and non-null counts for the conversation-level frame.
convo_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4943 entries, o0pbq to f97y0
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   meta.title              4943 non-null   object
 1   meta.num_comments       4943 non-null   object
 2   meta.domain             4943 non-null   object
 3   meta.timestamp          4943 non-null   object
 4   meta.subreddit          4943 non-null   object
 5   meta.gilded             4943 non-null   object
 6   meta.gildings           388 non-null    object
 7   meta.stickied           4943 non-null   object
 8   meta.author_flair_text  4943 non-null   object
dtypes: object(9)
memory usage: 386.2+ KB


In [10]:
 utt_df.head(1)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.score,meta.top_level_comment,meta.retrieved_on,meta.gilded,meta.gildings,meta.subreddit,meta.stickied,meta.permalink,meta.author_flair_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
o0pbq,1325565642,"My coworker is in an abusive relationship, and...",DVsKat,,o0pbq,3,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/o0pbq/my_coworker...,


In [11]:
# Column dtypes and non-null counts for the utterance-level frame.
utt_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22654 entries, o0pbq to e8t8e05
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   timestamp               22654 non-null  object
 1   text                    22654 non-null  object
 2   speaker                 22654 non-null  object
 3   reply_to                17711 non-null  object
 4   conversation_id         22654 non-null  object
 5   meta.score              22654 non-null  object
 6   meta.top_level_comment  17711 non-null  object
 7   meta.retrieved_on       22654 non-null  object
 8   meta.gilded             22654 non-null  object
 9   meta.gildings           2291 non-null   object
 10  meta.subreddit          22654 non-null  object
 11  meta.stickied           22654 non-null  object
 12  meta.permalink          22654 non-null  object
 13  meta.author_flair_text  22654 non-null  object
dtypes: object(14)
memory usage: 2.6+ MB


In [12]:
# Drop utterances whose text is a moderation/deletion placeholder or empty.
# The filters are applied one at a time, in this order, and the remaining row
# count is printed after each step.
processed_utt = utt_df
for placeholder_text in ("[removed]", "[deleted]", ""):
    keep_mask = processed_utt['text'].ne(placeholder_text)
    processed_utt = processed_utt[keep_mask]
    print(len(processed_utt))

22002
20308
18778


In [13]:
# Split on `reply_to`: rows with no parent are posts, every other row is a comment.
has_no_parent = processed_utt['reply_to'].isna()

posts = processed_utt[has_no_parent]
comments = processed_utt[~has_no_parent]

In [14]:
# (number of posts, number of comments) remaining after the text filtering.
len(posts), len(comments)

(2339, 16439)

In [15]:
# Sanity check before merging posts with conversation metadata: count how many
# post ids also occur in the index of convo_df.
convo_ids = convo_df.index.values
posts_ids = posts.index.values

# np.intersect1d returns the sorted, unique values present in both arrays.
common_ids = np.intersect1d(convo_ids, posts_ids)
num_common_ids = len(common_ids)

print("Number of overlapping values:", num_common_ids)

Number of overlapping values: 2339


In [16]:
# Attach each post's conversation-level title and comment count.
# Both frames are matched on their index; the left join keeps every row of `posts`.
convo_extras = convo_df[['meta.title', 'meta.num_comments']]
merged_posts = pd.merge(
    posts,
    convo_extras,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('_posts', '_convo'),
)
merged_posts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2339 entries, o0pbq to f89sf
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   timestamp               2339 non-null   object
 1   text                    2339 non-null   object
 2   speaker                 2339 non-null   object
 3   reply_to                0 non-null      object
 4   conversation_id         2339 non-null   object
 5   meta.score              2339 non-null   object
 6   meta.top_level_comment  0 non-null      object
 7   meta.retrieved_on       2339 non-null   object
 8   meta.gilded             2339 non-null   object
 9   meta.gildings           213 non-null    object
 10  meta.subreddit          2339 non-null   object
 11  meta.stickied           2339 non-null   object
 12  meta.permalink          2339 non-null   object
 13  meta.author_flair_text  2339 non-null   object
 14  meta.title              2339 non-null   object
 15  meta

In [17]:
# Rename the conversation title column to `title` and reorder the columns so
# that `title` sits next to `text` and `meta.num_comments` next to `meta.score`.
# rename() is used without inplace=True so that no existing frame is mutated;
# the result is simply rebound to `merged_posts`.
post_column_order = [
    'timestamp', 'title', 'text', 'speaker', 'reply_to', 'conversation_id',
    'meta.score', 'meta.num_comments', 'meta.top_level_comment', 'meta.retrieved_on',
    'meta.gilded', 'meta.gildings', 'meta.subreddit', 'meta.stickied',
    'meta.permalink', 'meta.author_flair_text',
]
merged_posts = merged_posts.rename(columns={'meta.title': 'title'})[post_column_order]

In [18]:
# Confirm the renamed `title` column and the new column order.
merged_posts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2339 entries, o0pbq to f89sf
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   timestamp               2339 non-null   object
 1   title                   2339 non-null   object
 2   text                    2339 non-null   object
 3   speaker                 2339 non-null   object
 4   reply_to                0 non-null      object
 5   conversation_id         2339 non-null   object
 6   meta.score              2339 non-null   object
 7   meta.num_comments       2339 non-null   object
 8   meta.top_level_comment  0 non-null      object
 9   meta.retrieved_on       2339 non-null   object
 10  meta.gilded             2339 non-null   object
 11  meta.gildings           213 non-null    object
 12  meta.subreddit          2339 non-null   object
 13  meta.stickied           2339 non-null   object
 14  meta.permalink          2339 non-null   object
 15  meta

In [19]:
# Show the first rows of the posts frame that is about to be exported.
merged_posts.head()

Unnamed: 0_level_0,timestamp,title,text,speaker,reply_to,conversation_id,meta.score,meta.num_comments,meta.top_level_comment,meta.retrieved_on,meta.gilded,meta.gildings,meta.subreddit,meta.stickied,meta.permalink,meta.author_flair_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
o0pbq,1325565642,"My coworker is in an abusive relationship, and...","My coworker is in an abusive relationship, and...",DVsKat,,o0pbq,3,7,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/o0pbq/my_coworker...,
ocpar,1326308957,Why can't I bring myself to leave?,"Sorry if I ramble, I just feel Pike there is s...",dearlydistressedmmm,,ocpar,7,12,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/ocpar/why_cant_i_...,
onm6a,1326998843,My sister is in an abusive relationship and I ...,So I have been living with my sis for almost t...,[deleted],,onm6a,1,0,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/onm6a/my_sister_i...,
or4tz,1327209584,Four years ago my gay best friend beat me up. ...,I was 22; I am now 26. I'm female. My gay best...,[deleted],,or4tz,11,3,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/or4tz/four_years_...,
ot7hk,1327345546,Emotional abuse and blame,Currently in the middle on an intense situatio...,confusedorabused,,ot7hk,3,6,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/ot7hk/emotional_a...,


In [20]:
# Write the posts (with title and comment count) and the filtered comments to
# CSV files, using paths relative to the current working directory.
for frame, csv_name in (
    (merged_posts, 'domesticviolence_posts.csv'),
    (comments, 'domesticviolence_comments.csv'),
):
    frame.to_csv(csv_name)