In [1]:
import pandas as pd

### Load ADHD Women Posts Dataset

In [None]:
posts = pd.read_csv('../data/datasets/reddit-adhd-dataset/adhdwomen.csv')

In [3]:
print(f"{posts.head()}")

                                         title  \
0         Adult Women Are the New Face of ADHD   
1           Why Women Hide Their ADHD Symptoms   
2  Adult ADHD and Burnout: Success or Failure?   
3            How Am I And My ADHD Still Alive?   
4         I'd like to see this subreddit grow!   

                                            selftext  score      id  \
0                                                NaN      3  29kaf8   
1                                                NaN      3  2ip2ra   
2                                                NaN      1  2q6jdk   
3                                                NaN      2  2sc7fa   
4  Hello, I'm a working, married, mother of 3.  I...      1  3296xx   

                                                 url  num_comments  \
0  http://www.thedailybeast.com/articles/2014/06/...             0   
1  https://euromd.com/9-diseases-and-conditions/1...             0   
2  http://rethinkadhd.wordpress.com/2014/12/23/ad...          

In [None]:
missing_values = posts.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
title                  0
selftext            5188
score                  0
id                     0
url                    0
num_comments           0
created_utc            0
created_datetime       0
dtype: int64


In [None]:
# join the title and the selftext in a single column if the selftext is not null or deleted
posts['body'] = posts['title']
posts.loc[posts['selftext'].notnull() & ~posts['selftext'].isin(['[deleted]']), 'body'] = posts['title'] + ' ' + posts['selftext']

In [None]:
# drop the columns that are not needed
posts = posts.drop(['score', 'num_comments', 'created_utc', 'created_datetime', 'title', 'selftext', 'url'], axis=1)
print(f"{posts.head()}")
print(f"Data Shape: {posts.shape}")

       id                                               body
0  29kaf8               Adult Women Are the New Face of ADHD
1  2ip2ra                 Why Women Hide Their ADHD Symptoms
2  2q6jdk        Adult ADHD and Burnout: Success or Failure?
3  2sc7fa                  How Am I And My ADHD Still Alive?
4  3296xx  I'd like to see this subreddit grow! Hello, I'...


In [None]:
print(f"{posts.describe()}")

            id          body
count    44384         44384
unique   22237         22079
top     29kaf8  Anyone else?
freq         2            10


In [None]:
# we want to keep only the unique posts
posts = posts.drop_duplicates(subset='body')
print(f"Data Shape: {posts.shape}")

### Load ADHD Women Comments Dataset

In [11]:
comment = pd.read_csv('../data/datasets/reddit-adhd-dataset/adhdwomen-comment.csv')

In [12]:
print(f"{comment.head()}")

                                                body       id  score  \
0  I'd like to see this sub be more active, too. ...  cqowxhs      1   
1  I've found people are more receptive when you ...  cvzg3v2      1   
2  Thank you so much. I have been trying to use m...  cw65vo8      1   
3                                          [deleted]  d2tscyn      1   
4  Sooooo, not sure why you were told it was 24 h...  d38enqz      1   

   created_utc     created_datetime  
0   1430023102  2015-04-26 04:38:22  
1   1444835103  2015-10-14 15:05:03  
2   1445326215  2015-10-20 07:30:15  
3   1462457040  2016-05-05 14:04:00  
4   1463457224  2016-05-17 03:53:44  


In [None]:
comment = comment.drop(['score', 'created_utc', 'created_datetime'], axis=1)
print(f"{comment.head()}")
print(f"Data Shape: {comment.shape}")

                                                body       id
0  I'd like to see this sub be more active, too. ...  cqowxhs
1  I've found people are more receptive when you ...  cvzg3v2
2  Thank you so much. I have been trying to use m...  cw65vo8
3                                          [deleted]  d2tscyn
4  Sooooo, not sure why you were told it was 24 h...  d38enqz


In [None]:
# print the number of unique values in each column
print(comment.describe())

body    195967
id      202658
dtype: int64


In [None]:
# drop all rows that arent unique in "body"
comment = comment.drop_duplicates(subset='body')
print(f"Data Shape: {comment.shape}")

### Concatenate the Two Datasets

In [18]:
# concatanate the two dataframes
df = pd.concat([posts, comment], ignore_index=True)
print(f"{df.head()}")
print(f"Data Shape: {df.shape}")

       id                                               body
0  29kaf8               Adult Women Are the New Face of ADHD
1  2ip2ra                 Why Women Hide Their ADHD Symptoms
2  2q6jdk        Adult ADHD and Burnout: Success or Failure?
3  2sc7fa                  How Am I And My ADHD Still Alive?
4  3296xx  I'd like to see this subreddit grow! Hello, I'...
Data Shape: (218046, 2)


In [None]:
# print the number of unique values in each column
print(df.describe())

id      218045
body    217985
dtype: int64


In [20]:
# drop all rows that arent unique in "body"
df = df.drop_duplicates(subset='body')
print(f"Data Shape: {df.shape}")

Data Shape: (217985, 2)


In [None]:
df.to_csv('data/adhdwomen-final.csv', index=False)