In [1]:
from pathlib import Path

import pandas as pd

### Load ADHD Posts Dataset

In [2]:
# Load the raw posts export.
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk. Chunked inference can leave a column with mixed types and emits a warning
# on load (the previous run of this cell printed one; presumably the mixed-type DtypeWarning).
posts = pd.read_csv('../data/datasets/reddit-adhd-dataset/ADHD.csv', low_memory=False)

  posts = pd.read_csv('../data/datasets/reddit-adhd-dataset/ADHD.csv')


In [3]:
# Preview the first rows of the posts table as plain text.
print(posts.head())

                                               title  \
0          Android app to strengthen attention/focus   
1  Does anyone here have experience with Imipramine?   
2  New study shows that for people with ADHD, \n"...   
3                 What does the ADHD test look like?   
4       Are you guys good with maps and directions?    

                                            selftext score     id  \
0  Hey /r/ADHD,\n\nCheck out my simple Android ap...     6  k348a   
1  My doctor has suggested it for ADD with anxiet...     4  k3gdz   
2                                                NaN    22  k4q79   
3  I'm 21 and didn't know about ADHD until recent...     6  k5fvd   
4  It seems like one thing I've always been reall...     0  k6efi   

                                                 url  num_comments  \
0  https://www.reddit.com/r/ADHD/comments/k348a/a...           7.0   
1  https://www.reddit.com/r/ADHD/comments/k3gdz/d...           1.0   
2  http://www.utexas.edu/features/2011

In [4]:
# Count the null entries in every column of the posts table.
missing_values = posts.isna().sum()
print("Missing values: ", missing_values, sep="\n")

Missing values: 
title                  1
selftext            5143
score                 21
id                    21
url                   21
num_comments          21
created_utc           21
created_datetime      30
dtype: int64


In [5]:
# Discard the rows whose title is missing, then recount the nulls per column.
posts = posts.dropna(subset=['title'])
remaining_nulls = posts.isnull().sum()
print(f"Missing values after removing rows with missing values in the 'title' column: \n{remaining_nulls}")

Missing values after removing rows with missing values in the 'title' column: 
title                  0
selftext            5143
score                 21
id                    21
url                   21
num_comments          21
created_utc           21
created_datetime      29
dtype: int64


In [6]:
# Build the 'body' text column: the title followed by the selftext when the selftext is
# present and is not one of the '[deleted]' / '[removed]' placeholders; otherwise the
# title on its own.
placeholder_texts = ['[deleted]', '[removed]']
has_selftext = posts['selftext'].notna() & ~posts['selftext'].isin(placeholder_texts)
title_and_text = posts['title'] + ' ' + posts['selftext']
# Keep the joined text where a usable selftext exists, fall back to the title elsewhere.
posts['body'] = title_and_text.where(has_selftext, posts['title'])

In [7]:
# Remove every column that is no longer needed now that 'body' holds the combined text.
unused_columns = ['title', 'selftext', 'score', 'url', 'num_comments', 'created_utc', 'created_datetime']
posts = posts.drop(columns=unused_columns)
print(posts.head())
print("Data Shape:", posts.shape)

      id                                               body
0  k348a  Android app to strengthen attention/focus Hey ...
1  k3gdz  Does anyone here have experience with Imiprami...
2  k4q79  New study shows that for people with ADHD, \n"...
3  k5fvd  What does the ADHD test look like? I'm 21 and ...
4  k6efi  Are you guys good with maps and directions?  I...
Data Shape: (336065, 2)


In [8]:
# Summary of the remaining post columns as produced by DataFrame.describe().
print(posts.describe())

            id             body
count   336044           336065
unique  336044           330722
top      k348a  Do I have ADHD?
freq         1              119


In [9]:
# De-duplicate on the text itself: only the first occurrence of each distinct body is kept.
posts = posts.drop_duplicates(subset=['body'], keep='first')
print("Data Shape:", posts.shape)

Data Shape: (330722, 2)


### Load ADHD Comments Dataset

In [10]:
# Load the raw comments export from the same dataset folder as the posts file.
comments_csv = '../data/datasets/reddit-adhd-dataset/ADHD-comment.csv'
comment = pd.read_csv(comments_csv)

In [11]:
# Preview the first rows of the comments table as plain text.
print(comment.head())

                                                body       id  score  \
0                                          [deleted]  c08otkh    1.0   
1  If I try to look this up right now I will get ...  c09y8qz    2.0   
2  potassium is used as the thing that stops your...  c09yia6    2.0   
3  I've love a link to anything about this.  \n\n...  c0a81e6    3.0   
4  I don't know anything specific, but I would *d...  c0aixrg    2.0   

    created_utc     created_datetime  
0  1.239042e+09  2009-04-06 18:18:07  
1  1.243790e+09  2009-05-31 17:08:19  
2  1.243815e+09  2009-06-01 00:07:50  
3  1.244752e+09  2009-06-11 20:25:36  
4  1.245813e+09  2009-06-24 03:04:51  


In [12]:
# Drop the score and timestamp columns, then show the result.
unused_comment_columns = ['score', 'created_utc', 'created_datetime']
comment = comment.drop(columns=unused_comment_columns)
print(comment.head())
print("Data Shape:", comment.shape)

                                                body       id
0                                          [deleted]  c08otkh
1  If I try to look this up right now I will get ...  c09y8qz
2  potassium is used as the thing that stops your...  c09yia6
3  I've love a link to anything about this.  \n\n...  c0a81e6
4  I don't know anything specific, but I would *d...  c0aixrg
Data Shape: (3356541, 2)


In [13]:
# Summarise each column of the comments table with DataFrame.describe(), which includes
# the number of unique values per column.
comment_summary = comment.describe()
print(comment_summary)

             body       id
count     3356538  3356541
unique    2688395  2895398
top     [deleted]  gg8xds3
freq        83271        3


In [14]:
# Count the null entries per column of the comments table.
missing_values = comment.isna().sum()
print("Missing values: ", missing_values, sep="\n")

Missing values: 
body    3
id      0
dtype: int64


In [15]:
# Keep only comments with usable text: the body must be present and must not be one of
# the '[deleted]' / '[removed]' placeholders.
placeholder_bodies = ['[deleted]', '[removed]']
has_real_text = comment['body'].notna() & ~comment['body'].isin(placeholder_bodies)
comment = comment[has_real_text]
print("Data Shape:", comment.shape)

Data Shape: (3229941, 2)


In [16]:
# Re-check the column summary after the missing and placeholder bodies were filtered out.
filtered_summary = comment.describe()
print(filtered_summary)

                                                     body       id
count                                             3229941  3229941
unique                                            2688393  2788895
top     As per the rules in the side bar, yes or no qu...  gg8xds3
freq                                                15761        3


In [17]:
# Drop every row whose "body" repeats an earlier one; the first occurrence is kept.
comment = comment.drop_duplicates(subset=['body'], keep='first')
print("Data Shape:", comment.shape)

Data Shape: (2688393, 2)


### Concatenate the Two Datasets

In [18]:
# Concatenate posts and comments into a single frame; ignore_index=True gives the
# result a fresh index instead of reusing the original row labels.
frames = [posts, comment]
df = pd.concat(frames, ignore_index=True)
print(df.head())
print("Data Shape:", df.shape)

      id                                               body
0  k348a  Android app to strengthen attention/focus Hey ...
1  k3gdz  Does anyone here have experience with Imiprami...
2  k4q79  New study shows that for people with ADHD, \n"...
3  k5fvd  What does the ADHD test look like? I'm 21 and ...
4  k6efi  Are you guys good with maps and directions?  I...
Data Shape: (3019115, 2)


In [19]:
# Summarise each column of the combined frame with DataFrame.describe(), which includes
# the number of unique values per column.
combined_summary = df.describe()
print(combined_summary)

             id               body
count   3019094            3019115
unique  3019094            3018495
top       k348a  How old were you?
freq          1                  2


In [20]:
# Remove bodies that appear more than once across posts and comments, keeping the first occurrence.
df = df.drop_duplicates(subset=['body'], keep='first')
print("Data Shape:", df.shape)

Data Shape: (3018495, 2)


In [21]:
# Persist the combined, de-duplicated frame as CSV without the index column.
# NOTE(review): the inputs are read from '../data/...', while this writes to 'data/...'
# relative to the working directory. Confirm this is the intended location.
output_path = Path('data/adhd-final.csv')
# to_csv does not create missing directories, so make sure the target folder exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)