In [30]:
import pandas as pd

### Load ADHD Posts Dataset

In [31]:
posts = pd.read_csv('../../data/datasets/reddit-adhd-dataset/ADHD.csv', low_memory=False)

In [32]:
print(f"{posts.head()}")

                                               title  \
0          Android app to strengthen attention/focus   
1  Does anyone here have experience with Imipramine?   
2  New study shows that for people with ADHD, \n"...   
3                 What does the ADHD test look like?   
4       Are you guys good with maps and directions?    

                                            selftext score     id  \
0  Hey /r/ADHD,\n\nCheck out my simple Android ap...     6  k348a   
1  My doctor has suggested it for ADD with anxiet...     4  k3gdz   
2                                                NaN    22  k4q79   
3  I'm 21 and didn't know about ADHD until recent...     6  k5fvd   
4  It seems like one thing I've always been reall...     0  k6efi   

                                                 url  num_comments  \
0  https://www.reddit.com/r/ADHD/comments/k348a/a...           7.0   
1  https://www.reddit.com/r/ADHD/comments/k3gdz/d...           1.0   
2  http://www.utexas.edu/features/2011

In [33]:
missing_values = posts.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
title                  1
selftext            5143
score                 21
id                    21
url                   21
num_comments          21
created_utc           21
created_datetime      30
dtype: int64


In [34]:
posts = posts.dropna(subset=['title', 'created_datetime'])
print(f"Missing values after removing rows with missing values in the 'title' and 'created_datetime' columns: \n{posts.isnull().sum()}")

Missing values after removing rows with missing values in the 'title' and 'created_datetime' columns: 
title                  0
selftext            5131
score                  0
id                     0
url                    0
num_comments           0
created_utc            0
created_datetime       0
dtype: int64


In [35]:
# join the title and the selftext in a single column if the selftext is not null or deleted
posts['body'] = posts['title']
posts.loc[posts['selftext'].notnull() & ~posts['selftext'].isin(['[deleted]', '[removed]']), 'body'] = posts['title'] + ' ' + posts['selftext']

In [36]:
posts['year'] = pd.to_datetime(posts['created_datetime']).dt.year

In [37]:
# drop the columns that are not needed
posts = posts.drop(['title', 'selftext', 'url', 'num_comments', 'created_utc', 'created_datetime'], axis=1)
print(f"{posts.head()}")
print(f"Data Shape: {posts.shape}")

  score     id                                               body  year
0     6  k348a  Android app to strengthen attention/focus Hey ...  2011
1     4  k3gdz  Does anyone here have experience with Imiprami...  2011
2    22  k4q79  New study shows that for people with ADHD, \n"...  2011
3     6  k5fvd  What does the ADHD test look like? I'm 21 and ...  2011
4     0  k6efi  Are you guys good with maps and directions?  I...  2011
Data Shape: (336036, 4)


In [38]:
missing_values = posts.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
score    0
id       0
body     0
year     0
dtype: int64


In [39]:
# print the number of unique values in each column
for col in posts.columns:
    print(f"Unique values in {col}: {posts[col].nunique()}")

Unique values in score: 2012
Unique values in id: 336036
Unique values in body: 330693
Unique values in year: 11


In [40]:
print(f"{posts.describe()}")

                year
count  336036.000000
mean     2018.201827
std         2.084316
min      2011.000000
25%      2017.000000
50%      2019.000000
75%      2020.000000
max      2021.000000


In [41]:
# we want to keep only the unique posts
posts = posts.drop_duplicates(subset='body')
print(f"Data Shape: {posts.shape}")

Data Shape: (330693, 4)


In [42]:
posts.to_csv('../data/adhd-posts.csv', index=False)

### Load ADHD Comments Dataset

In [43]:
comment = pd.read_csv('../../data/datasets/reddit-adhd-dataset/ADHD-comment.csv')

In [44]:
print(f"{comment.head()}")

                                                body       id  score  \
0                                          [deleted]  c08otkh    1.0   
1  If I try to look this up right now I will get ...  c09y8qz    2.0   
2  potassium is used as the thing that stops your...  c09yia6    2.0   
3  I've love a link to anything about this.  \n\n...  c0a81e6    3.0   
4  I don't know anything specific, but I would *d...  c0aixrg    2.0   

    created_utc     created_datetime  
0  1.239042e+09  2009-04-06 18:18:07  
1  1.243790e+09  2009-05-31 17:08:19  
2  1.243815e+09  2009-06-01 00:07:50  
3  1.244752e+09  2009-06-11 20:25:36  
4  1.245813e+09  2009-06-24 03:04:51  


In [45]:
comment['year'] = pd.to_datetime(comment['created_datetime']).dt.year

In [46]:
comment = comment.drop(['created_utc', 'created_datetime'], axis=1)
print(f"{comment.head()}")
print(f"Data Shape: {comment.shape}")

                                                body       id  score  year
0                                          [deleted]  c08otkh    1.0  2009
1  If I try to look this up right now I will get ...  c09y8qz    2.0  2009
2  potassium is used as the thing that stops your...  c09yia6    2.0  2009
3  I've love a link to anything about this.  \n\n...  c0a81e6    3.0  2009
4  I don't know anything specific, but I would *d...  c0aixrg    2.0  2009
Data Shape: (3356541, 4)


In [47]:
# print the number of unique values in each column
print(comment.describe())

              score          year
count  3.356541e+06  3.356541e+06
mean   2.517950e+00  2.017774e+03
std    1.003889e+01  2.320637e+00
min   -1.580000e+02  2.009000e+03
25%    1.000000e+00  2.016000e+03
50%    1.000000e+00  2.018000e+03
75%    2.000000e+00  2.020000e+03
max    4.145000e+03  2.021000e+03


In [48]:
# get the number of null values in body column
missing_values = comment.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
body     3
id       0
score    0
year     0
dtype: int64


In [49]:
# remove the rows with missing values in the 'body' column and that have the value '[deleted]' or '[removed]' in the 'body' column
comment = comment.dropna(subset=['body'])
comment = comment[~comment['body'].isin(['[deleted]', '[removed]'])]
print(f"Data Shape: {comment.shape}")

Data Shape: (3229941, 4)


In [50]:
print(comment.describe())

              score          year
count  3.229941e+06  3.229941e+06
mean   2.562633e+00  2.017817e+03
std    1.020624e+01  2.303927e+00
min   -1.580000e+02  2.009000e+03
25%    1.000000e+00  2.016000e+03
50%    1.000000e+00  2.018000e+03
75%    2.000000e+00  2.020000e+03
max    4.145000e+03  2.021000e+03


In [51]:
# drop all rows that arent unique in "body"
comment = comment.drop_duplicates(subset='body')
print(f"Data Shape: {comment.shape}")

Data Shape: (2688393, 4)


In [52]:
comment.to_csv('../data/adhd-comments.csv', index=False)

### Concatenate the Two Datasets

In [53]:
# concatanate the two dataframes
df = pd.concat([posts, comment], ignore_index=True)
print(f"{df.head()}")
print(f"Data Shape: {df.shape}")

  score     id                                               body  year
0     6  k348a  Android app to strengthen attention/focus Hey ...  2011
1     4  k3gdz  Does anyone here have experience with Imiprami...  2011
2    22  k4q79  New study shows that for people with ADHD, \n"...  2011
3     6  k5fvd  What does the ADHD test look like? I'm 21 and ...  2011
4     0  k6efi  Are you guys good with maps and directions?  I...  2011
Data Shape: (3019086, 4)


In [54]:
# print the number of unique values in each column
print(df.describe())

               year
count  3.019086e+06
mean   2.018040e+03
std    2.284300e+00
min    2.009000e+03
25%    2.017000e+03
50%    2.019000e+03
75%    2.020000e+03
max    2.021000e+03


In [55]:
# drop all rows that arent unique in "body"
df = df.drop_duplicates(subset='body')
print(f"Data Shape: {df.shape}")

Data Shape: (3018466, 4)


In [56]:
df.to_csv('../data/adhd-final.csv', index=False)