In [1]:
import pandas as pd

### Load ADHD Women Posts Dataset

In [2]:
posts = pd.read_csv('../../data/datasets/reddit-adhd-dataset/adhdwomen.csv')

In [3]:
print(f"{posts.head()}")

                                         title  \
0         Adult Women Are the New Face of ADHD   
1           Why Women Hide Their ADHD Symptoms   
2  Adult ADHD and Burnout: Success or Failure?   
3            How Am I And My ADHD Still Alive?   
4         I'd like to see this subreddit grow!   

                                            selftext  score      id  \
0                                                NaN      3  29kaf8   
1                                                NaN      3  2ip2ra   
2                                                NaN      1  2q6jdk   
3                                                NaN      2  2sc7fa   
4  Hello, I'm a working, married, mother of 3.  I...      1  3296xx   

                                                 url  num_comments  \
0  http://www.thedailybeast.com/articles/2014/06/...             0   
1  https://euromd.com/9-diseases-and-conditions/1...             0   
2  http://rethinkadhd.wordpress.com/2014/12/23/ad...          

In [4]:
missing_values = posts.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
title                  0
selftext            5188
score                  0
id                     0
url                    0
num_comments           0
created_utc            0
created_datetime       0
dtype: int64


In [5]:
posts = posts.dropna(subset=['title', 'created_datetime'])
print(f"Missing values after removing rows with missing values in the 'title' and 'created_datetime' columns: \n{posts.isnull().sum()}")

Missing values after removing rows with missing values in the 'title' and 'created_datetime' columns: 
title                  0
selftext            5188
score                  0
id                     0
url                    0
num_comments           0
created_utc            0
created_datetime       0
dtype: int64


In [6]:
# join the title and the selftext in a single column if the selftext is not null or deleted
posts['body'] = posts['title']
posts.loc[posts['selftext'].notnull() & ~posts['selftext'].isin(['[deleted]']), 'body'] = posts['title'] + ' ' + posts['selftext']

In [7]:
posts['year'] = pd.to_datetime(posts['created_datetime']).dt.year

In [8]:
# drop the columns that are not needed
posts = posts.drop(['title', 'selftext', 'url', 'num_comments', 'created_utc', 'created_datetime'], axis=1)
print(f"{posts.head()}")
print(f"Data Shape: {posts.shape}")

   score      id                                               body  year
0      3  29kaf8               Adult Women Are the New Face of ADHD  2014
1      3  2ip2ra                 Why Women Hide Their ADHD Symptoms  2014
2      1  2q6jdk        Adult ADHD and Burnout: Success or Failure?  2014
3      2  2sc7fa                  How Am I And My ADHD Still Alive?  2015
4      1  3296xx  I'd like to see this subreddit grow! Hello, I'...  2015
Data Shape: (44384, 4)


In [9]:
missing_values = posts.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
score    0
id       0
body     0
year     0
dtype: int64


In [10]:
# print the number of unique values in each column
for col in posts.columns:
    print(f"Unique values in {col}: {posts[col].nunique()}")

Unique values in score: 1107
Unique values in id: 22237
Unique values in body: 22079
Unique values in year: 8


In [11]:
print(f"{posts.describe()}")

              score          year
count  44384.000000  44384.000000
mean      43.268520   2020.256286
std      167.266479      0.674461
min        0.000000   2014.000000
25%        2.000000   2020.000000
50%        5.000000   2020.000000
75%       17.000000   2021.000000
max     3724.000000   2021.000000


In [12]:
# we want to keep only the unique posts
posts = posts.drop_duplicates(subset='body')
print(f"Data Shape: {posts.shape}")

Data Shape: (22079, 4)


In [None]:
posts.to_csv('data/adhdwomen-posts.csv', index=False)

### Load ADHD Women Comments Dataset

In [14]:
comment = pd.read_csv('../../data/datasets/reddit-adhd-dataset/adhdwomen-comment.csv')

In [15]:
print(f"{comment.head()}")

                                                body       id  score  \
0  I'd like to see this sub be more active, too. ...  cqowxhs      1   
1  I've found people are more receptive when you ...  cvzg3v2      1   
2  Thank you so much. I have been trying to use m...  cw65vo8      1   
3                                          [deleted]  d2tscyn      1   
4  Sooooo, not sure why you were told it was 24 h...  d38enqz      1   

   created_utc     created_datetime  
0   1430023102  2015-04-26 04:38:22  
1   1444835103  2015-10-14 15:05:03  
2   1445326215  2015-10-20 07:30:15  
3   1462457040  2016-05-05 14:04:00  
4   1463457224  2016-05-17 03:53:44  


In [16]:
comment['year'] = pd.to_datetime(comment['created_datetime']).dt.year

In [17]:
comment = comment.drop(['created_utc', 'created_datetime'], axis=1)
print(f"{comment.head()}")
print(f"Data Shape: {comment.shape}")

                                                body       id  score  year
0  I'd like to see this sub be more active, too. ...  cqowxhs      1  2015
1  I've found people are more receptive when you ...  cvzg3v2      1  2015
2  Thank you so much. I have been trying to use m...  cw65vo8      1  2015
3                                          [deleted]  d2tscyn      1  2016
4  Sooooo, not sure why you were told it was 24 h...  d38enqz      1  2016
Data Shape: (202658, 4)


In [18]:
# print the number of unique values in each column
print(comment.describe())

               score           year
count  202658.000000  202658.000000
mean        2.064143    2020.221363
std         5.352811       0.634876
min       -31.000000    2015.000000
25%         1.000000    2020.000000
50%         1.000000    2020.000000
75%         2.000000    2021.000000
max       556.000000    2021.000000


In [19]:
# get the number of null values in body column
missing_values = comment.isnull().sum()
print(f"Missing values: \n{missing_values}")

Missing values: 
body     0
id       0
score    0
year     0
dtype: int64


In [20]:
# drop all rows that arent unique in "body"
comment = comment.drop_duplicates(subset='body')
print(f"Data Shape: {comment.shape}")

Data Shape: (195967, 4)


In [21]:
print(comment.describe())

               score           year
count  195967.000000  195967.000000
mean        2.091714    2020.214822
std         5.436163       0.635008
min       -30.000000    2015.000000
25%         1.000000    2020.000000
50%         1.000000    2020.000000
75%         2.000000    2021.000000
max       556.000000    2021.000000


In [None]:
comment.to_csv('data/adhdwomen-comments.csv', index=False)

### Concatenate the Two Datasets

In [23]:
# concatanate the two dataframes
df = pd.concat([posts, comment], ignore_index=True)
print(f"{df.head()}")
print(f"Data Shape: {df.shape}")

   score      id                                               body  year
0      3  29kaf8               Adult Women Are the New Face of ADHD  2014
1      3  2ip2ra                 Why Women Hide Their ADHD Symptoms  2014
2      1  2q6jdk        Adult ADHD and Burnout: Success or Failure?  2014
3      2  2sc7fa                  How Am I And My ADHD Still Alive?  2015
4      1  3296xx  I'd like to see this subreddit grow! Hello, I'...  2015
Data Shape: (218046, 4)


In [24]:
# print the number of unique values in each column
print(df.describe())

               score           year
count  218046.000000  218046.000000
mean        6.211052    2020.219009
std        53.987378       0.639312
min       -30.000000    2014.000000
25%         1.000000    2020.000000
50%         1.000000    2020.000000
75%         2.000000    2021.000000
max      3230.000000    2021.000000


In [25]:
# drop all rows that arent unique in "body"
df = df.drop_duplicates(subset='body')
print(f"Data Shape: {df.shape}")

Data Shape: (217985, 4)


In [None]:
df.to_csv('data/adhdwomen-final.csv', index=False)