In [29]:
import re

import nltk
import pandas as pd
from contractions import fix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [30]:
# Load the raw Reddit posts and comments from CSV.
posts_df = pd.read_csv("reddit_posts.csv")
comments_df = pd.read_csv("reddit_comments.csv")

# Preview the first rows of each frame.
for preview_frame in (posts_df, comments_df):
    print(preview_frame.head())

  subreddit                                              title  \
0  pregnant                             Out of control snoring   
1  pregnant                                 Pregnancy hormones   
2  pregnant                             Biological father test   
3  pregnant  People who say they loved being pregnant must ...   
4  pregnant                                            Zofran?   

                                                text   created_utc       id  
0  I snore. I always have, ever since I was a kid...  1.739371e+09  1ins3li  
1  In my third trimester and feeling extra clingy...  1.739371e+09  1ins3a5  
2  There is a possibility my pregnancy is caused ...  1.739364e+09  1inprew  
3  I’m only 5 and a half weeks pregnant at this p...  1.739370e+09  1inrzzd  
4  I'm not even 8 weeks, and I'm barely holding i...  1.739370e+09  1inrz7w  
   post_id                                            comment
0  1ins3li  \n\nWelcome to /r/pregnant! This is a space fo...
1  1ins3a5 

In [31]:
# Load the news articles from CSV.
news_df = pd.read_csv('news_data.csv')

# Data Cleaning

## Inspect for missing values

In [32]:
# Count missing values per column in the posts frame.
posts_df.isna().sum()

subreddit       0
title           0
text           71
created_utc     0
id              0
dtype: int64

In [33]:
# Keep only the posts whose `text` field is present.
has_text = posts_df["text"].notna()
posts_df = posts_df[has_text]

In [34]:
# Count missing values per column in the comments frame.
comments_df.isna().sum()

post_id    0
comment    0
dtype: int64

In [35]:
# Count missing values per column in the news frame.
news_df.isna().sum()

title           0
description     0
content         0
published_at    0
source          0
url             0
dtype: int64

## Check for duplicate posts or comments

In [36]:
# Number of fully duplicated post rows (first occurrence is not counted).
posts_df.duplicated(keep="first").sum()

0

In [37]:
# Number of fully duplicated comment rows (first occurrence is not counted).
comments_df.duplicated(keep="first").sum()

85

In [38]:
# Drop repeated comment rows, keeping the first occurrence of each.
comments_df = comments_df.drop_duplicates(keep="first")

In [39]:
# Re-count duplicated comment rows after dropping them.
comments_df.duplicated(keep="first").sum()

0

In [40]:
# Number of fully duplicated news rows (first occurrence is not counted).
news_df.duplicated(keep="first").sum()

0

## Handling Special Characters
We will keep emojis in the posts and comments since they may contribute to sentiment. Some sentiment analysis packages like VADER detect emojis, so we will want to use a package that acknowledges them when we get to the analysis step.

## Cleaning the text
- convert to lowercase
- remove links
- remove punctuation
- remove control characters
- remove extra spaces

In [41]:
# Anything starting with "http" up to the next whitespace is treated as a link.
_URL_PATTERN = re.compile(r"http\S+")

# Everything that is NOT a word character, whitespace, or emoji is stripped.
# The emoji ranges cover more than the Emoticons block (U+1F600-U+1F64F) so
# that hearts, gestures, pregnancy/baby pictographs, flags, etc. survive too.
# Non-whitespace control characters fall outside every kept range, so this
# pattern removes them as well.
_SYMBOL_PATTERN = re.compile(
    r"[^\w\s"
    r"\u200D\uFE0F"            # zero-width joiner / variation selector used inside emoji sequences
    r"\u2600-\u27BF"           # Miscellaneous Symbols and Dingbats
    r"\U0001F1E6-\U0001F1FF"   # regional indicators (flags)
    r"\U0001F300-\U0001FAFF"   # pictographs, emoticons, transport, supplemental symbols
    r"]"
)

_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a piece of free text for downstream analysis.

    Steps, in order: lowercase, remove links, remove punctuation / symbols /
    control characters (keeping word characters and emoji), then collapse
    every run of whitespace to a single space and trim the ends.

    Whitespace is collapsed last so that deleting a link or a stand-alone
    punctuation token cannot leave double spaces behind.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example a NaN from a missing
        cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ""

    # make lowercase for consistency
    text = text.lower()
    text = _URL_PATTERN.sub("", text)      # remove URLs
    text = _SYMBOL_PATTERN.sub("", text)   # keep words, whitespace and emojis
    text = _WHITESPACE_PATTERN.sub(" ", text)  # remove extra spaces
    return text.strip()

# Clean the comment bodies, then the title and body of every post.
comments_df["comment"] = comments_df["comment"].apply(clean_text)
for column in ("title", "text"):
    posts_df[column] = posts_df[column].apply(clean_text)

In [42]:
# Preview the first five cleaned posts.
posts_df.head(5)

Unnamed: 0,subreddit,title,text,created_utc,id
0,pregnant,out of control snoring,i snore i always have ever since i was a kid u...,1739371000.0,1ins3li
1,pregnant,pregnancy hormones,in my third trimester and feeling extra clingy...,1739371000.0,1ins3a5
2,pregnant,biological father test,there is a possibility my pregnancy is caused ...,1739364000.0,1inprew
3,pregnant,people who say they loved being pregnant must ...,im only 5 and a half weeks pregnant at this po...,1739370000.0,1inrzzd
4,pregnant,zofran,im not even 8 weeks and im barely holding it t...,1739370000.0,1inrz7w


In [43]:
# Clean every free-text column of the news frame.
for column in ('title', 'description', 'content'):
    news_df[column] = news_df[column].apply(clean_text)

## Remove Moderator Messages and Other Irrelevant Comments
These comments can pollute the results because they appear frequently in the subreddits, but they do not provide any context about the post itself. They are auto-generated and should be removed from our analysis.

In [44]:
# Row/column count of the comments frame at this point in the notebook.
comments_df.shape

(33737, 2)

In [45]:
# Phrases used to recognise auto-generated moderator comments.
# NOTE(review): "welcome to" is a broad phrase and may also match genuine
# user comments — confirm the removed rows are really moderator messages.
moderation_keywords = ["welcome to", "please read the rules", "your post has been removed"]

# Escape each phrase so it is always matched literally, even if a keyword
# containing regex metacharacters (".", "?", "(", ...) is added later.
moderation_pattern = "|".join(re.escape(keyword) for keyword in moderation_keywords)

# Drop every comment containing any of the phrases; missing comments count as "no match".
is_moderation = comments_df["comment"].str.contains(moderation_pattern, case=False, na=False)
comments_df = comments_df[~is_moderation]

In [46]:
# Row/column count of the comments frame at this point in the notebook.
comments_df.shape

(33200, 2)

## Handling Slang, Contractions, Stop Words, etc.

The `contractions` Python package expands both contractions and acronyms.

In [47]:
# Expand contractions in the comment bodies and in each post's title and text.
comments_df["comment"] = comments_df["comment"].apply(fix)
for column in ("title", "text"):
    posts_df[column] = posts_df[column].apply(fix)

In [48]:
# Expand contractions in every free-text column of the news frame.
for column in ('title', 'description', 'content'):
    news_df[column] = news_df[column].apply(fix)

In [49]:
# Fetch NLTK's stop-word corpus and build a set of the English entries
# for fast membership tests.
# NOTE(review): the stock English list is expected to include negations such
# as "not" and "no"; removing them can change sentiment — confirm this is intended.
nltk.download("stopwords")
english_stop_word_list = stopwords.words("english")
stop_words = set(english_stop_word_list)

def remove_stopwords(text, stop_word_set=None):
    """Return `text` with stop words removed.

    The text is split on whitespace and the remaining tokens are re-joined
    with single spaces. Tokens are compared case-insensitively, so a
    capitalised token (which an upstream step such as contraction expansion
    may reintroduce) is still matched against a lowercase stop-word list.
    Surviving tokens keep their original casing.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_word_set : collection of str, optional
        Lowercase stop words to remove. Defaults to the notebook-level
        `stop_words` set.

    Returns
    -------
    str
        The filtered text.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    return " ".join(word for word in text.split() if word.lower() not in stop_word_set)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smcdougall/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
# Strip stop words from the comment bodies and from each post's title and text.
comments_df["comment"] = comments_df["comment"].apply(remove_stopwords)
for column in ("title", "text"):
    posts_df[column] = posts_df[column].apply(remove_stopwords)

In [51]:
# Strip stop words from every free-text column of the news frame.
for column in ('title', 'description', 'content'):
    news_df[column] = news_df[column].apply(remove_stopwords)

In [52]:
# lemmatization
# WordNetLemmatizer comes from the notebook's import cell so that all
# dependencies are declared in one place.

# Fetch the WordNet corpus the lemmatizer relies on, then build one shared instance.
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces.

    `lemmatizer.lemmatize` is called without a part-of-speech argument, so
    the lemmatizer's default part of speech applies to every token.
    """
    tokens = text.split()
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/smcdougall/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
# Lemmatize the comment bodies and each post's title and text.
comments_df["comment"] = comments_df["comment"].apply(lemmatize_text)
for column in ("title", "text"):
    posts_df[column] = posts_df[column].apply(lemmatize_text)

In [54]:
# Lemmatize every free-text column of the news frame.
for column in ('title', 'description', 'content'):
    news_df[column] = news_df[column].apply(lemmatize_text)

In [55]:
# Keep only comments whose word count lies in [MIN_COMMENT_WORDS, MAX_COMMENT_WORDS]
# (both ends inclusive). Very short comments are dropped because we can't do a
# proper sentiment analysis on them; comments above the upper bound are dropped too.
MIN_COMMENT_WORDS = 3
MAX_COMMENT_WORDS = 1000

comment_word_counts = comments_df["comment"].str.split().str.len()
within_length_bounds = comment_word_counts.between(MIN_COMMENT_WORDS, MAX_COMMENT_WORDS)
comments_df = comments_df[within_length_bounds]

In [56]:
# Row/column count of the comments frame at this point in the notebook.
comments_df.shape

(31830, 2)

In [57]:
# Preview the first five fully processed comments.
comments_df.head(5)

Unnamed: 0,post_id,comment
4,1inrzzd,people bad symptom 10 week tomorrow barely bad...
5,1inrzzd,actually like pregnant still 7 week 4 day seco...
6,1inrzzd,20 week love sure first trimester hard already...
7,1inrzzd,really rough first trimester nausea exhaustion...
8,1inrzzd,every pregnancy different often get better sec...


In [58]:
# Row/column count of the comments frame at this point in the notebook.
comments_df.shape

(31830, 2)

In [59]:
# Write each cleaned frame to its own CSV file, omitting the index column.
cleaned_outputs = (
    (comments_df, "reddit_comments_cleaned.csv"),
    (posts_df, "reddit_posts_cleaned.csv"),
    (news_df, "news_cleaned.csv"),
)
for cleaned_frame, output_path in cleaned_outputs:
    cleaned_frame.to_csv(output_path, index=False)