In [31]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...


True

In [2]:
# # condense into one df 
df1 = pd.read_csv('Datasets/aspergers.csv', parse_dates=True)
df2 = pd.read_csv('Datasets/depression.csv', parse_dates=True)
df3 = pd.read_csv('Datasets/ocd.csv', parse_dates=True)
df4 = pd.read_csv('Datasets/ptsd.csv', parse_dates=True)
df5 = pd.read_csv('Datasets/adhd.csv', parse_dates=True)

df = pd.concat([df1, df2, df3, df4,df5], ignore_index=True)


In [3]:
df.shape

(151288, 10)

In [4]:
## remove all rows with missing values
df = df.dropna()

In [5]:
df.shape

(149679, 10)

### Check "body" column

In [6]:
df["body"].value_counts()

body
[deleted]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

## Preprocessing the text used for Sentiment Analysis model

In [7]:
## Remove "[deleted]" or "[removed]"

df = df[~df["body"].isin(["[deleted]", "[removed]"])]

In [8]:
df.shape

(87078, 10)

In [9]:
# remove rows with very short responses, less than 3 characters
df = df[df["body"].str.len() > 2]


In [10]:
df.shape

(86899, 10)

In [11]:
## remove URL
df['body'] = df['body'].replace(to_replace=r'http\S+|www.\S+', value='', regex=True)


In [12]:
df["body"].value_counts()

body
[deleted]\n\n[View Poll](                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [13]:
## continue cleaning

# remove specific structured placeholders
df['body'] = df['body'].replace(to_replace=r'\[deleted\]\n\n\[View Poll\]', value='', regex=True)

# remove markdown and keep only the descriptive text
df['body'] = df['body'].replace(to_replace=r'\[([^\]]+)\]\([^)]*\)', value=r'\1', regex=True)



df['body'] = df['body'].str.strip()  #whitespace
df = df[df['body'] != '']  # remove empty rows


In [14]:
df.shape

(86868, 10)

In [15]:
df["body"].value_counts()

body
(                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [16]:
## further cleaning
# remove more specific structured placeholders
df['body'] = df['body'].replace(to_replace=r'\[removed\].*?\[View Poll\]\(', value='', regex=True)
df['body'] = df['body'].replace(to_replace=r'\[.*?\]\(.*?\)', value='', regex=True)  
df['body'] = df['body'].replace(to_replace=r'\[|\]', value='', regex=True)  

# remove entries that only consist of punctuation or are extremely short after previous cleanings
df['body'] = df['body'].replace(to_replace=r'^\s*\W*\s*$', value='', regex=True)


df['body'] = df['body'].str.strip()  # remove whitespace
df = df[df['body'] != '']  # remove empty rows


In [17]:
df.shape

(86762, 10)

In [18]:
df["body"].value_counts()

body
Here's last week's thread to post the weekly threads, to ensure the community gets what it feels it needs.\n\n**So, continuing with the theme... /r/aspergers, How is your week going so far?** :)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [22]:
# Normalize whitespace by replacing multiple spaces with a single space
df['body'] = df['body'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())


In [27]:
# convert text data to lowercase
df['body'] = df['body'].str.lower()


In [29]:
# save only the preprocessed 'body' column to a new CSV file
df['body'].to_csv('Datasets/preprocessed_texts.csv', index=False, header=True)


## Further Preprocessing the Text for Classification Model

In [30]:
## reduce datasize by randomly sample a subset data
sampled_df = df.sample(n=20000, random_state=1)

In [34]:
nlp = spacy.load('en_core_web_sm')
# preprocessing function:
def preprocess_text(text):
   
    # remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # tokenize and lemmatize the text
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stopwords.words('english')]
    # join the tokens back into a string
    return ' '.join(tokens)

In [35]:
sampled_df['preprocessed_text'] = sampled_df['body'].apply(preprocess_text)

In [36]:
sampled_df = sampled_df.drop('body', axis=1)

In [38]:
sampled_df.head(1)

Unnamed: 0,author,created_utc,id,num_comments,score,subreddit,title,upvote_ratio,url,preprocessed_text
130939,beegpotatatos,2021-11-16T17:50:37.000Z,qvdkud,6,2,ADHD,How has being diagnosed helped you? apart from...,1.0,https://www.reddit.com/r/ADHD/comments/qvdkud/...,hello undiagnosed strongly feel add want get t...


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)

# apply it to preprocessed text
tfidf_matrix = tfidf_vect.fit_transform(sampled_df['preprocessed_text'])

In [40]:
# display the shape of the matrix
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

Shape of tfidf_matrix: (20000, 1000)


In [41]:
from scipy import sparse

# save tfidf_matrix
sparse.save_npz("Datasets/tfidf_matrix.npz", tfidf_matrix)