# Data Cleaning and NLP.ipynb

In [2]:
#Import all libraries used in this notebook
import pandas as pd 
import numpy as np
import regex as re
import time
import warnings
#warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
from bs4 import BeautifulSoup   
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix

# Import the Data

In [3]:
depanx = pd.read_csv('depanx.csv')

In [4]:
depanx.head()

Unnamed: 0,title,score,over_18,selftext,is_self,url,body,id,subreddit,top_rank
0,Depression for me is a constant feeling of wan...,7198,False,It’s a constant sense of wanting to go somewhe...,True,https://www.reddit.com/r/depression/comments/c...,It’s a constant sense of wanting to go somewhe...,cpidz8,depression,1
1,Depression is so much worse if you live with y...,6781,False,"I can't hide, I can't cry out loud, they don't...",True,https://www.reddit.com/r/depression/comments/c...,"I can't hide, I can't cry out loud, they don't...",c2fuf3,depression,2
2,A stranger just saved my life...,5946,False,I woke up today wanting to die. I was planning...,True,https://www.reddit.com/r/depression/comments/7...,I woke up today wanting to die. I was planning...,7k1nt2,depression,3
3,Shout out to the particular hell that is funct...,5531,False,"This is me. Don’t get me wrong, it’s better th...",True,https://www.reddit.com/r/depression/comments/c...,"This is me. Don’t get me wrong, it’s better th...",cd0hjp,depression,4
4,"Does anyone ever feel okay for a while, then a...",5054,False,Recently I thought I was beginning to feel bet...,True,https://www.reddit.com/r/depression/comments/b...,Recently I thought I was beginning to feel bet...,b7fxrm,depression,5


In [5]:
depanx.shape

(1972, 10)

In [6]:
#Looked for unnamed cols, there are none.

# Handling the Missing Values

In [7]:
depanx.isnull().mean()

title        0.000000
score        0.000000
over_18      0.000000
selftext     0.208925
is_self      0.000000
url          0.000000
body         0.208925
id           0.000000
subreddit    0.000000
top_rank     0.000000
dtype: float64

In [8]:
#All null values for "body" and "selftext"
depanx.loc[depanx['body'].isnull()]

Unnamed: 0,title,score,over_18,selftext,is_self,url,body,id,subreddit,top_rank
36,Does anybody else ever stay up late so they ca...,2754,False,,True,https://www.reddit.com/r/depression/comments/6...,,6zd0dq,depression,37
62,"I hope being dead is like sleeping,because im ...",2336,False,,True,https://www.reddit.com/r/depression/comments/d...,,db0bhz,depression,63
84,Depression is when you don't really care about...,2100,False,,True,https://www.reddit.com/r/depression/comments/4...,,4i203m,depression,85
87,Does anyone else get those little highs that l...,2084,False,,True,https://www.reddit.com/r/depression/comments/5...,,5cgdww,depression,88
99,I wish i could transfer all my remaining days ...,1974,False,,True,https://www.reddit.com/r/depression/comments/6...,,6svblf,depression,100
...,...,...,...,...,...,...,...,...,...,...
1946,Does anyone else wonder what type a life you w...,472,False,,True,https://www.reddit.com/r/Anxiety/comments/7nx3...,,7nx3i6,Anxiety,975
1949,Does anyone else think about every little mome...,471,False,,True,https://www.reddit.com/r/Anxiety/comments/6z5v...,,6z5v89,Anxiety,978
1954,Anxiety about the future,474,False,,False,http://imgur.com/JZ88Pwj,,1gieri,Anxiety,983
1959,What anxiety feels like. The Wall,467,False,,False,http://i.imgur.com/0otnPUU.jpg,,1vdhzu,Anxiety,988


In [9]:
#Looking at the above table i came to realisation and selftext and body are duplicates or rather different names for same feature. So, dropping 'selftext'.
depanx=depanx.drop(['selftext'], axis=1)

In [10]:
depanx.head()

Unnamed: 0,title,score,over_18,is_self,url,body,id,subreddit,top_rank
0,Depression for me is a constant feeling of wan...,7198,False,True,https://www.reddit.com/r/depression/comments/c...,It’s a constant sense of wanting to go somewhe...,cpidz8,depression,1
1,Depression is so much worse if you live with y...,6781,False,True,https://www.reddit.com/r/depression/comments/c...,"I can't hide, I can't cry out loud, they don't...",c2fuf3,depression,2
2,A stranger just saved my life...,5946,False,True,https://www.reddit.com/r/depression/comments/7...,I woke up today wanting to die. I was planning...,7k1nt2,depression,3
3,Shout out to the particular hell that is funct...,5531,False,True,https://www.reddit.com/r/depression/comments/c...,"This is me. Don’t get me wrong, it’s better th...",cd0hjp,depression,4
4,"Does anyone ever feel okay for a while, then a...",5054,False,True,https://www.reddit.com/r/depression/comments/b...,Recently I thought I was beginning to feel bet...,b7fxrm,depression,5


In [11]:
#Cretaing a data frame with rows with column= "body" with non - NAN entries

depanx=depanx[depanx.body.notnull()]

In [12]:
depanx


Unnamed: 0,title,score,over_18,is_self,url,body,id,subreddit,top_rank
0,Depression for me is a constant feeling of wan...,7198,False,True,https://www.reddit.com/r/depression/comments/c...,It’s a constant sense of wanting to go somewhe...,cpidz8,depression,1
1,Depression is so much worse if you live with y...,6781,False,True,https://www.reddit.com/r/depression/comments/c...,"I can't hide, I can't cry out loud, they don't...",c2fuf3,depression,2
2,A stranger just saved my life...,5946,False,True,https://www.reddit.com/r/depression/comments/7...,I woke up today wanting to die. I was planning...,7k1nt2,depression,3
3,Shout out to the particular hell that is funct...,5531,False,True,https://www.reddit.com/r/depression/comments/c...,"This is me. Don’t get me wrong, it’s better th...",cd0hjp,depression,4
4,"Does anyone ever feel okay for a while, then a...",5054,False,True,https://www.reddit.com/r/depression/comments/b...,Recently I thought I was beginning to feel bet...,b7fxrm,depression,5
...,...,...,...,...,...,...,...,...,...
1967,I hate it when people glorify anxiety,466,False,True,https://www.reddit.com/r/Anxiety/comments/6ug7...,I see these posts every week about how girls w...,6ug7hu,Anxiety,996
1968,Do phone calls upset anyone?,464,False,True,https://www.reddit.com/r/Anxiety/comments/6p86...,Just everything about them is unnerving.,6p86ib,Anxiety,997
1969,Does Anyone Else Get Serious Brain-Fog?,464,False,True,https://www.reddit.com/r/Anxiety/comments/6eqm...,"Honestly, sometimes it feels like I've lost al...",6eqmkb,Anxiety,998
1970,When i can't sleep at night due to my anxiety ...,463,False,True,https://www.reddit.com/r/Anxiety/comments/d2zv...,Knowing that i might just help even one person...,d2zvpi,Anxiety,999


In [13]:
depanx["text"]=depanx.apply(lambda x: x['title'] + x['body'], axis=1)

In [14]:
depanx.head()

Unnamed: 0,title,score,over_18,is_self,url,body,id,subreddit,top_rank,text
0,Depression for me is a constant feeling of wan...,7198,False,True,https://www.reddit.com/r/depression/comments/c...,It’s a constant sense of wanting to go somewhe...,cpidz8,depression,1,Depression for me is a constant feeling of wan...
1,Depression is so much worse if you live with y...,6781,False,True,https://www.reddit.com/r/depression/comments/c...,"I can't hide, I can't cry out loud, they don't...",c2fuf3,depression,2,Depression is so much worse if you live with y...
2,A stranger just saved my life...,5946,False,True,https://www.reddit.com/r/depression/comments/7...,I woke up today wanting to die. I was planning...,7k1nt2,depression,3,A stranger just saved my life...I woke up toda...
3,Shout out to the particular hell that is funct...,5531,False,True,https://www.reddit.com/r/depression/comments/c...,"This is me. Don’t get me wrong, it’s better th...",cd0hjp,depression,4,Shout out to the particular hell that is funct...
4,"Does anyone ever feel okay for a while, then a...",5054,False,True,https://www.reddit.com/r/depression/comments/b...,Recently I thought I was beginning to feel bet...,b7fxrm,depression,5,"Does anyone ever feel okay for a while, then a..."


# Checking for duplicates:

In [15]:
depanx.duplicated().sum() #No duplicates.


0

# Clean text

In [16]:
#reference:
# https://towardsdatascience.com/the-real-world-as-seen-on-twitter-sentiment-analysis-part-one-5ac2d06b63fb
# https://stackoverflow.com/questions/4328500/how-can-i-strip-all-punctuation-from-a-string-in-javascript-using-regex

In [17]:
def posts_to_words(raw_post):
    # Function to convert a raw post to a string of words
    # The input is a single string (a raw subreddit post), and 
    # the output is a single string (a preprocessed subreddit post)
    
    # 1. Remove HTML.
    post_text = BeautifulSoup(raw_post).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", post_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
 
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))


In [18]:
 depanx['text'].apply(lambda x : posts_to_words(x))


0       depression constant feeling wanting go home ma...
1       depression much worse live parents hide cry lo...
2       stranger saved life woke today wanting die pla...
3       shout particular hell functional depression ge...
4       anyone ever feel okay wave depression hits lik...
                              ...                        
1967    hate people glorify anxietyi see posts every w...
1968        phone calls upset anyone everything unnerving
1969    anyone else get serious brain fog honestly som...
1970    sleep night due anxiety scroll thread try repl...
1971    people w anxiety work customer service strong ...
Name: text, Length: 1560, dtype: object

In [19]:
depanx.shape

(1560, 10)

# NLP pre-processing and exploration

# lemmatize

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words = ''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words    

In [22]:
#Applying the sbove defined function to "text":
depanx['text']=depanx['text'].apply(lemmatize_words)

In [23]:
depanx.shape

(1560, 10)

In [24]:
# drop rows where text = ''
#Has been alredy cleaned but double checking the work here.

depanx = depanx[depanx['text'] != '']
depanx = depanx.reset_index(drop=True)


In [25]:
depanx.head()

Unnamed: 0,title,score,over_18,is_self,url,body,id,subreddit,top_rank,text
0,Depression for me is a constant feeling of wan...,7198,False,True,https://www.reddit.com/r/depression/comments/c...,It’s a constant sense of wanting to go somewhe...,cpidz8,depression,1,Depression for me is a constant feeling of wan...
1,Depression is so much worse if you live with y...,6781,False,True,https://www.reddit.com/r/depression/comments/c...,"I can't hide, I can't cry out loud, they don't...",c2fuf3,depression,2,Depression is so much worse if you live with y...
2,A stranger just saved my life...,5946,False,True,https://www.reddit.com/r/depression/comments/7...,I woke up today wanting to die. I was planning...,7k1nt2,depression,3,A stranger just saved my life...I woke up toda...
3,Shout out to the particular hell that is funct...,5531,False,True,https://www.reddit.com/r/depression/comments/c...,"This is me. Don’t get me wrong, it’s better th...",cd0hjp,depression,4,Shout out to the particular hell that is funct...
4,"Does anyone ever feel okay for a while, then a...",5054,False,True,https://www.reddit.com/r/depression/comments/b...,Recently I thought I was beginning to feel bet...,b7fxrm,depression,5,"Does anyone ever feel okay for a while, then a..."


In [26]:
depanx.shape
#So, there were no rows with text="".

(1560, 10)

In [27]:
depanx.to_csv('comments_clean.csv', index=False)


# CountVectorizer


In [28]:
# Target is 'subreddit'. Current values: 'depression', 'anxiety'.
# Need to transform to 0 or 1 values. Set depression = 0, anxiety= 1

depanx['target'] = depanx['subreddit'].map({'depression':0, 'Anxiety':1})
depanx.drop('subreddit', axis=1, inplace=True)
depanx.head()



Unnamed: 0,title,score,over_18,is_self,url,body,id,top_rank,text,target
0,Depression for me is a constant feeling of wan...,7198,False,True,https://www.reddit.com/r/depression/comments/c...,It’s a constant sense of wanting to go somewhe...,cpidz8,1,Depression for me is a constant feeling of wan...,0
1,Depression is so much worse if you live with y...,6781,False,True,https://www.reddit.com/r/depression/comments/c...,"I can't hide, I can't cry out loud, they don't...",c2fuf3,2,Depression is so much worse if you live with y...,0
2,A stranger just saved my life...,5946,False,True,https://www.reddit.com/r/depression/comments/7...,I woke up today wanting to die. I was planning...,7k1nt2,3,A stranger just saved my life...I woke up toda...,0
3,Shout out to the particular hell that is funct...,5531,False,True,https://www.reddit.com/r/depression/comments/c...,"This is me. Don’t get me wrong, it’s better th...",cd0hjp,4,Shout out to the particular hell that is funct...,0
4,"Does anyone ever feel okay for a while, then a...",5054,False,True,https://www.reddit.com/r/depression/comments/b...,Recently I thought I was beginning to feel bet...,b7fxrm,5,"Does anyone ever feel okay for a while, then a...",0


Most frequent "depression" words:

In [29]:
#To get most frequently used words: change max_features

count_vect=CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=True,
    stop_words="english",
    max_features = 35
    
)

# input for CountVectorizer is an array of strings
vector_input_dep = depanx[depanx['target'] == 0]['text']

# fit_transform the vectorizer
dep_words = count_vect.fit_transform(vector_input_dep)

# convert output to a Numpy array
dep_words = dep_words.toarray()

In [30]:
dep_words

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 4, ..., 0, 2, 3],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
# get the words
dep_word_list = count_vect.get_feature_names()
dep_word_list 

['better',
 'day',
 'depressed',
 'depression',
 'don',
 'edit',
 'feel',
 'feeling',
 'friend',
 'going',
 'good',
 'ha',
 'happy',
 'help',
 'just',
 'know',
 'life',
 'like',
 'll',
 'make',
 'people',
 'really',
 'say',
 'stop',
 'thank',
 'thing',
 'think',
 'thought',
 'time',
 've',
 'wa',
 'want',
 'way',
 'work',
 'year']

Most frequent "anxiety" words:

In [187]:
#Most frequent "depression" words:

count_vect=CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=True,
    stop_words="english",
    max_features = 35
    
)

# input for CountVectorizer is an array of strings
vector_input_anx=depanx[depanx['target']==1]['text']


#fit_transform the vectorizer
anx_words=count_vect.fit_transform(vector_input_anx)

# convert output to a Numpy array
anx_words=anx_words.toarray()


In [188]:
anx_words_list= count_vect.get_feature_names()

In [190]:
anx_words_list


['anxiety',
 'anxious',
 'day',
 'did',
 'does',
 'doing',
 'don',
 'edit',
 'feel',
 'feeling',
 'going',
 'good',
 'got',
 'ha',
 'help',
 'just',
 'know',
 'life',
 'like',
 'll',
 'make',
 'panic',
 'people',
 'really',
 'thing',
 'think',
 'thought',
 'time',
 'today',
 've',
 'wa',
 'want',
 'way',
 'work',
 'year']

# Edit Stop-Words

In [None]:
#There are still some non-meaning ful words in the "Most frequent words used for depression & Anxiety". Adding those to the stop word english dictionary.

In [193]:
from sklearn.feature_extraction import text
text.ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [195]:
add_stop_words=['did', 'don',  'going', 'got', 'ha', 'isn', 'wa','ll', 've']

In [196]:
stop_words=text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [197]:
stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

# depression vs Anxiety top 100 words!

# word/n-gram frequency:

In [202]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool.
# CountVectorizer transforms the body text from the reddit comments into features /words and creates columns  with word counts for each comment

#depression
count_vect=CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=True,
    stop_words="english",
    max_features = 35,
    ngram_range=(1, 3)
    
)

# input for CountVectorizer is an array of strings
vector_input_dep=depanx[depanx['target']==0]['text']


#fit_transform the vectorizer
dep_words=count_vect.fit_transform(vector_input_dep)

# convert output to a Numpy array
dep_words=dep_words.toarray()


In [203]:
dep_words_matrix=pd.DataFrame(dep_words, columns=count_vect.get_feature_names())

In [206]:
dep_words_matrix.sum().sort_values(ascending=False)

just          1142
feel           924
like           882
don            694
wa             682
want           587
people         549
life           504
depression     484
time           459
know           448
ve             429
day            408
feel like      378
really         329
make           328
thing          289
think          262
ha             237
friend         237
year           232
going          227
depressed      219
edit           207
feeling        195
help           186
way            178
happy          168
good           165
thought        165
work           161
better         161
say            158
stop           158
thank          158
dtype: int64

In [211]:
# Anxiety

# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool.
count_vect=CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_anx=depanx[depanx['target']==1]['text']

#fit_transform the vectorizer
anxiety_words=count_vect.fit_transform(vector_input_anx)

# convert output to a Numpy array
anxiety_words=anxiety_words.toarray()



In [212]:
anxiety_word_matrix=pd.DataFrame(anxiety_words, columns=count_vect.get_feature_names())

In [214]:
anxiety_word_matrix.sum().sort_values(ascending=False)


anxiety                            976
just                               744
like                               653
feel                               631
time                               442
                                  ... 
prescribed emergency                 1
prescribed emergency medication      1
prescribed medication                1
prescribed medication instances      1
post gotten bit                      1
Length: 10000, dtype: int64

# TF-IDF Vectorizer


In [215]:
#depression

#Instantiate the TFIDF Vectorizer
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

#fit_transform
dep_tf_words = tvec.fit_transform(vector_input_dep)

#into array of words
dep_tf_words = dep_tf_words.toarray()


dep_matrix = pd.DataFrame(dep_tf_words, columns=tvec.get_feature_names())

dep_matrix.sum().sort_values(ascending=False)

just                  38.306197
feel                  35.115579
like                  33.192872
want                  23.991713
depression            23.108524
                        ...    
worn cape              0.035454
worked curse           0.035454
worn                   0.035454
point thing just       0.035454
pretend touch cape     0.035454
Length: 10000, dtype: float64

In [217]:
#anxiety
#Instantiate the TFIDF Vectorizer
tvec=TfidfVectorizer(
                     analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3)
)


In [219]:
vector_input_anx=depanx[depanx['target']==1]['text']

#fit_transform
anx_tf_words=tvec.fit_transform(vector_input_anx)

#into array
anx_tf_words=anx_tf_words.toarray()

anx_matrix=pd.DataFrame(anx_tf_words, columns=tvec.get_feature_names())

anx_matrix.sum().sort_values(ascending=False)

anxiety                      29.093543
just                         24.147771
feel                         23.604969
like                         23.052387
day                          17.013008
                               ...    
pure maaaaaany                0.013184
practicing therapy            0.013184
practicing technique know     0.013184
process having learn          0.013184
reach benzo real              0.013184
Length: 10000, dtype: float64