In [37]:
import math
import re
import string
import sys

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
stemmer = PorterStemmer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\73183\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
#file = open('StopWords.txt') 
#sw = file.read().splitlines()

def remove_punc(line):
    """Return ``line`` with every character of ``string.punctuation`` deleted."""
    # Mapping an ordinal to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return line.translate(drop_table)

def remove_url(line):
    """Delete every run of non-whitespace characters that starts with ``http``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', line)

def remove_digits(line):
    """Delete every run of ASCII digits (0-9) from ``line``."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', line)

def remove_mention(line):
    """Delete every ``@`` followed by a run of non-whitespace characters."""
    mention_pattern = re.compile(r'@\S+')
    return mention_pattern.sub('', line)

def cut_sentence(line):
    '''
    Tokenize a sentence and truncate it right after the first target word.

    The returned list holds every token up to AND including the first
    occurrence of "depression" or "depressed". The comparison ignores case, so
    a capitalised "Depression" (e.g. at the start of a sentence) is a cut
    point too; the tokens themselves keep their original casing.
    If no token equals a target word, all tokens are returned.
    '''
    targets = ("depression", "depressed")
    kept = []
    for word in word_tokenize(line):
        kept.append(word)
        # Compare on the lowercased token only; the stored token stays as-is.
        if word.lower() in targets:
            break
    return kept
    
def token(line):
    '''
    Tokenize and POS-tag the sentences of a tweet that mention a target word.

    Returns one list per kept sentence, each a list of (token, POS tag) tuples:
    [[(token, tag), ...], [(token, tag), ...]]

    Only sentences containing a target word, i.e. "depression" or "depressed",
    are kept. The check ignores case, so "Depression" at the start of a
    sentence also counts. Each kept sentence is passed through cut_sentence()
    before tagging.

    Mentions (@...), URLs (http...) and digits are removed first, and the
    typographic apostrophe (’) is replaced by a plain one (').

    note: POS tagging is case sensitive, i.e. "i" is NN (noun) and "I" is
    PRP (pronoun), so the text itself is never lowercased.

    An extra step could be added to normalise casing and correct misspellings.
    '''

    # preprocessing (no lowercasing and no punctuation stripping: both would
    # change what the sentence splitter and the POS tagger see)
    line = remove_mention(line)
    line = remove_url(line)
    line = remove_digits(line)
    line = re.sub(r'’', "'", line)  # some tweets use the curly apostrophe, which causes problems in nltk

    target = ("depression", "depressed")
    sentence_candi = []

    for sentence in sent_tokenize(line):
        # Match on a lowercased copy only; the original casing is what gets tagged.
        lowered = sentence.lower()
        if any(word in lowered for word in target):
            sentence_candi.append(nltk.pos_tag(cut_sentence(sentence)))

    return sentence_candi

In [13]:
token("im activating that day. I cannot handle my depression and jealousy")

[[('I', 'PRP'),
  ('can', 'MD'),
  ('not', 'RB'),
  ('handle', 'VB'),
  ('my', 'PRP$'),
  ('depression', 'NN')]]

In [9]:

def match_depress(tweet):
    '''
    Return the simplified version of a tweet as a list of strings, one per
    sentence returned by token().

    "Simplified" means:
    1. only sentences containing a target word (depression, depressed) are considered
    2. all words after the target word are removed
    3. all words whose tag is in discard_tags are removed; currently Determiner (DT),
       Adjective (JJ, JJR, JJS), Coordinating Conjunction (CC) and Adverb (RB)*
    4. the target word itself is always kept, whatever its tag (otherwise
       "depressed" would be thrown away whenever the tagger labels it JJ)

    Each simplified sentence is built right-to-left starting from the target
    word and ends with a trailing space.

    note: 1. The tags in discard_tags still need tuning. Noun (NN) might be a good candidate.
          2. A maximum number of iterations limits how many words are inspected, so that the
             simplified sentence can be recognised by the simple CFG defined in the next cell.
             It can be removed once a CFG that handles more general sentences is available.
          3. The same applies to the break on a pronoun; it can be removed once a
             satisfying CFG is available.

    *All adverbs are removed except "not" and "n't"; these are used to detect negation.

    The reversed tagged sentence and the growing buffer are printed as a debug trace.
    '''
    discard_tags = ["DT", "JJ", "JJR", "JJS", "CC", "RB"]  # words with these tags are discarded
    negations = ["not", "n't"]                 # kept even though they are tagged RB
    keywords = ["depression", "depressed"]     # kept even when their tag is in discard_tags
    pronoun_tags = ["PRP$", "PRP"]
    max_iteration = 5                          # maximum number of words inspected per sentence

    res = list()

    # for each sentence cut at the keyword
    for sentence in token(tweet):

        sentence = sentence[::-1]  # reverse the sentence to iterate backwards from the keyword
        print(sentence)

        word_buffer = ""

        for iteration, (text, tag) in enumerate(sentence):

            # case when the maximum number of iterations is reached
            if iteration == max_iteration:
                break

            keep = (
                tag not in discard_tags
                or text in negations
                or text.lower() in keywords
            )
            if keep:
                word_buffer = text + " " + word_buffer
                # a pronoun gives enough information, e.g. "my depression", "his depression"
                if tag in pronoun_tags:
                    break

            print(word_buffer)

        res.append(word_buffer)

    return res
        
    

In [14]:
#example to stop at pronoun
match_depress("im activating that day. I cannot handle my depression and jealousy")


[('depression', 'NN'), ('my', 'PRP$'), ('handle', 'VB'), ('not', 'RB'), ('can', 'MD'), ('I', 'PRP')]
depression 


['my depression ']

In [11]:
'''
Here is a simple context-free grammar (CFG) used to process the simplified sentence returned by match_depress().
If the simplified sentence can be parsed successfully by this CFG (i.e. a sentence structure is returned), it is related to depression.

Only words that appear as terminals (leaves) can be recognized; any other word will trigger a ValueError. That is why we need to simplify the sentence first.
The CFG here is a simple one, so complex sentences need to be simplified heavily to be recognized (see notes 2 and 3 in function match_depress).
This might lose useful information.

Possible optimizations:
1. enriching the words in the terminals (leaves)
2. implementing multiple CFGs according to multiple sentence structures, to fit general sentences.
   Once a sentence is accepted by one of them, it is related to depression

'''
# Toy grammar for the simplified sentences. Quoted symbols are terminals (words);
# unquoted symbols are nonterminals. 'handle' must be quoted: unquoted, it is
# read as a nonterminal with no productions, so the word "handle" would not be
# covered by the grammar.
# NOTE(review): Nom is not reachable from S (no other rule refers to it).
grammar2 = nltk.CFG.fromstring("""
  S  -> NP VP| NP
  NP -> PPRP N | N | PRP
  Nom -> Adj Nom | N
  VP -> V Adj | V NP | V PP | V VP | V TO VP
  PP -> P NP| P PP
  PRP -> 'i' 
  PPRP -> 'my' | 'it'
  N -> 'depression' | 'nightmare'
  Adj  -> 'depressed' | 'depressing'
  V ->  "'s"|"'m" | 'is' | 'was'  | 'have' | 'detected' | 'diagnosed' | 'feel' | 'feeling' | 'hate' | 'want' | 'go' | 'fail' | 'fell' | 'back' | 'do' | 'did' | 'been'| 'can' | 'handle'
  P -> 'on' | 'with' | 'as' | 'into' | 'like' | 'to' | 'since'
  TO -> 'to'
  """)

In [15]:
# Successful example: every token is a terminal of grammar2 and the sentence
# fits S -> NP VP, so one parse tree is printed.
sent = word_tokenize("I have depression ".lower())
rd_parser = nltk.ChartParser(grammar2)
for tree in rd_parser.parse(sent):
    print(tree)

(S (NP (PRP i)) (VP (V have) (NP (N depression))))


In [16]:
# Failed example: the token "n't" is not a terminal of grammar2, so the parser
# raises ValueError for the uncovered word.
sent = "I don't have depression "
sent = word_tokenize(sent.lower())
rd_parser = nltk.ChartParser(grammar2)
try:
    for tree in rd_parser.parse(sent):
        print(tree)
except ValueError as err:
    # The exception is the point of this cell: show it without aborting a
    # Restart & Run All of the notebook.
    print("ValueError:", err)

ValueError: Grammar does not cover some of the input words: '"n\'t"'.

In [17]:
# Failed example 2: both words are terminals of grammar2, but "i depression"
# matches no sentence structure, so the loop prints nothing.
sent = word_tokenize("I depression ".lower())
rd_parser = nltk.ChartParser(grammar2)
for tree in rd_parser.parse(sent):
    print(tree)

#Method 2

In [1]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\73183\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Build the sentiment analyzer once; the cells below reuse `sia`.
sia = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sia.polarity_scores("My depression")

{'neg': 0.787, 'neu': 0.213, 'pos': 0.0, 'compound': -0.5719}

In [5]:
sia.polarity_scores("feel depressed")

{'neg': 0.767, 'neu': 0.233, 'pos': 0.0, 'compound': -0.5106}

In [25]:
sia.polarity_scores("hopefully, depression is better understood and easier to talk about now as compared to the 90's")

{'neg': 0.154, 'neu': 0.498, 'pos': 0.349, 'compound': 0.5719}

In [27]:
sia.polarity_scores("Fighting Depression Was My Biggest Flex Last Year Weary face Do Yall Know How Hard It Is To Fight Everyday With Your Mind")

{'neg': 0.406, 'neu': 0.594, 'pos': 0.0, 'compound': -0.8834}

In [28]:
sia.polarity_scores("Battling depression nobody knows how I feel")

{'neg': 0.592, 'neu': 0.408, 'pos': 0.0, 'compound': -0.7003}

In [29]:
sia.polarity_scores("Me when my anxiety and depression hit hard.")

{'neg': 0.576, 'neu': 0.424, 'pos': 0.0, 'compound': -0.7003}

In [30]:
sia.polarity_scores("a 2022 sangi vlive could cure my depression")

{'neg': 0.381, 'neu': 0.619, 'pos': 0.0, 'compound': -0.5719}

In [31]:
sia.polarity_scores("Babe wake up, it's your daily itufashi depression") #this will make sense if the subject of depression is "my" not "your"

{'neg': 0.346, 'neu': 0.654, 'pos': 0.0, 'compound': -0.5719}

In [32]:
sia.polarity_scores("After the supreme being,another thing that I fear in life is depression") 

{'neg': 0.373, 'neu': 0.432, 'pos': 0.195, 'compound': -0.5106}

In [33]:
sia.polarity_scores("When anxiety & depression rule, they can lead you to live a smaller life") #not a good example

{'neg': 0.351, 'neu': 0.649, 'pos': 0.0, 'compound': -0.6597}

The idea is to first filter tweets by the keywords "depression" and "depressed", then cut out and run dependency analysis only on the sentences that contain a keyword, and finally analyze the dependency relations of the keyword.

In [1]:

import spacy
from spacy import displacy
!spacy download en_core_web_sm


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [33]:
# Load the spaCy pipeline 'en_core_web_sm' and parse one example tweet.
sp = spacy.load('en_core_web_sm')
# NOTE(review): "diagonized" looks like a misspelling of "diagnosed" — confirm
# whether it is intentional (e.g. to mimic noisy tweets) before relying on the parse.
tweet = "I was diagonized with depression"
content = sp(tweet)

In [34]:
displacy.render(content, style='dep', jupyter=True, options={'distance': 120})

In [28]:
# Tabulate, for each token of the parsed tweet, its text, its dependency label
# and the text of its syntactic head.
row = '{:<20} {:<20} {:<20}'
print(row.format('Token', 'Dependency', 'Head'))
# The loop variable is deliberately NOT named `token`: that name belongs to the
# token() function defined earlier in this notebook, and rebinding it here
# would break match_depress() on any later call.
for tok in content:
    print(row.format(str(tok.text), tok.dep_, tok.head.text))

Token                Dependency           Head                
Depression           nsubj                is                  
is                   ROOT                 is                  
popular              acomp                is                  
among                prep                 is                  
teenagers            pobj                 among               


First, get the head of the keyword:
(1) The head of the keyword is an AUX (auxiliary), e.g. "I'm depressed." or "He is depressed." Find the subject of the AUX. Pass if the subject is first person.