In [6]:
import math
import re
import string
import sys

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
# Fetch the POS-tagger model by name; nltk.pos_tag is called later in token().
# NOTE(review): the recorded output of this cell shows the download failing with a
# network error, so the model was presumably already installed locally -- confirm.
nltk.download('averaged_perceptron_tagger')
# Porter stemmer instance; not referenced by any cell visible in this part of the notebook.
stemmer = PorterStemmer()

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [WinError 10054] An existing connection was forcibly
[nltk_data]     closed by the remote host>


In [7]:
#file = open('StopWords.txt') 
#sw = file.read().splitlines()

def remove_punc(line):
    '''
    Return `line` with every character found in string.punctuation removed.
    All other characters (letters, digits, whitespace, non-ASCII symbols) are kept.
    '''
    punctuation_chars = set(string.punctuation)
    return "".join(ch for ch in line if ch not in punctuation_chars)

def remove_url(line):
    '''
    Return `line` with every run of non-whitespace characters that starts with
    "http" deleted. Surrounding whitespace is left as it is.
    '''
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', line)

def remove_digits(line):
    '''
    Return `line` with every run of the ASCII digits 0-9 deleted.
    '''
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', line)

def remove_mention(line):
    '''
    Return `line` with every "@" and the non-whitespace characters that follow it
    deleted (e.g. Twitter-style @mentions).
    '''
    mention_pattern = re.compile(r'@\S+')
    return mention_pattern.sub('', line)

def cut_sentence(line):
    '''
    Tokenize `line` with word_tokenize and truncate the token list right after
    the first token that is exactly "depression" or "depressed".

    The target token itself is kept as the last element. If no token matches,
    all tokens of the line are returned. The comparison is case sensitive.
    '''
    tokens = word_tokenize(line)
    for position, current in enumerate(tokens):
        if current in ["depression", "depressed"]:
            # keep everything up to and including the target word
            return tokens[:position + 1]
    return list(tokens)
    
def token(line):
    '''
    Tokenize and POS-tag the relevant parts of a tweet.

    Only sentences that contain a target word ("depression" or "depressed") are
    kept. Each kept sentence is cut right after the target word (see
    cut_sentence) and the remaining tokens are POS-tagged.

    Returns a list with one entry per kept sentence; each entry is the list of
    (token, POS tag) pairs produced by nltk.pos_tag.

    note: POS tagging is case sensitive, i.e. "i" is NN (noun) and "I" is PRP (pronoun),
    which is why the text is not lower-cased here.

    An extra step could be added to check the case of words and correct misspellings.
    '''

    # preprocessing: strip mentions, then URLs, then digits
    cleaned = remove_digits(remove_url(remove_mention(line)))
    # normalize the typographic apostrophe to a plain one
    cleaned = re.sub(r'’', "'", cleaned)

    target = ["depression", "depressed"]

    # plain substring test on the raw sentence text (case sensitive)
    return [
        nltk.pos_tag(cut_sentence(sentence))
        for sentence in sent_tokenize(cleaned)
        if any(word in sentence for word in target)
    ]

In [13]:
# Demo: only the second sentence contains a target word, so only that sentence is
# tokenized, cut right after "depression", and POS-tagged.
token("im activating that day. I cannot handle my depression and jealousy")

[[('I', 'PRP'),
  ('can', 'MD'),
  ('not', 'RB'),
  ('handle', 'VB'),
  ('my', 'PRP$'),
  ('depression', 'NN')]]

In [9]:

def match_depress(tweet):
    '''
    Return a simplified version of the tweet: one string per sentence that
    contains a target word ("depression" / "depressed").

    How a sentence is simplified:
    1. token() keeps only sentences containing a target word and drops every
       word after the target word.
    2. The tagged words are scanned backwards, starting at the target word.
    3. Words whose tag is in discard_tags are dropped: Determiner (DT),
       Adjective (JJ, JJR, JJS), Coordinating Conjunction (CC) and Adverb (RB).
       "not" and "n't" are always kept, because they are used to detect negation.
    4. The scan stops after 5 scanned words, or as soon as a pronoun
       (PRP / PRP$) has been kept, e.g. "my depression", "his depression".

    Each returned string is the kept words in their original order, every word
    followed by a single space (so the string ends with a trailing space).
    The reversed tagged sentence and the intermediate buffers are printed as
    progress output.

    note: 1. The tags in discard_tags still need tuning; discarding nouns (NN)
             might be a good idea.
          2. The cap on scanned words keeps the result simple enough for the
             small CFG defined in the next cell. It can be removed once a
             grammar that handles more general sentences is available.
          3. The same holds for stopping at a pronoun.
    '''
    discard_tags = ["DT", "JJ", "JJR", "JJS", "CC", "RB"]  # words with these tags are discarded
    negation_words = ["not", "n't"]
    pronoun_tags = ["PRP$", "PRP"]
    max_scanned = 5

    simplified = []

    # one tagged token list per sentence, already cut at the keyword
    for tagged_sentence in token(tweet):

        backwards = tagged_sentence[::-1]  # iterate starting from the keyword
        print(backwards)

        phrase = ""
        for scanned, (text, tag) in enumerate(backwards):

            # maximum number of scanned words reached
            if scanned == max_scanned:
                break

            keep = tag not in discard_tags or text in negation_words
            if keep:
                phrase = text + " " + phrase
                if tag in pronoun_tags:
                    # a pronoun gives enough context; stop scanning
                    break

            print(phrase)

        simplified.append(phrase)

    return simplified
        
    

In [14]:
# Example: the backward scan from "depression" stops at the first pronoun ("my"),
# so the simplified sentence is 'my depression '.
match_depress("im activating that day. I cannot handle my depression and jealousy")


[('depression', 'NN'), ('my', 'PRP$'), ('handle', 'VB'), ('not', 'RB'), ('can', 'MD'), ('I', 'PRP')]
depression 


['my depression ']

In [11]:
'''
Here is a simple context-free grammar (CFG) used to parse the simplified sentences returned by match_depress().
If a simplified sentence is successfully parsed by this grammar (i.e. a parse tree is returned), it is considered related to depression.

Only words listed as terminals (leaves) of the grammar can be recognized; any other word triggers a ValueError. That is why the sentence must be simplified first.
The grammar here is a simple one, so complex sentences must be simplified heavily before they can be recognized (see notes 2 and 3 in match_depress()).
This simplification might lose useful information.

Possible optimizations:
1. enriching the words in the terminals (leaves)
2. implementing multiple CFGs for multiple sentence structures, to cover more general sentences.
   Once a sentence is accepted by one of them, it is related to depression

'''
# Grammar for the simplified, lower-cased sentences.
# Every terminal (actual word) must be quoted: nltk.CFG.fromstring reads an
# unquoted symbol as a nonterminal, so an unquoted word would never match input.
# NOTE(review): `Nom` is not reachable from S, so that rule is currently unused.
grammar2 = nltk.CFG.fromstring("""
  S  -> NP VP| NP
  NP -> PPRP N | N | PRP
  Nom -> Adj Nom | N
  VP -> V Adj | V NP | V PP | V VP | V TO VP
  PP -> P NP| P PP
  PRP -> 'i' 
  PPRP -> 'my' | 'it'
  N -> 'depression' | 'nightmare'
  Adj  -> 'depressed' | 'depressing'
  V ->  "'s"|"'m" | 'is' | 'was'  | 'have' | 'detected' | 'diagnosed' | 'feel' | 'feeling' | 'hate' | 'want' | 'go' | 'fail' | 'fell' | 'back' | 'do' | 'did' | 'been'| 'can' | 'handle'
  P -> 'on' | 'with' | 'as' | 'into' | 'like' | 'to' | 'since'
  TO -> 'to'
  """)

In [15]:
#sucessful example
sent = "I have depression "
sent = word_tokenize(sent.lower())
rd_parser = nltk.ChartParser(grammar2)
for tree in rd_parser.parse(sent):
      print(tree)

(S (NP (PRP i)) (VP (V have) (NP (N depression))))


In [16]:
#failed example (contain word "n't" which is not in CFG tree) triger ValueError
sent = "I don't have depression "
sent = word_tokenize(sent.lower())
rd_parser = nltk.ChartParser(grammar2)
for tree in rd_parser.parse(sent):
      print(tree)

ValueError: Grammar does not cover some of the input words: '"n\'t"'.

In [17]:
#failed example 2 (not a sentence structure ) return nothing
sent = "I depression "
sent = word_tokenize(sent.lower())
rd_parser = nltk.ChartParser(grammar2)
for tree in rd_parser.parse(sent):
      print(tree)