# Importing Necessary Libraries and Setup


In [None]:
import gzip
import gensim 
import logging
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import nltk
#nltk.download('stopwords')
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.manifold import TSNE
from IPython.display import clear_output
from keras.models import model_from_yaml


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import pickle


# Configure the root logger to print timestamped messages at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from tensorflow.python.client import device_lib
# Print the list of local devices reported by TensorFlow.
print(device_lib.list_local_devices())

In [None]:
import tensorflow as tf

# The reported GPU device name is tested for truthiness to pick the message.
print('GPU found' if tf.test.gpu_device_name() else "No GPU found")

# Preprocess Function


In [None]:
import nltk
# Fetch the NLTK stop-word corpus that clean_text reads via stopwords.words("english").
nltk.download('stopwords')

In [None]:
# Ordered (pattern, replacement) rewrite rules applied by clean_text. Order matters:
# every rule sees the output of the rules before it.
_CLEAN_TEXT_RULES = (
    # Blank out everything except letters, digits and a few symbols. "+-=" inside the
    # class is a range, so digits and , - . / : ; < are kept as well.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Expand common contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Drop separators and pad the remaining symbols so they become tokens of their own.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "10k" -> "10000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join a few abbreviations that the rules above split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is the NUL character in regex syntax, and NUL has already been
    # replaced by a space by the first rule, so this rule appears never to match.
    # "0s" may have been intended - confirm before changing it.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace.
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalise one raw post into a space-separated string of stemmed tokens.

    Steps, in order: lower-case and split on whitespace, drop NLTK English
    stop words, apply the rewrite rules in ``_CLEAN_TEXT_RULES`` (character
    filtering, contraction expansion, symbol padding, abbreviation repair,
    whitespace collapsing) and finally Snowball-stem every token.

    Args:
        text: The raw post as a ``str``.

    Returns:
        The cleaned text: stemmed tokens joined by single spaces.
    """
    # Punctuation is deliberately NOT stripped up front: the contraction rules need the
    # apostrophes. Passing string.punctuation straight to str.translate (which expects a
    # code-point lookup table) removes nothing and instead turns control characters such
    # as "\n" into punctuation, gluing neighbouring words together.

    # Lower-case, split on whitespace and drop stop words.
    stops = set(stopwords.words("english"))
    kept_words = [w for w in text.lower().split() if w not in stops]
    text = " ".join(kept_words)

    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    # Stemming
    stemmer = SnowballStemmer('english')
    return " ".join(stemmer.stem(word) for word in text.split())

# Reading Training Data (Collected from Reddit)

In [None]:
# Load the Reddit training posts. The CSV is read without a header row, so the two
# columns are named here.
train_reddit = pd.read_csv("subreddit_posts.csv", sep=',', names=['text', 'label'], header=None)

# Replace missing posts with empty strings. Only the 'text' column is filled and
# assigned back; assigning the whole filled DataFrame to a single column is an error.
train_reddit['text'] = train_reddit['text'].fillna('')

# Normalise every post (stop-word removal, rewrite rules, stemming).
train_reddit['text'] = train_reddit['text'].map(clean_text)
train_reddit

# Reading eRisk Training Data

In [None]:
# Load the eRisk training posts. The CSV is read without a header row, so the three
# columns are named here.
evaluate = pd.read_csv("FinalTrain.csv", sep=',', names=['id', 'text', 'label'], header=None)

# fillna returns a new object, so the result must be assigned back; otherwise missing
# posts stay NaN and clean_text fails on them.
evaluate['text'] = evaluate['text'].fillna('')
evaluate['text'] = evaluate['text'].map(clean_text)
evaluate

# Loading Test Data (eRisk)


In [None]:
# Load the eRisk test posts. The CSV is read without a header row, so the three
# columns are named here.
test = pd.read_csv("FinalTest.csv", sep=',', names=['id', 'text', 'label'], header=None)

# fillna returns a new object, so the result must be assigned back; otherwise missing
# posts stay NaN and clean_text fails on them.
test['text'] = test['text'].fillna('')
test['text'] = test['text'].map(clean_text)
test

# Word2Vec Processing

In [None]:
# Pool the cleaned posts of both training sources into one Series.
frames = [
    train_reddit['text'],
    evaluate['text'],
]
totalData = pd.concat(frames)

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Split every pooled post into word tokens: one token list per post.
lines = totalData.values.tolist()
i = 0
tokenized_line = [word_tokenize(post) for post in lines]

# Preview the first 100 tokenized posts.
print(tokenized_line[:100])

In [None]:
# Dimensionality of the word vectors (and of the Embedding layer built from them).
EMBEDDING_DIM=300 #can be 100,300 
# Build a Word2Vec model over the tokenized posts.
# NOTE(review): `size=` here and `wv.vocab` below look like the gensim 3.x API - confirm
# the gensim version this notebook is pinned to.
w2vmodel=gensim.models.Word2Vec(sentences=tokenized_line,size=EMBEDDING_DIM,window=2,workers=15,min_count=1)
# NOTE(review): the constructor already receives `sentences`, so this looks like an
# additional 10-epoch training pass over the same data - confirm it is intended.
w2vmodel.train(tokenized_line,total_examples=len(tokenized_line),epochs=10)  
# Report how many distinct tokens the model learned vectors for.
words=list(w2vmodel.wv.vocab)
print('Vocabulary Size: ',len(words))

In [None]:
# Fit a Keras tokenizer on every cleaned post (Reddit + eRisk training data).
tokenizer = Tokenizer()
frames = [train_reddit['text'], evaluate['text']]
totalData = pd.concat(frames)
tokenizer.fit_on_texts(totalData)

# Report the longest post, measured in whitespace-separated words ...
maxLength = max(len(s.split()) for s in totalData)
print('Max-Length: ', maxLength)
# ... but cap the sequence length at 300 to reduce training time.
maxLength = 300

# Vocabulary size: one more than the number of indexed words.
vocabularySize = len(tokenizer.word_index) + 1

# Convert both corpora to integer id sequences.
train_Token = tokenizer.texts_to_sequences(train_reddit['text'])
evaluate_Token = tokenizer.texts_to_sequences(evaluate['text'])

# Pad (at the end) every sequence to exactly maxLength ids.
train_padded, evaluate_padded = (
    pad_sequences(token_ids, maxlen=maxLength, padding="post")
    for token_ids in (train_Token, evaluate_Token)
)

print(train_padded.shape)
print('---')
print(evaluate_padded.shape)




In [None]:
# Use the word index of the tokenizer that produced train_padded / evaluate_padded.
# Fitting a second Tokenizer on the word_tokenize output assigns different ids (token
# lists are not passed through the tokenizer's character filtering), so the rows of the
# embedding matrix built from word_index would not line up with the ids that are
# actually fed to the Embedding layer.
word_index = tokenizer.word_index
print('Total Unique Token: ', len(word_index))

# Id sequences for the pooled corpus, padded the same way as train_padded.
seq = tokenizer.texts_to_sequences(totalData)
pad = pad_sequences(seq, maxlen=maxLength, padding="post")

# Labels in the same order as the pooled texts.
frames = [train_reddit['label'], evaluate['label']]
totalLabel = pd.concat(frames).values
print('Shape of Texts', pad.shape)
print('Shape of Label', totalLabel.shape)



In [None]:
# Row i holds the Word2Vec vector of the word whose tokenizer id is i. The extra row
# (the "+ 1") leaves index 0 unused by word_index - presumably the padding id.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    try:
        # Look vectors up through `.wv`, like the rest of this file; indexing the model
        # object directly is the deprecated spelling.
        embedding_matrix[i] = w2vmodel.wv[word]
    except KeyError:
        # Word unknown to Word2Vec: its row stays all-zero.
        continue
print('Shape of Embedding Matrix: ', embedding_matrix.shape)

In [None]:
# Short alias for the Word2Vec model used by the lexicon cells.
model = w2vmodel
# Sanity check: show the vector of the stemmed token 'depress' (looked up through
# `.wv`, consistent with the most_similar calls in this file).
model.wv['depress']

# Creating Lexicon for feature extraction

In [None]:
# "Happiness" lexicon: the 1000 nearest Word2Vec neighbours of the seed word.
# The corpus is Snowball-stemmed by clean_text, so the seed is looked up in its stemmed
# form; the literal word is only used if the stem is not in the vocabulary.
seed = SnowballStemmer('english').stem("pleasure")
w1 = [seed if seed in model.wv else "pleasure"]
happy = model.wv.most_similar(positive=w1, topn=1000)
happy = [word for word, score in happy]
print(happy)

In [None]:
# Depression lexicon: the 1500 nearest Word2Vec neighbours of the seed word.
# The corpus is Snowball-stemmed by clean_text, so the seed is looked up in its stemmed
# form; the literal word is only used if the stem is not in the vocabulary.
seed = SnowballStemmer('english').stem("depression")
w1 = [seed if seed in model.wv else "depression"]
depwords = model.wv.most_similar(positive=w1, topn=1500)
depwords = [word for word, score in depwords]
print(depwords)


In [None]:
# Therapist lexicon: the 1000 nearest Word2Vec neighbours of the seed word.
# The corpus is Snowball-stemmed by clean_text, so the seed is looked up in its stemmed
# form; the literal word is only used if the stem is not in the vocabulary.
seed = SnowballStemmer('english').stem("therapist")
w1 = [seed if seed in model.wv else "therapist"]
therapist_words = model.wv.most_similar(positive=w1, topn=1000)
therapist_words = [word for word, score in therapist_words]
print(therapist_words)

In [None]:
# Diagnosis/treatment lexicon: the 1000 nearest Word2Vec neighbours of the seed word.
# The corpus is Snowball-stemmed by clean_text, so the seed is looked up in its stemmed
# form; the literal word is only used if the stem is not in the vocabulary.
seed = SnowballStemmer('english').stem("treatment")
w1 = [seed if seed in model.wv else "treatment"]
diagnosis_words = model.wv.most_similar(positive=w1, topn=1000)
diagnosis_words = [word for word, score in diagnosis_words]
print(diagnosis_words)

In [None]:
# Antidepressant brand names. Posts are lower-cased and stemmed by clean_text before the
# features are counted, so the capitalised names could never equal a post token; they
# are normalised the same way here. A multi-word name ('Paxil CR') still cannot match a
# single token - 'Paxil' covers that case.
medicine = [
    clean_text(name)
    for name in ['Celexa', 'Lexapro', 'Prozac', 'Sarafem', 'Selfemra', 'ProzacWeekly', 'Luvox',
                 'Paxil', 'Paxil CR', 'Pexeva', 'Zoloft', 'Trintellix', 'Viibryd']
]
medicine

In [None]:
# Tokens counted as the first-person pronoun by textToVector and featureVector.
# NOTE(review): clean_text lower-cases posts and removes NLTK English stop words before
# these features are computed; if 'i' is on that stop-word list this count is always 0
# on cleaned text - confirm.
i_word=['i','I']

In [None]:
from sklearn.preprocessing import MinMaxScaler

def textToVector(text):
    """Build the 10-value meta-feature row for a single post.

    The row is, in order:
    ``[I, PRP, PRP$, verbs, medicine, depression, therapist, diagnosis,
    happiness, length]``.

    * ``I``, ``PRP`` (personal pronouns), ``PRP$`` (possessive pronouns),
      ``verbs`` (tags VB and VBD) and ``length`` (token count) are counts that
      are min-max scaled before being returned.
    * The five lexicon features are flags: 1 if at least one token of the post
      is in the lexicon, otherwise -1.

    NOTE(review): the five counts are passed to the scaler as five samples of a
    single feature, i.e. they are scaled against each other within this one
    post (smallest count -> 0, largest -> 1). Confirm this is intended rather
    than scaling each feature across posts.

    Args:
        text: The (cleaned) post as a string.

    Returns:
        A one-element list wrapping the 10-value feature row.
    """
    tokens = word_tokenize(text)

    # Every token is tagged on its own (a one-token "sentence"). Tagging them in a
    # single pos_tag_sents call gives the same tags as one pos_tag call per token
    # without re-entering the tagger for each token.
    tags = [tagged[0][1] for tagged in nltk.pos_tag_sents([[token] for token in tokens])]

    i_word_count = sum(token in i_word for token in tokens)
    prpc = tags.count('PRP')                       # PRP   personal pronoun     I, he, she
    ppc = tags.count('PRP$')                       # PRP$  possessive pronoun   my, his, hers
    vc = tags.count('VB') + tags.count('VBD')      # VB base form (take), VBD past tense (took)

    # A lexicon flag only depends on whether any token is in the lexicon, so test the
    # set of distinct tokens once instead of scanning the lexicon list for every token.
    distinct_tokens = set(tokens)

    def _flag(lexicon):
        return -1 if distinct_tokens.isdisjoint(lexicon) else 1

    counts = np.array([i_word_count, prpc, ppc, vc, len(tokens)]).reshape(-1, 1)
    scaled_i, scaled_prp, scaled_pp, scaled_verbs, scaled_length = (
        MinMaxScaler().fit_transform(counts)[:, 0]
    )

    return [[
        scaled_i, scaled_prp, scaled_pp, scaled_verbs,
        _flag(medicine), _flag(depwords), _flag(therapist_words),
        _flag(diagnosis_words), _flag(happy),
        scaled_length,
    ]]

In [None]:
from nltk.tokenize import word_tokenize
from IPython.display import clear_output
# nltk.download('punkt')
import nltk
def featureVector(df):
    """Compute raw per-post feature counts for every user in ``df``.

    Args:
        df: DataFrame with an 'id' column (one value per user) and a 'text'
            column (one post per row).

    Returns:
        A list with one entry per 'id' group, in ``groupby`` order. Each entry
        is a list of rows, one per post of that user:
        ``[I, PRP, PRP$, verbs, medicine, depression, therapist, diagnosis,
        happiness, length]`` - all plain counts, ``length`` being the number of
        tokens in the post.
    """
    # Sets give constant-time membership tests; build them once for the whole frame.
    first_person = set(i_word)
    lexicons = [
        set(words)
        for words in (medicine, depwords, therapist_words, diagnosis_words, happy)
    ]

    eachUser_vector = []
    for user_number, (_, user_posts) in enumerate(df.groupby('id'), start=1):
        # Progress indicator: overwrite the cell output with the running user count.
        clear_output(wait=True)
        print(user_number)

        each_sentenseVector = []
        for line in user_posts['text'].values.tolist():
            tokens = word_tokenize(line)
            i_word_count = sum(token in first_person for token in tokens)

            # Tagging is best-effort: if the tagger fails, the error is printed and the
            # tag-based counts stay 0 for this post.
            prpc = ppc = vc = 0
            try:
                # Every token is tagged on its own (a one-token "sentence"), in a
                # single call instead of one tagger call per token.
                tags = [
                    tagged[0][1]
                    for tagged in nltk.pos_tag_sents([[token] for token in tokens])
                ]
            except Exception as e:
                print(str(e))
            else:
                prpc = tags.count('PRP')                    # personal pronoun: I, he, she
                ppc = tags.count('PRP$')                    # possessive pronoun: my, his, hers
                vc = tags.count('VB') + tags.count('VBD')   # base-form and past-tense verbs

            lexicon_counts = [
                sum(token in lexicon for token in tokens) for lexicon in lexicons
            ]
            each_sentenseVector.append(
                [i_word_count, prpc, ppc, vc, *lexicon_counts, len(tokens)]
            )
        eachUser_vector.append(each_sentenseVector)

    return eachUser_vector
            
            
            
        
        
        

In [None]:
# Extract the per-user, per-post feature counts for the eRisk training data.
print(evaluate.head(1))
d = featureVector(evaluate)
print(d)

# Demo: tag a few words one at a time, exactly as the feature code does.
ff = ['i', 'believe', 'we', 'get', 'it', 'next', 'week']
for word in ff:
    tagged = nltk.pos_tag([word])
    print(tagged)
    


In [None]:
# Fetch the model used by nltk.pos_tag. The pasted ">>>" REPL prompts were removed:
# they are only tolerated by IPython and are a syntax error in plain Python.
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
# One DataFrame per user: rows are that user's posts, columns the raw feature counts
# produced by featureVector. (There is exactly one frame per user; no inner loop.)
df = [
    pd.DataFrame(
        user,
        columns=['I', 'PersonalPronoun', 'PossessivePronoun', 'Verb', 'Medicine',
                 'DepressionWord', 'Therapist', 'Diagonesis', 'Happiness', 'Length'],
    )
    for user in d
]
                   
            

In [None]:
# print(df[0])
# #

In [None]:
# Collapse each user's per-post rows into a single row: counts of pronouns, verbs and
# tokens are averaged over the posts, lexicon hits are summed over the posts.
l = []
for data in df:
    I = data.I.mean()
    PRP = data.PersonalPronoun.mean()
    PP = data.PossessivePronoun.mean()
    V = data.Verb.mean()
    L = data.Length.mean()

    DEP = data.DepressionWord.sum()
    THERAP = data.Therapist.sum()
    DIAG = data.Diagonesis.sum()
    MED = data.Medicine.sum()
    HAP = data.Happiness.sum()

    # Values must follow the column order below; the 'Happiness' slot takes HAP.
    l.append([I, PRP, PP, V, MED, DEP, THERAP, DIAG, HAP, L])

t = pd.DataFrame(l, columns=['I', 'PersonalPronoun', 'PossessivePronoun', 'Verb', 'Medicine',
                             'DepressionWord', 'Therapist', 'Diagonesis', 'Happiness', 'Length'])

print(t)

# NOTE(review): this binds a second name to the same DataFrame, not a copy, so later
# in-place changes to `t` are visible through `temp` too - confirm that is intended.
temp = t

# Normalize

In [None]:
# Turn the summed lexicon hits into presence flags: -1 when the sum is 0, otherwise 1.
for flag_column in ['Medicine', 'DepressionWord', 'Therapist', 'Diagonesis', 'Happiness']:
    t[flag_column] = t[flag_column].map(lambda x: -1 if x==0 else 1)

from sklearn.preprocessing import MinMaxScaler

# Min-max scale every averaged count column across all users.
count_columns = ['I','PersonalPronoun','PossessivePronoun','Verb','Length']
scaler = MinMaxScaler()
t[count_columns] = scaler.fit_transform(t[count_columns])

t

In [None]:
def getLabel(df):
    """Return one class label per user.

    Args:
        df: DataFrame with at least an 'id' column (one value per user) and a
            'label' column.

    Returns:
        A list holding, for every 'id' group in ``groupby`` order, the label of
        the group's first row.
    """
    # groupby already yields each group's rows, so no per-key get_group lookup is
    # needed, and only the 'label' column is read.
    return [user_rows['label'].values[0] for _, user_rows in df.groupby('id')]
        
        
        

In [None]:
# label=getLabel(evaluate)
# len(label)
# print(label)

In [None]:
# One meta-feature row per Reddit training post; textToVector wraps its row in a
# one-element list, hence the [0].
metafeatures = [textToVector(row)[0] for row in train_reddit['text']]
    

In [None]:
# Display the meta-feature rows built from train_reddit.
metafeatures

In [None]:
# Convert the list of meta-feature rows to a NumPy array and show its shape.
mf=np.array(metafeatures)
mf.shape

In [None]:
from keras.layers import Input
from keras.layers import Bidirectional
from keras.layers import concatenate
from keras.models import Model
from keras.utils.vis_utils import plot_model

# Two-input network: a text branch over padded token ids and a branch over the
# 10 hand-crafted meta features (textToVector returns rows of 10 values).
# NOTE: `model` is rebound to the Keras model at the end of this cell, replacing the
# Word2Vec alias assigned earlier in the file.
nlp_input = Input(shape=(maxLength,), name='nlp_input')
meta_input = Input(shape=(10,), name='meta_input')
# Embedding initialised from embedding_matrix and kept frozen (trainable=False);
# mask_zero=True is set so id 0 is treated as a mask value.
emb = Embedding(input_dim=len(word_index)+1,output_dim=EMBEDDING_DIM,weights=[embedding_matrix],input_length=maxLength,mask_zero=True,trainable=False)(nlp_input)
# Text branch: bidirectional LSTM followed by a ReLU dense layer.
nlp_out = Bidirectional(LSTM(EMBEDDING_DIM, dropout=0.3, recurrent_dropout=0.3))(emb)
nlp_out = Dense(EMBEDDING_DIM, activation='relu')(nlp_out)
# Meta-feature branch: a single ReLU dense layer.
meta_out =Dense(10, activation='relu')(meta_input)
# Merge both branches and map them to one sigmoid output unit.
x = concatenate([nlp_out, meta_out])
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[nlp_input , meta_input], outputs=[x])
model.summary()
# Write a diagram of the architecture to model_plot.png.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Compile with binary cross-entropy and Adam, tracking accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train on the Reddit posts only: padded token ids plus the per-post meta features,
# for 5 epochs in batches of 32.
model.fit([train_padded, mf], train_reddit['label'].values, batch_size=32, epochs=5)

In [None]:
# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def plot_confusion_matrix(y_true, y_pred, classes,title=None,cmap=plt.cm.Purples):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heat map.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Tick labels used on both axes.
        title: Optional plot title.
        cmap: Matplotlib colour map for the cells.

    Returns:
        The matplotlib Axes the matrix was drawn on (the figure is also shown).
    """
    matrix = confusion_matrix(y_true, y_pred)
    n_rows, n_cols = matrix.shape

    figure, axes = plt.subplots()
    image = axes.imshow(matrix, interpolation='nearest', cmap=cmap)
    axes.figure.colorbar(image, ax=axes)

    # One tick per class on each axis, labelled with the supplied class names.
    axes.set(
        xticks=np.arange(n_cols),
        yticks=np.arange(n_rows),
        xticklabels=classes,
        yticklabels=classes,
        title=title,
        ylabel='True label',
        xlabel='Predicted label',
    )

    # Slant the x tick labels.
    plt.setp(axes.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write each count into its cell; the text is white when the count is above half
    # of the largest count, black otherwise.
    half_of_max = matrix.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        count = matrix[row, col]
        text_colour = "white" if count > half_of_max else "black"
        axes.text(col, row, format(count, 'd'), ha="center", va="center", color=text_colour)

    figure.tight_layout()
    plt.show()
    return axes








In [None]:
# Run the per-user prediction for the two larger settings (15 and 23) and store each
# result table together with its confusion matrix.
# NOTE(review): `i` is not assigned in this cell; it holds whatever value an earlier
# cell left in it - confirm what it is meant to identify.
for setting in (15, 23):
    d = userPrediction(setting, test)
    actual = d['Actual']
    predicted = d['Prediction']
    rd.append(d.values)
    cm.append([i, confusion_matrix(actual, predicted)])

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import statistics as st

pd.set_option('display.max_rows', 1000)


def _latency_penalty(k, p=0.0078):
    """Return the latency penalty -1 + 2 / (1 + e^(-p (k - 1))) for a decision after k posts."""
    return -1 + (2 / (1 + np.exp(-p * (k - 1))))


def _erde(pred, actual, k, o, c_fp=0.1296):
    """Return the ERDE_o cost of one decision taken after k posts.

    False positive -> c_fp, false negative -> 1, true negative -> 0, otherwise
    (true positive) the delay cost 1 - 1 / (1 + e^(k - o)).
    """
    if pred == 1 and actual == 0:
        return c_fp
    if pred == 0 and actual == 1:
        return 1
    if pred == 0 and actual == 0:
        return 0
    return 1 - (1 / (1 + np.exp(k - o)))


# NOTE(review): the labels below assume `rd` holds the results for window sizes
# 4, 5, ..., 12 followed by 15 and 23 - confirm against the cells that fill `rd`.
rw = 4
for data in rd:
    if rw == 13:
        print('Risk Window Size: 15')
    elif rw == 14:
        print('Risk Window Size: 23')
    else:
        print('Risk Window Size: ', rw)
    rw = rw + 1

    v = pd.DataFrame(data, columns=['Subject', 'Prediction', 'Actual', 'Required Post to Decide'])
    rp = v['Required Post to Decide'].values
    vv = v.values

    # Per-subject latency penalty (positive predictions only) and ERDE costs.
    pLatency = [
        [p, ac, reqP,
         _latency_penalty(reqP) if p == 1 else 0,
         _erde(p, ac, reqP, 5),
         _erde(p, ac, reqP, 50)]
        for subject, p, ac, reqP in vv
    ]
    pL = pd.DataFrame(pLatency, columns=['Prediction', 'Actual', 'RequiredPost', 'pLatency', 'ERDE5', 'ERDE50'])

    actual = np.array(v['Actual'].values, dtype='int')
    predicted = np.array(v['Prediction'].values, dtype='int')
    plot_confusion_matrix(actual, predicted, classes=['0', '1'], title='Confusion Matrix')

    print ('Classification Report : ')
    print (classification_report(actual, predicted) )
    f1 = f1_score(actual, predicted, average='weighted')
    print ('Accuracy Score :', accuracy_score(actual, predicted) )

    print('F1 Score: ', f1)

    # Latency statistics are taken over TRUE positives only (predicted 1 and actually
    # 1), as the TPositive* names say; false positives must not influence them.
    TPositivereqP = [reqP for subject, p, ac, reqP in vv if p == 1 and ac == 1]
    TPositiveUser = [_latency_penalty(reqP) for reqP in TPositivereqP]

    if TPositivereqP:
        latency = st.median(TPositivereqP)
        pLatMedian = st.median(TPositiveUser)
    else:
        # No true positive in this run: the medians are undefined, so report NaN
        # instead of letting statistics.median raise on an empty list.
        latency = pLatMedian = float('nan')

    # The reported latency is the median number of posts needed.
    print('Average Latency: ', latency)

    print('F1 Latency: ', f1 * (1 - pLatMedian))
    print('ERDE5: ', sum(pL['ERDE5'].values) / len(rp))
    print('ERDE50: ', sum(pL['ERDE50'].values) / len(rp))

    print('\n\n<----------------------->\n\n')


    
    
    
    
        
    