In [1]:
import pandas as pd
import numpy as np

import os

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords 

from transformers import BertForSequenceClassification, BertTokenizer, BertForMaskedLM

from simpletransformers.language_modeling import LanguageModelingModel

from sklearn.metrics.pairwise import cosine_similarity, paired_euclidean_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler

from tqdm import tqdm
import torch

import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go
from functools import partial

import pickle

from collections import deque

stop_words = set(stopwords.words('english')) 


%load_ext autoreload

%autoreload 2

from utils import *
from plotting import *

In [2]:
from utils import *

In [3]:
dataFolder = '/data1/roshansk/TwitterADR/'


df = pd.read_csv(os.path.join(dataFolder,'trainTweetData.csv'))
df1 = pd.read_csv(os.path.join(dataFolder,'train_tweet_annotations.tsv'),sep='\t', header = None)
df2 = pd.read_csv(os.path.join(dataFolder,'train_tweet_ids.tsv'),sep='\t', header = None)
df2.columns = ['id','user_id','text_id']

df1.columns = ['text_id','start','end','type','ADR','drug','drug1']


df1 = df1.merge(df2, on='text_id')
df = df1.merge(df[['id','text']], on='id')

trainDf = df.copy()

df = pd.read_csv(os.path.join(dataFolder,'testTweetData.csv'))
df1 = pd.read_csv(os.path.join(dataFolder,'test_tweet_annotations.tsv'),sep='\t', header = None)
df2 = pd.read_csv(os.path.join(dataFolder,'test_tweet_ids.tsv'),sep='\t', header = None)
df2.columns = ['id','user_id','text_id']

df1.columns = ['text_id','start','end','type','ADR','drug','drug1']


df1 = df1.merge(df2, on='text_id')
df = df1.merge(df[['id','text']], on='id')

testDf = df.copy()

fullDf = pd.concat([trainDf,testDf],axis =0)


fullDf['message'] = fullDf.text

fullDf.drop_duplicates(subset='message',inplace =True)

print(len(fullDf))
print(fullDf.ADR.value_counts())

489
withdrawal          13
tired                9
withdrawals          7
sleep                7
sick                 6
                    ..
sucidal thoughts     1
mood swings          1
tummytrouble         1
burnt                1
aches                1
Name: ADR, Length: 367, dtype: int64


In [4]:
model = BertForSequenceClassification.from_pretrained('/data1/roshansk/Exp1/checkpoint-141753-epoch-1', output_hidden_states= True)

tokenizer = BertTokenizer.from_pretrained('/data1/roshansk/Exp1/checkpoint-141753-epoch-1')

In [36]:
class ADRModel(object):
    
    def __init__(self, model, tokenizer, graph, queue=None, useMasterEmb = False, masterContrib = 0.5):
        
        self.model = model
        self.tokenizer = tokenizer
        self.graph = graph
        if queue is None:
            self.q = deque()
        else:
            self.q = queue
            
            
        self.masterEmb = None
        
        self.useMasterEmb = useMasterEmb
        self.masterContrib = masterContrib
        
        
        
        
    def getSymptomEmbedding(self, df, symptom,  embeddingType = 'last4sum', subset = None):
    
        embeddingList = []
        messageList = []

        if subset is not None:
            df = df.iloc[subset]

        if type(df) == pd.Series:
            df = pd.DataFrame(df).T

        symptomToken = self.tokenizer.encode(symptom)[1]

        for i in range(len(df)):

            if symptomToken in self.tokenizer.encode(df.iloc[i]['message'].lower()):

                tokens = self.tokenizer.encode(df.iloc[i]['message'].lower())
                decoded = self.tokenizer.decode(tokens).split(" ")
                logits, hidden_states = self.model(torch.Tensor(tokens).unsqueeze(0).long())

                hidden_states = torch.stack(hidden_states).squeeze(1).permute(1,0,2)


                try:
                    tokenIndex = tokens.index(symptomToken)
                except:
                    a= 1
                    continue


                if embeddingType == 'last4sum':
                    embedding = torch.sum(hidden_states[tokenIndex,9:13,:],0)
                elif embeddingType =='last4concat':
                    embedding = hidden_states[tokenIndex,9:13,:].reshape(-1)
                elif embeddingType == 'secondlast':
                    embedding = hidden_states[tokenIndex,-2,:]
                else:
                    embedding = hidden_states[tokenIndex,-1,:]


                embeddingList.append(embedding.detach().cpu().numpy())
                messageList.append(df.iloc[i]['message'].lower())

                if len(embeddingList)==30:
                    break



        return embeddingList, messageList
    
    
    
    def getSimilarWords(self,df, symptom, embList, similarityThreshold = 0.3, numThreshold = 10000):
    
     
        output = []


        symptomToken = self.tokenizer.encode(symptom)[1]

        for i in range(len(df)):

            if symptomToken in self.tokenizer.encode(df.iloc[i]['message'].lower()):

                tokens = self.tokenizer.encode(df.iloc[i]['message'].lower())
                decoded = self.tokenizer.decode(tokens).split(" ")
                logits, hidden_states = self.model(torch.Tensor(tokens).unsqueeze(0).long())

                hidden_states = torch.stack(hidden_states).squeeze(1).permute(1,0,2)


                hidden_states = hidden_states[:,9:13,:]
                hidden_states = torch.sum(hidden_states,1).detach().cpu().numpy()

                similarity = cosine_similarity(hidden_states, embList.reshape(1,-1)).reshape(-1)


                index = np.where([similarity> similarityThreshold])[1]

                selectTokens = np.array(tokens)[index]
                selectSim = similarity[index]



                for j in range(len(index)):
                    token = self.tokenizer.ids_to_tokens[selectTokens[j]]
                    sim = selectSim[j]
                    output.append((token, sim,i))


            if i==numThreshold:
                break

        return output
    
    
    
    def getOutput(self, out):
    
        output = out

        outMap = {}

        for i in range(len(output)):
            if output[i][0] in outMap:
                outMap[output[i][0]].append(output[i][1])
            else:
                outMap[output[i][0]] = [output[i][1]]


        outMap_ = {}

        for i in range(len(output)):
            if output[i][0] in outMap_:
                outMap_[output[i][0]].append(output[i][2])
            else:
                outMap_[output[i][0]] = [output[i][2]]


        outputDf = []

        for key in outMap.keys():
            length = len(outMap[key])
            mean = np.mean(outMap[key])

            outputDf.append([key, length, mean])

        outputDf = pd.DataFrame(outputDf)
        outputDf.columns = ['word','counts','mean_sim']
        outputDf = outputDf.sort_values('mean_sim', ascending=False)

        return outputDf, outMap, outMap_
    
    
    
    
    def exploreNode(self, word, depth, fullDf, maxDepth = 3, topk = 5):

    
        self.graph.addNode(word,0,depth)

        print(f"Depth : {depth} Exploring {word}")

        if depth == maxDepth:
            print("Reached max depth")
            return

        keyWord = word

        token = self.tokenizer.encode(keyWord)[1]

        if self.graph[word].vector is None:

            inEdgeList = self.graph[word].edges_in

            if len(inEdgeList)==0:
                textIDList = None
            else:
                textIDList = []

                for edge in inEdgeList:
                    textIDList.append(self.graph.edgeList[edge].textID)

                textIDList = list(set(list(itertools.chain.from_iterable(textIDList))))

            
            embList,msgList = self.getSymptomEmbedding(fullDf, keyWord, embeddingType='last4sum', subset = textIDList)

            meanEmb = np.array(embList)
            meanEmb = np.mean(meanEmb,0)


            self.graph[word].vector = meanEmb
            
            if self.masterEmb is None:
                self.masterEmb = meanEmb
            
            dist = getCosineDist(meanEmb, self.masterEmb)
            
            self.graph[word].masterDist = dist

        else:
            meanEmb = self.graph[word].vector


        symptom_ =''
        embList_ = meanEmb

        if self.useMasterEmb:
            
            finalEmb = self.masterContrib*self.masterEmb + (1 - self.masterContrib)*meanEmb
            
            out = self.getSimilarWords(fullDf.iloc[0:100], symptom_, finalEmb , similarityThreshold = 0.3, numThreshold = 100000)
        else:
            out = self.getSimilarWords(fullDf.iloc[0:100], symptom_, meanEmb, similarityThreshold = 0.3, numThreshold = 100000)

        outputDf, outMap, outMap_ = self.getOutput(out)

        outputDf = outputDf[outputDf.word!=keyWord]
    #     outputDf = outputDf[~outputDf.word.isin(list(graph.wordMap.keys()))]
        outputDf = outputDf.sort_values('mean_sim', ascending=False)
        outputDf = outputDf.head(topk)

        outputDf = outputDf[outputDf.mean_sim>0.4]

        print(outputDf)
        print("-----------------------")

        for i in range(len(outputDf)):

            word = outputDf.iloc[i]['word']
            numCount = outputDf.iloc[i]['counts']
            weight = outputDf.iloc[i]['mean_sim']
            textIDs = outMap_[word]

            wordList = set(self.graph.wordMap.keys())

            self.graph.addNode(word,0,depth+1)
            self.graph[word].textIDList.append(textIDs)
            self.graph.addEdge(keyWord, word, numCount, weight, textIDs)

            if word in wordList:
                continue

            if "#" in word:
                continue


            self.q.append((word, depth+1))
            
            
    def trainModel(self, maxDepth = 3, topk = 5):
        
        currDepth = 0
        
        while len(self.q)>0:
            token, depth = self.q.popleft()
            
            if depth> currDepth:
                self.getMeanEmbedding(depth-1)
                currDepth += 1
            
            self.exploreNode(word = token, depth = depth, fullDf = fullDf, maxDepth=maxDepth, topk=topk)


            
    def getMeanEmbedding(self, depth, topk = 3):
        
        candidates = self.graph.depthMap[depth]
        
        vals = [self.graph[x].masterDist for x in candidates]
        
        vals = [(x,y) for x,y in zip(candidates,vals)]
        
        vals = sorted(vals, key = lambda x : -x[1])
        
        meanEmb = self.masterEmb
        
        for i in range(min(topk, len(vals)) ):
            meanEmb += self.graph[ vals[i][0] ].vector
            
        meanEmb = meanEmb/(topk+1)
        
        self.masterEmb = meanEmb
        
        print("Master Embedding updated")
        
        
    
    def plotGraph(self):
        
        edgeList, nodeList, nodeValues, nodeCount, nodeText, nodeSize = getGraphComponents(self.graph)

        G=nx.Graph()

        G.add_nodes_from(nodeList)
        G.add_edges_from(edgeList)

        edge_trace, node_trace1, node_trace = getPlotlyComponents(G, nodeList, nodeSize, nodeValues, nodeText)


        fig = go.Figure(data=[edge_trace, node_trace1, node_trace],
             layout=go.Layout(
                title='<br>Network graph made with Python',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=50),

                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
        
        fig.show()
        
        


In [34]:
graph = Graph()

q = deque()
q.append(('tired',0))

ADR = ADRModel(model, tokenizer, graph, q, useMasterEmb=True, masterContrib=0.3)

In [13]:
ADR.trainModel(maxDepth=3,topk=4) #Master Contrib = 0.3

Depth : 0 Exploring tired
          word  counts  mean_sim
119  depressed       2  0.655812
70     anxious       2  0.650671
7       sleepy       2  0.628737
29        sick       5  0.608247
-----------------------
Master Embedding updated
Depth : 1 Exploring depressed
       word  counts  mean_sim
55  anxious       2  0.787428
5     tired       4  0.696910
75    angry       1  0.624555
52    weird       1  0.597824
-----------------------
Depth : 1 Exploring anxious
          word  counts  mean_sim
107  depressed       2  0.778725
5        tired       4  0.711929
84       angry       1  0.629397
69       happy       1  0.627978
-----------------------
Depth : 1 Exploring sleepy
          word  counts  mean_sim
6        tired       4  0.709991
187     asleep       1  0.697489
51     anxious       2  0.653966
91   depressed       2  0.588023
-----------------------
Depth : 1 Exploring sick
       word  counts  mean_sim
4     tired       4  0.670607
77  anxious       2  0.622801
28    di

In [35]:
ADR.trainModel(maxDepth=4,topk=4) #Master Contrib = 0.3

Depth : 0 Exploring tired
          word  counts  mean_sim
119  depressed       2  0.655812
70     anxious       2  0.650671
7       sleepy       2  0.628737
29        sick       5  0.608247
-----------------------
Master Embedding updated
Depth : 1 Exploring depressed
       word  counts  mean_sim
55  anxious       2  0.787428
5     tired       4  0.696910
75    angry       1  0.624555
52    weird       1  0.597824
-----------------------
Depth : 1 Exploring anxious
          word  counts  mean_sim
107  depressed       2  0.778725
5        tired       4  0.711929
84       angry       1  0.629397
69       happy       1  0.627978
-----------------------
Depth : 1 Exploring sleepy
          word  counts  mean_sim
6        tired       4  0.709991
187     asleep       1  0.697489
51     anxious       2  0.653966
91   depressed       2  0.588023
-----------------------
Depth : 1 Exploring sick
       word  counts  mean_sim
4     tired       4  0.670607
77  anxious       2  0.622801
28    di

In [37]:
edgeList, nodeList, nodeValues, nodeCount, nodeText, nodeSize = getGraphComponents(graph)

G=nx.Graph()

G.add_nodes_from(nodeList)
G.add_edges_from(edgeList)

edge_trace, node_trace1, node_trace = getPlotlyComponents(G, nodeList, nodeSize, nodeValues, nodeText)


fig = go.Figure(data=[edge_trace, node_trace1, node_trace],
     layout=go.Layout(
        title='<br>Network graph made with Python',
        titlefont_size=16,
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20,l=5,r=5,t=50),

        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
        )

fig.show()

In [40]:
graph.describeNode('insane')

Exploring insane
awesome    -> insane     | 1 |  0.565 | [20]
--------------------


In [41]:
fullDf.iloc[20]['message']

"Cymbalta, you're driving me insane."

In [30]:
emb,_ = getSymptomEmbedding(model,tokenizer, fullDf.iloc[39], 'awesome',0)

In [33]:
getCosineDist(ADR.masterEmb, emb[0])

array([[0.6298677]], dtype=float32)

In [32]:
emb

[array([ 2.96444631e+00,  3.27073956e+00, -9.96160686e-01, -1.19822514e+00,
         3.59868497e-01, -3.48791075e+00, -2.61698818e+00, -2.96509117e-02,
        -4.04700375e+00, -1.52275771e-01, -8.97271097e-01, -1.26959610e+00,
        -7.63639033e-01, -1.06069577e+00, -3.62464041e-02, -2.40681097e-01,
         8.91781807e-01, -2.79965729e-01, -3.12939167e+00, -4.79267502e+00,
        -1.75556517e+00,  2.88245976e-01, -4.18346214e+00,  1.24458587e+00,
         1.12164366e+00,  4.49977905e-01,  9.37017202e-01, -4.03610611e+00,
        -1.70025206e+00, -1.44806409e+00, -8.84110808e-01,  7.05302715e-01,
        -1.26265311e+00,  2.15598726e+00, -1.00698316e+00, -2.26036167e+00,
        -1.40388042e-01, -9.82859135e-02,  1.88572335e+00,  6.94026947e-01,
         1.33543718e+00, -2.99743032e+00, -1.56085563e+00,  3.24682212e+00,
         1.09423548e-01,  4.07448262e-01,  9.24590647e-01,  1.03492069e+00,
         1.58998752e+00, -2.42028236e+00,  2.23505163e+00, -8.98798943e-01,
        -3.4

In [9]:
ADR.trainModel(maxDepth=2,topk=4) #Master Contrib = 0.5

Depth : 0 Exploring tired
          word  counts  mean_sim
119  depressed       2  0.655812
70     anxious       2  0.650671
7       sleepy       2  0.628737
29        sick       5  0.608247
-----------------------
Master Embedding updated
Depth : 1 Exploring depressed
       word  counts  mean_sim
63  anxious       2  0.789780
5     tired       4  0.753729
85    angry       1  0.612941
59    weird       1  0.607613
-----------------------
Depth : 1 Exploring anxious
          word  counts  mean_sim
124  depressed       2  0.785289
5        tired       4  0.767029
6       sleepy       2  0.643648
78       happy       1  0.627828
-----------------------
Depth : 1 Exploring sleepy
          word  counts  mean_sim
6        tired       4  0.764397
220     asleep       1  0.695253
62     anxious       2  0.676594
107  depressed       2  0.621635
-----------------------
Depth : 1 Exploring sick
          word  counts  mean_sim
5        tired       4  0.735753
94     anxious       2  0.653078

In [None]:
ADR.trainModel(maxDepth=2,topk=4)

In [35]:
ADR.trainModel(maxDepth=2,topk=4)

Depth : 0 Exploring tired
          word  counts  mean_sim
119  depressed       2  0.655812
70     anxious       2  0.650671
7       sleepy       2  0.628737
29        sick       5  0.608247
-----------------------
Master Embedding updated
Depth : 1 Exploring depressed
       word  counts  mean_sim
6     tired       4  0.865564
70  anxious       2  0.650671
7    sleepy       2  0.628737
29     sick       5  0.608247
-----------------------
Depth : 1 Exploring anxious
          word  counts  mean_sim
6        tired       4  0.865564
119  depressed       2  0.655812
7       sleepy       2  0.628737
29        sick       5  0.608247
-----------------------
Depth : 1 Exploring sleepy
          word  counts  mean_sim
6        tired       4  0.865564
119  depressed       2  0.655812
70     anxious       2  0.650671
29        sick       5  0.608247
-----------------------
Depth : 1 Exploring sick
          word  counts  mean_sim
6        tired       4  0.865564
119  depressed       2  0.655812

In [29]:
ADR.trainModel(maxDepth=2,topk=4)

Depth : 0 Exploring tired
          word  counts  mean_sim
119  depressed       2  0.655812
70     anxious       2  0.650671
7       sleepy       2  0.628737
29        sick       5  0.608247
-----------------------
Depth : 1 Exploring depressed
       word  counts  mean_sim
40  anxious       2  0.773500
4     tired       4  0.629324
56    angry       1  0.627117
45      sad       1  0.580321
-----------------------
Depth : 1 Exploring anxious
         word  counts  mean_sim
84  depressed       2  0.760904
4       tired       4  0.646776
69      angry       1  0.630664
52      happy       1  0.619025
-----------------------
Depth : 1 Exploring sleepy
        word  counts  mean_sim
149   asleep       1  0.689712
5      tired       4  0.644995
40   anxious       2  0.621777
113     ##wn       1  0.576222
-----------------------
Depth : 1 Exploring sick
       word  counts  mean_sim
4     tired       4  0.592778
63  anxious       2  0.581277
25    dizzy       1  0.568730
60    weird       

0