In [1]:
import pandas as pd
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import KeyedVectors, Word2Vec, phrases
from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import strip_tags, strip_punctuation,strip_numeric,remove_stopwords
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from os import walk
from os import listdir
from os.path import isfile, join

from pprint import pprint

import pickle

from matplotlib import pyplot as plt

import numpy as np
import seaborn as sns
import matplotlib.colors as mcolors
import re

In [2]:
#  Clean data
def text_cleaning(data):
    new_sentences = []
    filters = [lambda x: x.lower(), strip_tags, strip_punctuation,strip_numeric,remove_stopwords]
    excluded = ['breast', 'cancer', 'survivorship', 'born', 'alive', 'live', 'die', 'died']
    for i in range(data.shape[0]):
        txt = data.iloc[i]
        txt = txt.lower()
        txt = re.sub("(#.*?)[\s]"," ",txt) # remove all # from tweets
        txt = re.sub("breastcancer"," ",txt)
        
        c_words = []
        words = preprocessing.preprocess_string(txt, filters)
        for w in words:
            if len(w)>3 and w not in excluded:
                c_words.append(w)

        new_sentences.append(c_words)
    
    return new_sentences

In [3]:
# original data
data = pd.read_csv("breast_cancer.csv")['Text']
sentences_bcancer = text_cleaning(data)
original_bcancer = data.to_numpy()
original_bcancer[0]

'my mom could have worked while dying from stage 4 breast cancer & paid for tx out of pocket?'

In [4]:
# create best Bigrams of tweets(calculated based on how often words come together in all tweets)
bigram = gensim.models.Phrases(sentences_bcancer) 
bigram_mod = gensim.models.phrases.Phraser(bigram)
sentences_bcancer = [bigram_mod[doc] for doc in sentences_bcancer]

In [5]:
# original lexicon
data_lexicon = pd.read_csv("brest_cancer_lexicon.csv")['Drug'].to_numpy()
data_lexicox_clean = ["_".join(data.lower().split()) for data in data_lexicon]
data_lexicox_clean[:10]

['chemotherapy',
 'chemo',
 'radiotherapy',
 'evista',
 'raloxifene',
 'hydrochloride',
 'raloxifene_hydrochloride',
 'tamoxifen',
 'citrate',
 'tamoxifen_citrate']

In [6]:
# Load from drug-related w2vec
path_tune = "/Users/thiago/Github/Data/BioW2Vec/DSM-language-model-1B-LARGE/trig-vectors-phrase.bin"
word_vectors = KeyedVectors.load_word2vec_format(path_tune, binary=True, encoding='utf8', unicode_errors='ignore')

In [7]:
# expand original lexion, if not done yet
expand = False
expanded = []
if expand:
    for word in data_lexicox_clean:
        if word in word_vectors:
            expanded.append(word)
            similar = [x[0] for x in word_vectors.most_similar(word,topn=5)]
            expanded.extend(similar)
    
    # save to file
    out = "brest_cancer_lexicon_expanded.csv"
    dict = {'Drug': expanded}   
       
    df = pd.DataFrame(dict) 
    # saving the dataframe  
    df.to_csv(out)  
    print(expanded[0:10])
    

['chemotherapy', 'chemo', 'bleomycin', 'chemo_radiation', 'cisplatin', 'lymphoma', 'chemo', 'chemotherapy', 'chemo_treatments', 'chemo_radiation']


In [8]:
# load expanded lexicon
data_lexicon_expanded = pd.read_csv("brest_cancer_lexicon_expanded.csv")['Drug'].to_numpy()

In [9]:
# based on a giving tweet word from user, database of lexicon and ratio
# return a list of all words from user which is misspelled and are breast cancer expression
import Levenshtein
def get_mispelling( w_tweet, database, ratio):
    out = []
    for treatment in database:
        lev_ratio = Levenshtein.ratio(treatment,w_tweet)
        if lev_ratio > ratio:
            out.append(treatment)
    return out

In [13]:
# for each tweet, let's find if tweet has a treatment
#original_bcancer
new_database = []
treatment = []
for index,tweet in enumerate(sentences_bcancer):
    for word in tweet:
        if word in data_lexicon_expanded:
            new_database.append(original_bcancer[index])
            treatment.append(word)
        else:
            mispelled = get_mispelling(word,data_lexicon_expanded, 0.75)
            if len(mispelled) >0:
                for term in mispelled: # add tweet multiple times if have more than 1 treatment
                    new_database.append(original_bcancer[index])
                    treatment.append(term)


In [15]:
# save new dataset
out = "brest_cancer_by_treatments.csv"
dict = {'Tweet': new_database, "Treatment":treatment }   

df = pd.DataFrame(dict) 
# saving the dataframe  
df.to_csv(out)  
df

Unnamed: 0,Tweet,Treatment
0,"Before we start our Advent Calendar, we have a...",stearate
1,"Jo has had a tough 24 hours, poor nights sleep...",chemo
2,Current state. Stucco remediation thanks to @t...,stearate
3,"b""$mrk \n\nwhispers we hear is keytruda slated...",keytruda
4,"b""$mrk \n\nwhispers we hear is keytruda slated...",sulfate
...,...,...
1493,"b""#iamasurvivor and i will push for progress b...",herceptin
1494,"b""@senamyklobuchar @sentinasmith why haven't e...",halaven
1495,b'@joebiden no it's gone &amp; thank god. it s...,stearate
1496,b'@dailymailceleb @dailymailuk lovely see @kyl...,regimins
