In [4]:
# imports
import re #regular expressions
import spacy #natural language processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from nltk import tokenize # fast sentence tokenization
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
from nltk.stem import WordNetLemmatizer

import nltk
import string

nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

words = set(nltk.corpus.words.words())
nlp = spacy.load('en_core_web_sm') # load up nlp
stop = set(nlp.Defaults.stop_words) | set(string.punctuation) # stop words and punctuation set

[nltk_data] Downloading package words to
[nltk_data]     /Users/sairajuladevi/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sairajuladevi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sairajuladevi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
def clean_and_lemmatize(df, column):
    text = df[[column]].to_numpy()
    
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    lemmatizer = WordNetLemmatizer()
    
    cleaned = []
    lemmatized = []
    
    # # part of speech tagging
    # adjs = []
    # nouns = []
    # verbs = []

    # adj_counter = Counter()
    # noun_counter = Counter()
    # verb_counter = Counter()
    
    for sentence in text:
        # print(sentence[0], type(sentence[0]))
        # final word lists will put tokens into strings
        cleaned_lst = []
        lemmatized_lst = []
                
        for word in tokenizer.tokenize(sentence[0].lower()):    

            # booleans for filtering
            filtering = ((word not in stop) # word not in stopwords
                        # and (re.search("^[a-z]+$", word)) # select only valid words
                        and (len(word) > 2)
                        and not ("https" in word)) # words must be greater than 2 in length
            
                             
            # add to cleaned and lemmatized lists       
            if filtering:
                cleaned_lst.append(word)
                lemmatized_lst.append(lemmatizer.lemmatize(word))
                
                # word_pos = nlp(word)[0].pos_
            
                # if word_pos == 'ADJ':
                #     adjs.append(word)
                # elif word_pos == 'NOUN':
                #     nouns.append(word)
                # elif word_pos == 'VERB':
                #     verbs.append(word)
                    
            # create final parsed strings
        cleaned.append(' '.join(cleaned_lst))
        lemmatized.append(' '.join(lemmatized_lst))
        
    # # update counters for part of speech
    # adj_counter.update(adjs)
    # noun_counter.update(nouns)
    # verb_counter.update(verbs)
    
    
    return {'cleaned': cleaned, 'lemmatized': lemmatized}
            # 'adj_counter': adj_counter, 'noun_counter': noun_counter, 'verb_counter': verb_counter}


In [60]:
# you need to pip install openpyxl
df = pd.read_excel("Diabetes Continuous Glucose Monitoring – Data Export.xlsx")

df = df[['Post ID', 'Sound Bite Text', 'Title', 'Source Type', 'Sentiment', 'Positive Objects', 'Negative Objects']]
df = df.drop_duplicates(subset=['Sound Bite Text'])

df['Title'] = df[['Title']].fillna('')

df['entire_text'] = df['Sound Bite Text'] + ' ' + df['Title']

df.head()

Unnamed: 0,Post ID,Sound Bite Text,Title,Source Type,Sentiment,Positive Objects,Negative Objects,entire_text
0,BRDRDT2-t1_imq98sr,My numbers are great now. Estimated a1c of 7%i...,Have you been denied a second/third pump? Feel...,Forums,Neutrals,number,,My numbers are great now. Estimated a1c of 7%i...
1,BRDRDT2-t1_impbcf4,I tried it for a little while. No side effects...,Metformin,Forums,Positives,,,I tried it for a little while. No side effects...
2,1565738759353602048,i ran out of characters. youtu.be/RWgl2PDhQiM ...,,Twitter,Positives,"dexcom g6, omnipod system",,i ran out of characters. youtu.be/RWgl2PDhQiM ...
3,17944607459251789,MY lunch! Ate at 10:30am \n1 unit NovoLog insu...,,Instagram,Neutrals,,,MY lunch! Ate at 10:30am \n1 unit NovoLog insu...
4,BRDRDT2-t1_imq8h9m,This is also because like a soak in a hot tub ...,"No bath salts, bath oils, soaks?",Forums,Neutrals,,,This is also because like a soak in a hot tub ...


In [None]:
for i in df['Sound Bite Text'].sample(n=30):
    print(i + '\n')

In [61]:
entire_text_dic = clean_and_lemmatize(df,'entire_text')
# title_dic = clean_and_lemmatize(df,'Title')

In [62]:
df['Sentiment'].unique()
df['Sentiment'] = pd.Categorical(df['Sentiment'])

df['target'] = df['Sentiment'].cat.codes

target_mapping = df.groupby(['Sentiment','target']).size().reset_index()
target_mapping = target_mapping[target_mapping[0] != 0]


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer= TfidfVectorizer(min_df=50, stop_words="english", max_df=0.8)

df['lemmatized_entire_text'] = entire_text_dic['lemmatized']

tf_idf_entire_text = tfidf_vectorizer.fit_transform(entire_text_dic['lemmatized']).toarray()
# df['tf_idf_title'] = tfidf_vectorizer.fit_transform(entire_text_dic['lemmatized']).toarray()

In [41]:
df.to_csv("sai_diabetes.csv")

In [35]:
def getCategory(post):
    if 'dexcom' in post or 'g6' in post:
        if 'libre' in post:
            return 'both'
        else:
            return 'dexcom'
    elif 'libre' in post:
        return 'libre'
    else:
        return 'none'

In [36]:
dexcom = []
libre = []
both = []
neither = []
for ind, row in df.iterrows():
    subject = getCategory(row['cleaned'].lower())
    if subject == 'dexcom':
        dexcom.append(ind)
    elif subject == 'libre':
        libre.append(ind)
    elif subject == 'both':
        both.append(ind)
    else:
        neither.append(ind)


KeyError: 'cleaned'