# This notebook combines all the steps that have already been completed so that the work is not scattered across many notebooks. I can refer to this notebook from now on.

### What's been done already:
- import and combine all the abstract csvs
- merge abstracts with other grant info
- fill in nan values for ABSTRACT_TEXT column (if possible)

### Steps will be to:
#### Take relevant abstracts and split by year:
- Take only R01s (for starters)
- Split by year (FY column)
- Take only abstract text

#### Clean the text: 
- Remove bad characters
- Remove extra spaces, extra phrases
- Remove bad abstracts (will tokenize words, avg length of tokens)
- Remove stopwords, punctuation, numbers
- Stem tokens

*Can save files off as pickles here*

#### Prepare and execute a HDP model
- Count frequency of each word in corpus (for each year or total?)
- Retain words above a certain frequency (previously 3, more like 50-100?)
- Create dictionary
- Create corpus
- Train HDP model (T = 200 previously, T = 500 now? Test different values for T and find which one makes sense, maybe research different values to tweak)
- Iterate over each year and save the model(!)
- Calculate total number of topics and compare to total number of R01s

### After that will be thinking about and doing future directions

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import os
import re
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
import pickle
from IPython.display import clear_output
from gensim import corpora, models
from gensim.models import Phrases, Word2Vec
from gensim.models.phrases import Phraser

In [4]:
# Read in and combine dfs of full dataset
# Every CSV part under the directory is read first and the frames are concatenated once.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop re-copied the
# accumulated frame on every iteration.
csv_parts = []
for root, dirs, files in os.walk("data/completed_full_csv_parts/", topdown=True):
    for file in files:
        csv_parts.append(pd.read_csv(os.path.join(root, file)))

# pd.concat raises on an empty list; fall back to an empty frame so a missing or empty
# data directory yields the same result the append-based loop used to give.
funded_projects = pd.concat(csv_parts) if csv_parts else pd.DataFrame()

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#funded_projects.shape

In [5]:
# keep just R01s
# na=False makes rows with a missing grant_num evaluate to False instead of NaN;
# a mask containing NaN cannot be used for boolean indexing.
is_r01 = funded_projects.grant_num.str.contains("R01", na=False)
funded_R01_projects = funded_projects[is_r01]

In [5]:
#funded_R01_projects.shape

In [6]:
# Distinct fiscal years among the R01 rows, as an ascending list of Python ints
fiscal_years = funded_R01_projects["FY"].astype(int)
years = sorted(fiscal_years.unique().tolist())

In [7]:
# split by year, make list of dfs (one frame per entry of `years`, in the same order)
# `years` is derived from FY cast to int, so the filter uses the same cast. Comparing the
# raw column would silently match nothing for rows whose FY is stored as a string.
fy_as_int = funded_R01_projects["FY"].astype(int)
dfs = [funded_R01_projects[fy_as_int == year] for year in years]

In [16]:
# Substrings that clean_abstract_text replaces with a single space before tokenizing:
# the non-breaking space (\xa0), a hyphen followed by a soft hyphen (\xad), and "/".
strange_symbols = ["\xa0", "-\xad", "/"]

In [17]:
# Boilerplate phrases that clean_abstract_text deletes (replaces with "") from every
# abstract. Matching is exact and case-sensitive (str.replace) and the phrases are applied
# in list order, after the strange_symbols pass.
# NOTE(review): because "/" has already been turned into a space by the strange_symbols
# pass, the entries containing "/" presumably never match -- confirm.
# NOTE(review): short entries ("unreadable", "DESCRIPTION", "Abstract", ...) are removed
# before the longer entries that contain them, so those longer entries can no longer match
# as written -- confirm the order is intended.
things_to_replace = ["unreadable",
                     "[unreadable]",
                     "DESCRIPTION",
                     "Application not provided", 
                     "Description Provided by Applicant",
                     "(Adapted from applicant's abstract)",
                     "(provided by applicant)",
                     "(Verbatim from applicants abstract)",
                     "Purpose",
                     "(Adapted from the Applicant's )",
                     " N A",
                     "(provided by investigator)",
                     "(From the Applicant's )",
                     "DESCRIPTION (provided by applicant)",
                     "ABSTRACT",
                     "Abstract",
                     "Project Summary/Abstract",
                     "PROJECT SUMMARY",
                     "Background:",
                     "Project Summary / Abstract",
                     "Project Summary",
                     "PROJECT SUMMARY/ABSTRACT",
                     "Summary/Abstract",
                     "||",
                     "Summary",
                     "PROJECT SUMMARY / ABSTRACT"]

In [18]:
# Tokens treated as punctuation: every single ASCII punctuation character plus the
# two-character quote tokens "``" and "''".
quote_tokens = ["``", "''"]
punct = list(string.punctuation) + quote_tokens

In [19]:
# NLTK's English stop-word list; clean_abstract_text drops any lower-cased token found in it.
stop_words = stopwords.words("english")

In [20]:
# Shared Porter stemmer instance; clean_abstract_text calls ps.stem on every kept token.
ps = PorterStemmer()

In [21]:
def average_token_length(doc):
    """Return the mean number of characters per token in ``doc``.

    ``doc`` is an iterable of tokens (anything supporting ``len``). An empty
    document has no tokens to average, so 0 is returned for it.
    """
    token_lengths = [len(token) for token in doc]

    # Guard the empty case explicitly rather than asking numpy for the mean of nothing.
    if not token_lengths:
        return 0

    return np.mean(token_lengths)

In [22]:
def find_bad_abstracts(abs_list, max_mean_token_length=7):
    """Tokenize abstracts and keep only those whose tokens are short enough on average.

    Despite its name, this function returns the abstracts that are KEPT, not the bad ones.

    Parameters
    ----------
    abs_list : iterable of str
        Raw abstract texts.
    max_mean_token_length : int or float, optional
        An abstract is kept only when its mean token length (see
        ``average_token_length``) is strictly below this value. Defaults to 7,
        the cutoff that was previously hard-coded.

    Returns
    -------
    list of list of str
        One ``word_tokenize`` token list per kept abstract, in input order.
        An abstract that yields no tokens has a mean length of 0 and is therefore kept.
    """
    good_abstracts = []
    for abst in abs_list:
        tokens = word_tokenize(abst)
        if average_token_length(tokens) < max_mean_token_length:
            good_abstracts.append(tokens)

    return good_abstracts

In [23]:
def clean_abstract_text(df):
    """Clean and tokenize the ABSTRACT_TEXT column of ``df``.

    Steps, in order:
      1. replace every entry of ``strange_symbols`` with a space;
      2. delete every phrase in ``things_to_replace``;
      3. collapse runs of spaces into one;
      4. tokenize and filter the abstracts with ``find_bad_abstracts``;
      5. per abstract: drop punctuation tokens, lower-case, drop stop words,
         drop all-digit tokens, and stem what remains with ``ps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ABSTRACT_TEXT`` column.

    Returns
    -------
    list of list of str
        One list of stemmed tokens per kept abstract.

    Notes
    -----
    Only purely numeric tokens are removed (``str.isdigit``); a token such as
    ``'1.2.1.3'`` is kept.
    """
    # Skip entries that are not strings: a missing abstract is NaN (a float), which has
    # no .replace and would otherwise abort the cleaning of the whole frame.
    abs_list = [abstract for abstract in df.ABSTRACT_TEXT.tolist()
                if isinstance(abstract, str)]

    for symb in strange_symbols:
        abs_list = [abstract.replace(symb, ' ') for abstract in abs_list]

    for phrase in things_to_replace:
        abs_list = [abstract.replace(phrase, '') for abstract in abs_list]

    abs_list = [re.sub(" +", " ", abstract) for abstract in abs_list]

    good_abstracts = find_bad_abstracts(abs_list)

    # Sets give constant-time membership tests; the module-level lists are scanned
    # linearly for every token otherwise.
    punct_set = set(punct)
    stop_set = set(stop_words)

    cleaned_abstracts = []
    for tokens in good_abstracts:
        # Punctuation is matched on the raw token, stop words on the lower-cased one.
        words = [word.lower() for word in tokens if word not in punct_set]
        words = [word for word in words
                 if word not in stop_set and not word.isdigit()]
        cleaned_abstracts.append([ps.stem(word) for word in words])

    return cleaned_abstracts

In [60]:
def train_hdp_model(abstract_list, min_frequency=50, T=500, random_state=None):
    """Build a bag-of-words corpus from tokenized documents and fit a gensim HDP model.

    Parameters
    ----------
    abstract_list : list of list of str
        One token list per document. It is iterated twice, so it must not be a
        one-shot generator.
    min_frequency : int, optional
        Tokens are kept only if they occur strictly more than this many times
        across all documents. Defaults to 50, the previously hard-coded cutoff.
    T : int, optional
        Passed to ``HdpModel`` as ``T``. Defaults to 500, the previously
        hard-coded value.
    random_state : optional
        Forwarded to ``HdpModel`` only when it is not None, so that a run can be
        seeded; leaving it as None calls ``HdpModel`` exactly as before.

    Returns
    -------
    gensim.models.HdpModel
        The fitted model.
    """
    # Corpus-wide token counts.
    frequency = defaultdict(int)
    for tokens in abstract_list:
        for token in tokens:
            frequency[token] += 1

    # Drop rare tokens before building the vocabulary.
    abstracts = [[token for token in tokens if frequency[token] > min_frequency]
                 for tokens in abstract_list]

    dictionary = corpora.Dictionary(abstracts)
    corpus = [dictionary.doc2bow(tokens) for tokens in abstracts]

    hdp_kwargs = {"id2word": dictionary, "T": T}
    if random_state is not None:
        hdp_kwargs["random_state"] = random_state

    return models.HdpModel(corpus, **hdp_kwargs)

In [1]:
# Uncomment this to iterate over years. For now just test one year

# topics_by_year = []
# l = len(abstracts_lists)
# for ind, fy in enumerate(abstracts_lists):
#     print("working on number {} out of {}".format(ind+1, l))
#     topic_number = hda_topic_number(fy)
#     topics_by_year.append(topic_number)

In [19]:
# Clean and tokenize the abstracts of the first year's frame (dfs[0]) as a trial run.
test = clean_abstract_text(dfs[0])

In [20]:
# Inspect the cleaned token lists.
# NOTE(review): this displays the entire nested list; a slice would keep the output short.
test

[['distribut',
  'aldehyd',
  'dehydrogenas',
  'ec',
  '1.2.1.3',
  'differ',
  'subcellular',
  'local',
  'suggest',
  'specif',
  'function',
  'isozym',
  'purif',
  'new',
  'isozym',
  'human',
  'liver',
  'character',
  'e1',
  'e2',
  'isozym',
  'present',
  'avail',
  'homogen',
  'form',
  'object',
  'propos',
  'work',
  'homogen',
  'isozym',
  'e1',
  'e2',
  'involv',
  'identif',
  'activ',
  'regulatori',
  'site',
  'chemic',
  'modif',
  'group',
  'specif',
  'reagent',
  'well',
  'affin',
  'reagent',
  'develop',
  'basi',
  'known',
  'substrat',
  'specif',
  'aldehyd',
  'dehydrogenas',
  'standard',
  'techniqu',
  'protein',
  'purif',
  'includ',
  'ion',
  'exchang',
  'affin',
  'chromatographi',
  'use',
  'character',
  'new',
  'isozym',
  'includ',
  'amino',
  'acid',
  'analysi',
  'molecular',
  'weight',
  'isozym',
  'subunit',
  'peptid',
  'map',
  'interact',
  'antibodi',
  'substrat',
  'coenzym',
  'specif',
  'kinet',
  'properti',
  'a

In [31]:
# Number of rows in each per-year frame, then the size of the smallest year
df_lens = [len(year_df) for year_df in dfs]
np.min(df_lens)

2802


[17502,
 2802,
 3521,
 6129,
 10221,
 17325,
 17834,
 18660,
 18683,
 18809,
 16685,
 16744,
 20300,
 11958,
 7042,
 3626,
 4520,
 9650,
 17332,
 32366,
 30922,
 31450,
 30620,
 29805,
 38078,
 34579,
 11224,
 27480,
 26168,
 25410,
 25154,
 25625,
 27407,
 9808]

In [42]:
# Fit an HDP model on the cleaned tokens of the first year only.
hdp_test = train_hdp_model(test)

In [43]:
# Show the 5 highest-weighted words of each topic.
# NOTE(review): num_topics = -1 presumably means "all topics" -- confirm against the gensim docs.
hdp_test.print_topics(num_topics = -1, num_words = 5)

[(0, '0.016*studi + 0.010*use + 0.008*effect + 0.007*determin + 0.007*cell'),
 (1, '0.016*cell + 0.016*studi + 0.011*use + 0.010*protein + 0.008*determin'),
 (2, '0.023*cell + 0.015*studi + 0.013*gene + 0.011*use + 0.009*determin'),
 (3, '0.015*studi + 0.009*use + 0.008*research + 0.008*data + 0.007*children'),
 (4, '0.015*studi + 0.008*use + 0.008*neuron + 0.007*system + 0.007*determin'),
 (5, '0.008*studi + 0.005*use + 0.005*cell + 0.005*effect + 0.005*alcohol'),
 (6, '0.008*studi + 0.007*cell + 0.006*age + 0.006*protein + 0.005*use'),
 (7, '0.008*age + 0.006*studi + 0.005*effect + 0.004*determin + 0.003*chang'),
 (8, '0.014*cell + 0.008*studi + 0.007*receptor + 0.006*factor + 0.005*use'),
 (9, '0.006*studi + 0.005*immun + 0.005*respons + 0.004*cell + 0.004*may'),
 (10,
  '0.007*studi + 0.007*cell + 0.004*function + 0.003*alcohol + 0.003*membran'),
 (11,
  '0.005*alcohol + 0.005*effect + 0.004*studi + 0.004*age + 0.003*behavior'),
 (12,
  '0.004*age + 0.004*damag + 0.003*treatment + 

# Looks like everything is working as expected. Right now every topic has a weight greater than zero for its first word. Do some more cleaning to try to narrow the topics.

## How to improve process:
### Preprocessing:
- Remove redundant entries, this may skew scores or topics
- Create new stopwords list, look at frequency and tfidf weighting
- Identify phrases using whole corpus. This will cut down on tokens, possibly making more distinct topics

### Model:
- Compare LSI and LDA to HDP. Main comparison is LSI to HDP

In [8]:
# Identify redundant entries: compare the row count of the first year's abstracts
# with the number of distinct abstract texts.
first_year_abstracts = dfs[0].ABSTRACT_TEXT

total_abstracts = len(first_year_abstracts)
total_unique_abstracts = len(first_year_abstracts.unique())
duplicate_count = total_abstracts - total_unique_abstracts

summary_template = "Total abstracts: {}\nUnique abstracts: {}\nDifference: {}"
print(summary_template.format(total_abstracts, total_unique_abstracts, duplicate_count))

Total abstracts: 17502
Unique abstracts: 17086
Difference: 416


In [14]:
# Keep the first row for each distinct abstract text in the first year's frame

first_year_df = dfs[0]
unique_abstracts_df = first_year_df.drop_duplicates(subset=["ABSTRACT_TEXT"], keep="first")

Clean the text again to prepare for phrase detection.
<br>
The first idea was a new function that cleans the text differently: sentence tokens first, then word tokens.
<br>
Actually, use the old one. Each sublist will be the tokenized words of one abstract.
<br>
Though this is not strictly the structure the phraser expects (sentences), it will still work for the phraser.
<br>
This is important because otherwise the previous cleaning functions would mess up the sentence tokenizer.

In [27]:
# Clean and tokenize the de-duplicated abstracts; each sublist holds one abstract's stemmed tokens.
word_tokens_abstracts = clean_abstract_text(unique_abstracts_df)

In [28]:
# Inspect the token lists.
# NOTE(review): this displays the entire nested list; a slice would keep the output short.
word_tokens_abstracts

[['distribut',
  'aldehyd',
  'dehydrogenas',
  'ec',
  '1.2.1.3',
  'differ',
  'subcellular',
  'local',
  'suggest',
  'specif',
  'function',
  'isozym',
  'purif',
  'new',
  'isozym',
  'human',
  'liver',
  'character',
  'e1',
  'e2',
  'isozym',
  'present',
  'avail',
  'homogen',
  'form',
  'object',
  'propos',
  'work',
  'homogen',
  'isozym',
  'e1',
  'e2',
  'involv',
  'identif',
  'activ',
  'regulatori',
  'site',
  'chemic',
  'modif',
  'group',
  'specif',
  'reagent',
  'well',
  'affin',
  'reagent',
  'develop',
  'basi',
  'known',
  'substrat',
  'specif',
  'aldehyd',
  'dehydrogenas',
  'standard',
  'techniqu',
  'protein',
  'purif',
  'includ',
  'ion',
  'exchang',
  'affin',
  'chromatographi',
  'use',
  'character',
  'new',
  'isozym',
  'includ',
  'amino',
  'acid',
  'analysi',
  'molecular',
  'weight',
  'isozym',
  'subunit',
  'peptid',
  'map',
  'interact',
  'antibodi',
  'substrat',
  'coenzym',
  'specif',
  'kinet',
  'properti',
  'a

In [36]:
# Fit gensim's Phrases on the token lists with its default settings
# (the min_count / threshold overrides are left commented out).
phrases = Phrases(word_tokens_abstracts)#, min_count = 5, threshold = 2)

In [55]:
# Build a Phraser from the fitted Phrases model; indexing it with a token list returns
# the tokens with detected bigrams joined by "_" (e.g. 'aldehyd_dehydrogenas').
bigram = Phraser(phrases)

In [43]:
# Apply the phraser to the first abstract to see which token pairs get merged.
print(bigram[word_tokens_abstracts[0]])

['distribut', 'aldehyd_dehydrogenas', 'ec', '1.2.1.3', 'differ', 'subcellular_local', 'suggest', 'specif', 'function', 'isozym', 'purif', 'new', 'isozym', 'human', 'liver', 'character', 'e1_e2', 'isozym', 'present', 'avail', 'homogen', 'form', 'object_propos', 'work', 'homogen', 'isozym', 'e1_e2', 'involv', 'identif', 'activ', 'regulatori', 'site', 'chemic_modif', 'group', 'specif', 'reagent', 'well', 'affin_reagent', 'develop', 'basi', 'known', 'substrat', 'specif', 'aldehyd_dehydrogenas', 'standard', 'techniqu', 'protein', 'purif', 'includ', 'ion_exchang', 'affin_chromatographi', 'use', 'character', 'new', 'isozym', 'includ', 'amino_acid', 'analysi', 'molecular_weight', 'isozym', 'subunit', 'peptid_map', 'interact', 'antibodi', 'substrat', 'coenzym', 'specif', 'kinet_properti', 'acetaldehyd', 'substrat', 'concurr', 'aldehyd_dehydrogenas', 'content', 'isozym', 'distribut', 'crude_extract', 'differ', 'human', 'tissu', 'post-mortem', 'alcohol', 'non-alcohol', 'investig', 'isozym', 'post

In [48]:
# The same first abstract before phrase merging, for comparison with the phraser output.
word_tokens_abstracts[0]

['distribut',
 'aldehyd',
 'dehydrogenas',
 'ec',
 '1.2.1.3',
 'differ',
 'subcellular',
 'local',
 'suggest',
 'specif',
 'function',
 'isozym',
 'purif',
 'new',
 'isozym',
 'human',
 'liver',
 'character',
 'e1',
 'e2',
 'isozym',
 'present',
 'avail',
 'homogen',
 'form',
 'object',
 'propos',
 'work',
 'homogen',
 'isozym',
 'e1',
 'e2',
 'involv',
 'identif',
 'activ',
 'regulatori',
 'site',
 'chemic',
 'modif',
 'group',
 'specif',
 'reagent',
 'well',
 'affin',
 'reagent',
 'develop',
 'basi',
 'known',
 'substrat',
 'specif',
 'aldehyd',
 'dehydrogenas',
 'standard',
 'techniqu',
 'protein',
 'purif',
 'includ',
 'ion',
 'exchang',
 'affin',
 'chromatographi',
 'use',
 'character',
 'new',
 'isozym',
 'includ',
 'amino',
 'acid',
 'analysi',
 'molecular',
 'weight',
 'isozym',
 'subunit',
 'peptid',
 'map',
 'interact',
 'antibodi',
 'substrat',
 'coenzym',
 'specif',
 'kinet',
 'properti',
 'acetaldehyd',
 'substrat',
 'concurr',
 'aldehyd',
 'dehydrogenas',
 'content',
 'is

In [44]:
# First ten tokenized abstracts as a small sample.
# NOTE(review): not referenced by any later cell visible here -- confirm it is still needed.
word_tokens_test = word_tokens_abstracts[0:10]

In [59]:
# Run every tokenized abstract through the phraser, keeping one merged token list per abstract
bigrams_list = [bigram[abstract_tokens] for abstract_tokens in word_tokens_abstracts]

In [58]:
# Inspect the phrase-merged token lists.
# NOTE(review): this displays the entire nested list; a slice would keep the output short.
bigrams_list

[['distribut',
  'aldehyd_dehydrogenas',
  'ec',
  '1.2.1.3',
  'differ',
  'subcellular_local',
  'suggest',
  'specif',
  'function',
  'isozym',
  'purif',
  'new',
  'isozym',
  'human',
  'liver',
  'character',
  'e1_e2',
  'isozym',
  'present',
  'avail',
  'homogen',
  'form',
  'object_propos',
  'work',
  'homogen',
  'isozym',
  'e1_e2',
  'involv',
  'identif',
  'activ',
  'regulatori',
  'site',
  'chemic_modif',
  'group',
  'specif',
  'reagent',
  'well',
  'affin_reagent',
  'develop',
  'basi',
  'known',
  'substrat',
  'specif',
  'aldehyd_dehydrogenas',
  'standard',
  'techniqu',
  'protein',
  'purif',
  'includ',
  'ion_exchang',
  'affin_chromatographi',
  'use',
  'character',
  'new',
  'isozym',
  'includ',
  'amino_acid',
  'analysi',
  'molecular_weight',
  'isozym',
  'subunit',
  'peptid_map',
  'interact',
  'antibodi',
  'substrat',
  'coenzym',
  'specif',
  'kinet_properti',
  'acetaldehyd',
  'substrat',
  'concurr',
  'aldehyd_dehydrogenas',
  'c

In [62]:
# Fit an HDP model on the phrase-merged token lists, for comparison with hdp_test.
bigram_hdp_test = train_hdp_model(bigrams_list)

In [64]:
# Show the 5 highest-weighted words of each topic of the bigram model.
# NOTE(review): num_topics = -1 presumably means "all topics" -- confirm against the gensim docs.
bigram_hdp_test.print_topics(num_topics = -1, num_words = 5)

[(0,
  '0.019*studi + 0.012*use + 0.009*effect + 0.008*develop + 0.007*determin'),
 (1, '0.019*studi + 0.018*cell + 0.013*use + 0.008*determin + 0.007*develop'),
 (2, '0.019*studi + 0.016*cell + 0.012*use + 0.009*effect + 0.009*determin'),
 (3, '0.018*studi + 0.015*gene + 0.014*cell + 0.013*use + 0.012*protein'),
 (4, '0.019*studi + 0.016*cell + 0.012*use + 0.010*protein + 0.009*determin'),
 (5, '0.019*studi + 0.012*use + 0.009*determin + 0.009*activ + 0.008*protein'),
 (6, '0.017*studi + 0.013*use + 0.012*protein + 0.009*structur + 0.009*cell'),
 (7, '0.028*cell + 0.015*studi + 0.013*use + 0.009*gene + 0.008*determin'),
 (8, '0.014*studi + 0.013*cell + 0.011*use + 0.009*protein + 0.008*specif'),
 (9, '0.009*memori + 0.006*experi + 0.006*age + 0.006*process + 0.005*task'),
 (10,
  '0.007*studi + 0.004*use + 0.004*activ + 0.003*determin + 0.003*structur'),
 (11, '0.007*studi + 0.007*age + 0.005*immun + 0.004*cell + 0.004*antibodi'),
 (12,
  '0.005*effect + 0.005*studi + 0.005*ethanol + 