In [17]:
import pandas as pd
import sklearn
import numpy as np
import nltk
import re
from Bio import Medline

In [18]:
! pip install biopython



In [19]:
# Function that uses the Medline module from
# the Biopython library to parse and read MEDLINE
# formatted files. Results are stored in a Pandas 
# DataFrame
def read_medline_data(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["title", "authors", "abstract"])
    count = 0
    for rec in recs:
        try:
            abstr = rec["AB"]
            title = rec["TI"]
            auths = rec["AU"]
            text = text.append(pd.DataFrame([[title, auths, abstr]],
                                     columns=['title', 'authors', 'abstract']),
                              ignore_index=True)            
        except:
            pass
    return text

In [20]:
# Read in MEDLINE formatted text
papers = read_medline_data("1.txt")

In [21]:
papers

Unnamed: 0,title,authors,abstract
0,Depression and Mania in Bipolar Disorder.,"[Tondo L, VÃ¡zquez GH, Baldessarini RJ]","BACKGROUND: Episode duration, recurrence rates..."
1,Cognitive Impairment in Bipolar Disorder: Trea...,"[SolÃ© B, JimÃ©nez E, Torrent C, Reinares M, B...","Over the last decade, there has been a growing..."
2,Bipolar disorder: clinical overview.,"[MÃ¼ller JK, Leweke FM]",Bipolar disorder is a severe psychiatric disor...
3,Diagnosis and treatment of patients with bipol...,"[McCormick U, Murray B, McNew B]",PURPOSE: This review article provides an overv...
4,Bipolar Disorder: Its Etiology and How to Mode...,"[Freund N, Juckel G]",Characterized by the switch of manic and depre...
5,Borderline personality disorder and bipolar di...,"[Paris J, Black DW]",Borderline personality disorder (BPD) and bipo...
6,Bipolar disorder.,"[Smith DJ, Whitham EA, Ghaemi SN]",Bipolar disorder is a serious disorder of mood...
7,Older Age Bipolar Disorder.,"[Dols A, Beekman A]",Further understanding of older age bipolar dis...
8,The relationship between borderline personalit...,"[Zimmerman M, Morgan TA]",It is clinically important to recognize both b...
9,"Update on the Epidemiology, Diagnosis, and Tre...","[Chen P, Dols A, Rej S, Sajatovic M]",PURPOSE OF REVIEW: The population over age 60 ...


In [22]:
print ("Title: ", papers['title'][0])
print ('\n')
print ("Abstract: ", papers['abstract'][0])

Title:  Depression and Mania in Bipolar Disorder.


Abstract:  BACKGROUND: Episode duration, recurrence rates, and time spent in manic and depressive phases of bipolar disorder (BD) is not well defined for subtypes of the disorder. METHODS: We reviewed the course, timing, and duration of episodes of mania and depression among 1130 clinically treated DSM-IV-TR BD patients of various types, and compared duration and rates as well as total proportion of time in depressive versus manic episodes during 16.7 average years at risk. RESULTS: As expected, episodes of depressions were much longer than manias, but episode-duration did not differ among BD diagnostic types: I, II, with mainly mixed-episodes (BD-Mx), or with psychotic features (BD-P). Recurrence rates (episodes/year) and proportion of time in depression and their ratios to mania were highest in BD-II and BD-Mx subjects, with more manias/year in psychotic and BD-I subjects. In most BD-subtypes, except with psychotic features, there w

In [23]:
# Function that cleans text by removing '\x0c' and '\n' characters
# as well as all non-alpha characters and finally converts everything
# to lower case
def clean_text(text):
    stop_words = ['\x0c', '\n']
    for i in stop_words:
        text.replace(i, ' ')
    clean_text = re.sub('[^a-zA-Z]+', ' ', text)
    return clean_text.lower()

# Create a column for cleaned Abstract and cleaned Title
papers['clean_abstract'] = papers['abstract'].apply(clean_text)
papers['clean_title'] = papers['title'].apply(clean_text)

papers.head()

Unnamed: 0,title,authors,abstract,clean_abstract,clean_title
0,Depression and Mania in Bipolar Disorder.,"[Tondo L, VÃ¡zquez GH, Baldessarini RJ]","BACKGROUND: Episode duration, recurrence rates...",background episode duration recurrence rates a...,depression and mania in bipolar disorder
1,Cognitive Impairment in Bipolar Disorder: Trea...,"[SolÃ© B, JimÃ©nez E, Torrent C, Reinares M, B...","Over the last decade, there has been a growing...",over the last decade there has been a growing ...,cognitive impairment in bipolar disorder treat...
2,Bipolar disorder: clinical overview.,"[MÃ¼ller JK, Leweke FM]",Bipolar disorder is a severe psychiatric disor...,bipolar disorder is a severe psychiatric disor...,bipolar disorder clinical overview
3,Diagnosis and treatment of patients with bipol...,"[McCormick U, Murray B, McNew B]",PURPOSE: This review article provides an overv...,purpose this review article provides an overvi...,diagnosis and treatment of patients with bipol...
4,Bipolar Disorder: Its Etiology and How to Mode...,"[Freund N, Juckel G]",Characterized by the switch of manic and depre...,characterized by the switch of manic and depre...,bipolar disorder its etiology and how to model...


In [24]:
print ("Title: ", papers['clean_title'][0])
print ('\n')
print ("Abstract: ", papers['clean_abstract'][0])

Title:  depression and mania in bipolar disorder 


Abstract:  background episode duration recurrence rates and time spent in manic and depressive phases of bipolar disorder bd is not well defined for subtypes of the disorder methods we reviewed the course timing and duration of episodes of mania and depression among clinically treated dsm iv tr bd patients of various types and compared duration and rates as well as total proportion of time in depressive versus manic episodes during average years at risk results as expected episodes of depressions were much longer than manias but episode duration did not differ among bd diagnostic types i ii with mainly mixed episodes bd mx or with psychotic features bd p recurrence rates episodes year and proportion of time in depression and their ratios to mania were highest in bd ii and bd mx subjects with more manias year in psychotic and bd i subjects in most bd subtypes except with psychotic features there was more time in depressive than manic m

In [25]:
'''Build tf-idf matrix based on Abstract and Title
Use NLTK word_tokenize() and SnowballStemmer() to tokenize and stem document Title and Abstract'''

# Function that takes text, tokenizes it and returns list of stemmed tokens
def tokenize_and_stem(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.stem.snowball.SnowballStemmer("english")
    return [i for i in [stemmer.stem(t) for t in tokens] if len(i) > 2]

In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
# Import the TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Create vectorizer for Abstracts, max_df is set to 0.5, we only want
# to include terms that appear in less tha 50% of the documents (i.e. rare terms)
abs_tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=0, max_features=200000,
               stop_words='english', use_idf=True, tokenizer=tokenize_and_stem)

# Create vectorizer for Title, max_df is set to 0.5, we only want 
# to include terms that appear in less than 50% of the documents (i.e. rare terms)
title_tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=0, max_features=200000,
               stop_words='english', use_idf=True, tokenizer=tokenize_and_stem)

# Compute TF-IDF weights for Abstracts
tfidf_weights_abs = abs_tfidf_vectorizer.fit_transform(papers['clean_abstract'])

  'stop_words.' % sorted(inconsistent))


In [28]:
# Compute TF-IDF weights for Title
tfidf_weights_title = title_tfidf_vectorizer.fit_transform(papers['clean_title'])

# Get feature names for Abstract and Title models
tfidf_features_title = title_tfidf_vectorizer.get_feature_names()
tfidf_features_abs = abs_tfidf_vectorizer.get_feature_names()
            
    

  'stop_words.' % sorted(inconsistent))


In [29]:
tfidf_features_abs


['abnorm',
 'absenc',
 'absent',
 'abstract',
 'abus',
 'acclim',
 'accord',
 'account',
 'accur',
 'accuraci',
 'achiev',
 'acut',
 'addit',
 'address',
 'adhd',
 'adjust',
 'administr',
 'adolesc',
 'adult',
 'advanc',
 'advantag',
 'advers',
 'affect',
 'age',
 'agent',
 'aggress',
 'aid',
 'aim',
 'alcohol',
 'allostat',
 'allow',
 'alreadi',
 'alter',
 'alway',
 'amen',
 'analyz',
 'ani',
 'anoth',
 'anteced',
 'anterior',
 'antipsychot',
 'anxieti',
 'apart',
 'apn',
 'appear',
 'appli',
 'apprais',
 'appreci',
 'approach',
 'appropri',
 'approxim',
 'area',
 'aripiprazol',
 'aris',
 'articl',
 'asenapin',
 'aspect',
 'assess',
 'assist',
 'associ',
 'assum',
 'attempt',
 'attent',
 'atyp',
 'author',
 'avail',
 'averag',
 'awar',
 'axi',
 'background',
 'base',
 'basi',
 'batteri',
 'becaus',
 'befor',
 'begin',
 'begun',
 'behavior',
 'besid',
 'best',
 'better',
 'bibliographi',
 'biolog',
 'biomark',
 'bodi',
 'book',
 'borderlin',
 'bpd',
 'brain',
 'brief',
 'briefli',
 'br

In [30]:
# Function for returning the top_k features of an Abstract
# or Title
def get_top_features(rownum, weights, features, top_k=10):
    weight_vec = weights.toarray()[rownum,:]
    top_idx = np.argsort(weight_vec)[::-1][:top_k]
    return [features[i] for i in top_idx]

# Top k features of Abstract 1
get_top_features(1, tfidf_weights_abs, tfidf_features_abs)

['cognit',
 'strategi',
 'impair',
 'dysfunct',
 'prevent',
 'effect',
 'treat',
 'psychosoci',
 'develop',
 'potenti']

In [31]:
# Top k features of Title 1
get_top_features(1, tfidf_weights_title, tfidf_features_title)

['impair',
 'prevent',
 'strategi',
 'cognit',
 'treatment',
 'earli',
 'diagnos',
 'diagnosi',
 'differ',
 'differenti']

In [32]:

# Build model to return 5 closest neighbors
from sklearn.neighbors import NearestNeighbors

# Create the k-NN model using k=5
nn_abs = NearestNeighbors(n_neighbors=5, algorithm='auto')
nn_title = NearestNeighbors(n_neighbors=5, algorithm='auto')

# Fit the models to the TF-IDF weights matrix
nn_fitted_abs = nn_abs.fit(tfidf_weights_abs)
nn_fitted_title = nn_title.fit(tfidf_weights_title)

# function to return the top-k nearest papers

def find_nearest_papers(row, kNNmodel, tfidf_weights, tfidf_features, papers):
    keywords = get_top_features(row, tfidf_weights, tfidf_features)
    dist,idx = kNNmodel.kneighbors(tfidf_weights[row,:])
    idx = list(idx[0])
    return {'papers':papers.iloc[idx], 'keywords':keywords}

In [33]:
find_nearest_papers(1, nn_fitted_abs, tfidf_weights_abs, tfidf_features_abs, papers)['papers']

Unnamed: 0,title,authors,abstract,clean_abstract,clean_title
1,Cognitive Impairment in Bipolar Disorder: Trea...,"[SolÃ© B, JimÃ©nez E, Torrent C, Reinares M, B...","Over the last decade, there has been a growing...",over the last decade there has been a growing ...,cognitive impairment in bipolar disorder treat...
22,"Improving Functioning, Quality of Life, and We...","[BonnÃ­n CDM, Reinares M, MartÃ­nez-ArÃ¡n A, J...",People with bipolar disorder frequently experi...,people with bipolar disorder frequently experi...,improving functioning quality of life and well...
19,Cognitive aging of bipolar patients.,"[Kohler S, Marlinge E]","Bipolar disorder is a severe, recurrent mood d...",bipolar disorder is a severe recurrent mood di...,cognitive aging of bipolar patients
25,The Lithium Battery: assessing the neurocognit...,"[Malhi GS, McAulay C, Gershon S, Gessler D, Fr...",OBJECTIVE: The aim of the present study was to...,objective the aim of the present study was to ...,the lithium battery assessing the neurocogniti...
21,Longitudinal changes in the antecedent and ear...,"[Pfennig A, Leopold K, Ritter P, BÃ¶hme A, Sev...",OBJECTIVE: Prospective study designs ideally a...,objective prospective study designs ideally al...,longitudinal changes in the antecedent and ear...


In [34]:
find_nearest_papers(1, nn_fitted_title, tfidf_weights_title, tfidf_features_title, papers)['papers']

Unnamed: 0,title,authors,abstract,clean_abstract,clean_title
1,Cognitive Impairment in Bipolar Disorder: Trea...,"[SolÃ© B, JimÃ©nez E, Torrent C, Reinares M, B...","Over the last decade, there has been a growing...",over the last decade there has been a growing ...,cognitive impairment in bipolar disorder treat...
6,Bipolar disorder.,"[Smith DJ, Whitham EA, Ghaemi SN]",Bipolar disorder is a serious disorder of mood...,bipolar disorder is a serious disorder of mood...,bipolar disorder
24,Bipolar disorder.,"[Keck PE Jr, McElroy SL, Arnold LM]",Bipolar disorder (manic-depressive illness) is...,bipolar disorder manic depressive illness is a...,bipolar disorder
10,Bipolar disorder.,[Dilsaver SC],Bipolar illness is a serious heritable mood di...,bipolar illness is a serious heritable mood di...,bipolar disorder
19,Cognitive aging of bipolar patients.,"[Kohler S, Marlinge E]","Bipolar disorder is a severe, recurrent mood d...",bipolar disorder is a severe recurrent mood di...,cognitive aging of bipolar patients


In [35]:
title = "A contemporary review of obstructive sleep apnea." #provide actual name of a paper
papers[papers['title']==title]

Unnamed: 0,title,authors,abstract,clean_abstract,clean_title


In [36]:
nearest_papers = find_nearest_papers(4, nn_fitted_abs, tfidf_weights_abs, tfidf_features_abs, papers)
for i in nearest_papers['keywords']: print ("Keywords: ", i)

Keywords:  rodent
Keywords:  imag
Keywords:  behavior
Keywords:  factor
Keywords:  biolog
Keywords:  genet
Keywords:  recent
Keywords:  epigenet
Keywords:  modern
Keywords:  pet


In [40]:
# Show the abstracts of similar papers
for i in nearest_papers['papers']['abstract']: print ("Abstract: "+i+"\n")

Abstract: Characterized by the switch of manic and depressive phases, bipolar disorder was described as early as the fifth century BC. Nevertheless up to date, the underlying neurobiology is still largely unclear, assuming a multifactor genesis with both biological-genetic and psychosocial factors. Significant process has been achieved in recent years in researching the causes of bipolar disorder with modern molecular biological (e.g., genetic and epigenetic studies) and imaging techniques (e.g., positron emission tomography (PET) and functional magnetic resonance imaging (fMRI)). In this chapter we will first summarize our recent knowledge on the etiology of bipolar disorder. We then discuss how several factors observed to contribute to bipolar disorder in human patients can be manipulated to generate rodent models for bipolar disorder. Finally, we will give an overview on behavioral test that can be used to assess bipolar-disorder-like behavior in rodents.

Abstract: Bipolar disorder

In [38]:
a = [1,2,3,4,5,6]
a[::-1]

[6, 5, 4, 3, 2, 1]

In [39]:
a[0][2]

TypeError: 'int' object is not subscriptable