---
title: Metadata Extraction
tldr: This is a short description of the content and findings of the post.
---

### Importing libraries

In [None]:
import requests
from html.parser import HTMLParser
from bs4 import BeautifulSoup
#from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import pickle
#from itertools import chain
GEO_URL = "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="

### Fetching summary from GEO database of the GSE id

In [None]:
def get_summary_and_title(gse_id):
    """
    Scrape title, experiment type and summary of a GEO series page.

    Parameters
    ----------
    gse_id : str
         GEO series accession (e.g. 'GSE9452'); appended to GEO_URL

    Returns
    -------
    dict
        dictionary with the keys 'gse_id', 'title', 'exp_type'
        and 'abstract'

    Raises
    ------
    AttributeError
        if one of the expected fields cannot be found on the page
    """
    url = GEO_URL + gse_id

    # Without a timeout a stalled connection would block a batch run forever.
    page = requests.get(url, timeout=60)
    soup = BeautifulSoup(page.text, "html.parser")

    def field_text(label):
        # The value is read from the <td> that follows the <td> whose text
        # equals the label.
        label_cell = soup.find("td", text=label)
        value_cell = None
        if label_cell is not None:
            value_cell = label_cell.find_next_sibling("td")
        if value_cell is None:
            # Same exception type the bare attribute chain would raise on
            # None, but with a message that names the missing field.
            raise AttributeError(
                "GEO page for {} has no '{}' field".format(gse_id, label))
        return value_cell.text

    title = field_text("Title")
    exp_type = field_text("Experiment type")
    abstract = field_text("Summary")

    dataset_meta = {'gse_id': gse_id, 'title': title,
                    'exp_type': exp_type, 'abstract': abstract}
    return dataset_meta

In [None]:
#summary = get_summary_and_title('GSE9452')

In [None]:
#summary = summary['abstract']

### Fetching the metadata keywords from JENSEN API

In [None]:
def query_jensen_api(input_string):
    """
    Send free text to the JENSEN tagger and return the tagged entities.

    Parameters
    ----------
    input_string : str
         text in which entities should be tagged

    Returns
    -------
    pandas dataframe
        one row per tab-separated line of the response with the columns
        "Name", "Annotation" and "Identifier"; rows repeating an earlier
        "Name" are dropped

    Notes
    -----
    NOTE(review): an empty response body yields a single one-field row,
    which pandas rejects against the three column names. Presumably this
    is how "no entities found" reaches callers; confirm before changing.
    """
    # Kept as a sequence and joined at call time: a backslash continuation
    # inside a string literal would embed the next line's indentation in
    # the query.
    entity_types = (-2, -25, -26, -27, -21, -22, -23, 0, -1, -3,
                    -11, -24, -28, -29, -30, -31, -36)

    # `params` lets requests URL-encode every value, so characters such as
    # '&', '#', '%' or '+' in the text cannot corrupt the query string.
    # Spaces are still sent as '+'.
    response = requests.get(
        'http://tagger.jensenlab.org/GetEntities',
        params={
            'document': input_string,
            'entity_types': ' '.join(str(code) for code in entity_types),
            'format': 'tsv',
        },
        timeout=60,
    )

    rows = [line.split('\t') for line in str(response.text).split("\n")]
    response_jensen = pd.DataFrame(
        rows, columns=["Name", "Annotation", "Identifier"])
    response_jensen_wo_duplicates = response_jensen.drop_duplicates(["Name"])
    return response_jensen_wo_duplicates

### Function to call all above functions to get the metadata keywords

In [None]:
def get_metadata(sig_id):
    """
    Run the full pipeline for one signature id.

    Resolves the signature to a GEO accession via get_creeds_response,
    fetches the GEO summary, tags it with the JENSEN API and returns the
    tagged words grouped by annotate_biomedical_entities.
    """
    geo_accession = get_creeds_response(sig_id)['geo_id']
    abstract = get_summary_and_title(geo_accession)['abstract']
    tagged_entities = query_jensen_api(abstract)
    return annotate_biomedical_entities(tagged_entities)


### Biomedical terms annotated by JENSEN

In [None]:
# Category label -> numeric entity-type code. annotate_biomedical_entities
# compares each row's annotation code against these values.
biomedicalTermsJensenAnnotated = {
    "APO_phenotypes": -28,
    "BTO_Tissues": -25,
    "DOID_Diseases": -26,
    "ENVO_environments": -27,
    "FYPO_phenotypes": -29,
    "GOBiologicalProcess": -21,
    "GOCellularComponent": -22,
    "GOMolecularFunction": -23,
    "GOOther": -24,
    "MPheno_phenotypes": -30,
    "NBO_behaviors": -31,
    "NCBI_Chemicals": -1,
    "NCBI_Species": -2,
    "NCBI_Species_Proteins": -3,
    "Wikipedia": -11,
    "mammalian_phenotypes": -36,
}

### Function to annotate biomedical entities

In [None]:
def annotate_biomedical_entities(response_jensen_wo_duplicates):
    """
    Group tagged words by the biomedical entity category they belong to.

    Parameters
    ----------
    response_jensen_wo_duplicates : pandas dataframe
         pandas dataframe whose first column holds the word and whose
         second column holds the numeric annotation code

    Returns
    -------
    dict
        dictionary where each entity category contains the list of
        words from the input that carry it; codes greater than 1 are
        filed under "Genes" and unmatched codes under "Not_Known"
    """
    annotated_dict = dict()
    if len(response_jensen_wo_duplicates) == 0:
        return annotated_dict

    # Invert the category table once instead of scanning it for every row.
    # setdefault keeps the first label if two labels ever share a code.
    label_by_code = dict()
    for label, code in biomedicalTermsJensenAnnotated.items():
        label_by_code.setdefault(code, label)

    # Select columns by position with .iloc: integer keys on a Series with
    # string labels rely on pandas' deprecated positional fallback.
    words = response_jensen_wo_duplicates.iloc[:, 0]
    codes = response_jensen_wo_duplicates.iloc[:, 1]

    for word, raw_code in zip(words, codes):
        code = int(raw_code)
        if code > 1:
            category = "Genes"
        else:
            category = label_by_code.get(code, "Not_Known")
        annotated_dict.setdefault(category, []).append(word)
    return annotated_dict


In [None]:
summary_jensen = query_jensen_api('We assessed MLL/SET proteins and found that SETD1A is required for survival of acute myeloid leukemia (AML) cells')
summary_jensen

In [None]:
#meta_data = annotate_biomedical_entities(summary_jensen)
#meta_data

In [None]:
#meta_list = list(meta_data.values())

In [None]:
#flat_list = [item for sublist in meta_list for item in sublist]

In [None]:
#flat_list

### Reading GEO ids

In [None]:
aml_microarray_geoids = pd.read_csv('GEO_AML_Microarray.csv')

### First few GEO ids

In [None]:
geo_ids = aml_microarray_geoids['gse_id']
geo_ids.head()

In [None]:
#summary = get_summary_and_title('GSE9452')
#summary = summary['abstract']
#annotate_biomedical_entities(summary_jensen)

### Fetching the metadata keywords for all GEO ids

In [None]:
# geoids_metadata = {}
# all_geoids_metadata = []
# invalid_geo_ids = []

# for i in range(len(geo_ids)):
    
#     try:    
#         summary = get_summary_and_title(geo_ids[i])
#         summary = summary['abstract']
    
#         jensen_output = query_jensen_api(summary)
    
#         meta_data = annotate_biomedical_entities(jensen_output)
#         meta_list = list(meta_data.values())
#         meta_data_list = [item for sublist in meta_list for item in sublist]
#         #print(meta_data)
        
#         all_geoids_metadata.extend(meta_data_list)
#         #print(all_geoids_metadata)
        
#         geoids_metadata[geo_ids[i]] = meta_data_list
#         #print(geoids_metadata)
        
#     except AssertionError: 
#         print('No output in {} from JENSEN'.format(geo_ids[i]))
#         invalid_geo_ids.append(geo_ids[i])

In [None]:
#geoids_metadata

### Serializing the above metadata using Pickle

In [None]:
file_Name = "jensen_metadata_raw"
#fileObject = open(file_Name,'wb') 

#pickle.dump(geoids_metadata, fileObject)  
#fileObject.close()

### Loading the Pickle metadata file

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only load files
# that were written by this notebook.
# The `with` block closes the file even if unpickling fails.
with open(file_Name, 'rb') as fileObject:
    a = pickle.load(fileObject)

### Assigning the pickle metadata file to the variable

In [None]:
geoids_metadata = a

# The cell that originally filled these two names is commented out, so the
# cells below would fail with NameError on a fresh run. Rebuild them from the
# loaded mapping, which keeps them consistent with `geoids_metadata`.
# NOTE(review): assumes the pickle maps each GSE id to its list of keywords
# and that ids missing from it are the ones JENSEN returned nothing for.
all_geoids_metadata = [keyword
                       for keywords in geoids_metadata.values()
                       for keyword in keywords]
invalid_geo_ids = [gse_id for gse_id in geo_ids
                   if gse_id not in geoids_metadata]

### List of GEO ids for which JENSEN did not return any metadata keywords

In [None]:
invalid_geo_ids

### Extracting the GEO ids for which metadata is present

In [None]:
valid_geo_ids = [x for x in geo_ids if x not in invalid_geo_ids]
len(valid_geo_ids)

### Looking at the metadata keywords

In [None]:
set(all_geoids_metadata)

### Creating the column names for our final matrix

In [None]:
cols = set(all_geoids_metadata)
column_names = list(cols)
column_names

### Creating the matrix with rows as GSE ids and columns as metadata keywords

In [None]:
dataset_metadata_df = pd.DataFrame(columns=column_names, index=valid_geo_ids)
dataset_metadata_df

### Filling the matrix with 1's and 0's where the metadata is present or absent, respectively

In [None]:
# Mark, for every GSE id (row), which keyword columns occur in its metadata.
for row_name in dataset_metadata_df.index[:len(valid_geo_ids)]:
    # Fetch the keywords once per row; a set makes each membership test
    # constant-time instead of scanning the list for every column.
    row_keywords = set(geoids_metadata[row_name])

    for col_name in column_names:
        dataset_metadata_df.loc[row_name, col_name] = (
            1 if col_name in row_keywords else 0)

In [None]:
#geoids_metadata[row_name]

### Looking at the matrix

In [None]:
dataset_metadata_df

### Serializing the matrix using Pickle

In [None]:
#import pickle

file_Name = "jensen_metadata"
#fileObject = open(file_Name,'wb') 

#pickle.dump(dataset_metadata_df, fileObject)  
#fileObject.close()

### Loading the pickle file

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only load files
# that were written by this notebook.
# The `with` block closes the file even if unpickling fails.
with open(file_Name, 'rb') as fileObject:
    b = pickle.load(fileObject)

### Assigning the pickle metadata file to the variable

In [None]:
dataset_metadata_df = b
dataset_metadata_df

### Shape of the matrix

In [None]:
b.shape

In [None]:
pd.options.display.max_rows = 5000

### Importing plotting libraries

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


### Frequencies of the metadata

In [None]:
col_sum = dataset_metadata_df.sum()
col_sum

### Sorting the frequencies

In [None]:
col_sum_sorted = col_sum.sort_values(ascending=False)
col_sum_sorted

### Top 50 most frequent metadata

In [None]:
col_sum_sorted[:50]

### 50 least frequent metadata

In [None]:
col_sum_sorted[-50:]

### Unique frequencies

In [None]:
np.unique(col_sum_sorted)

In [None]:
len(np.unique(col_sum_sorted))

### Plotting the frequencies in x-axis and no. of metadata with those frequencies in y-axis

In [None]:
#col_sum_10 = col_sum[col_sum >= 10]
#col_sum.plot()
sns.set(rc={'figure.figsize':(20,9)})

# Pass the values as `x=`: newer seaborn releases treat the first positional
# argument as `data`, which would plot something different.
g = sns.countplot(x=col_sum_sorted)
# Keep the Axes in `g` rather than the Text object set_title returns.
g.set_title("Metadata frequencies and their counts for JENSEN")
plt.xlabel("metadata frequencies")
plt.ylabel("no. of metadata with those frequencies")
g.figure.savefig("Metadata frequencies and their counts for JENSEN.png")


### No. of metadata present only once

In [None]:
len(col_sum_sorted[col_sum_sorted==1])

### Metadata present only once

In [None]:
col_sum_sorted[col_sum_sorted==1]

### No. of metadata present only 3 times

In [None]:
len(col_sum_sorted[col_sum_sorted==3])

### Metadata present only 3 times

In [None]:
col_sum_sorted[col_sum_sorted==3]

### No. of metadata present only 5 times

In [None]:
len(col_sum_sorted[col_sum_sorted==5])

### Metadata present only 5 times

In [None]:
col_sum_sorted[col_sum_sorted==5]

### No. of metadata present 10 times

In [None]:
len(col_sum_sorted[col_sum_sorted==10])

### Metadata present only 10 times

In [None]:
# Show the keywords themselves; the previous cell already reports the count.
col_sum_sorted[col_sum_sorted==10]

### Plotting metadata- 'bone marrow' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['bone marrow'])

### Plotting metadata- 'acute myeloid leukemia' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['acute myeloid leukemia'])

### Plotting metadata- 'CD34' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['CD34'])

### Plotting metadata- 'hematopoietic cells' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['hematopoietic cells'])

### Plotting metadata- 'AF9' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['AF9'])

### Plotting metadata- 'NPM1' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['NPM1'])

### Plotting metadata- 'CEBPA' and their counts

In [None]:
# `x=` is explicit because newer seaborn treats the first positional as `data`.
sns.countplot(x=dataset_metadata_df['CEBPA'])

### Plotting the metadata and their counts

In [None]:
#sns.barplot(x = "gene expression", y = "dataset_metadata_df", data = dataset_metadata_df)
#sns.barplot(data = dataset_metadata_df[['acute myeloid leukemia', 'gene expression', 'CEBPA']])

#ax = sns.barplot(x=col_sum_sorted, y=col_sum_sorted.index, data=col_sum_sorted)
#ax.set_xlabel(col_sum_sorted.index)
# Two-column frame for plotting: the old index moves into a column
# (selected as 'index' below) next to the counts in column 0.
col_sum_sorted_df = pd.DataFrame(col_sum_sorted).reset_index(level=0)

# Tall canvas so each horizontal bar gets its own row.
sns.set(rc={'figure.figsize': (15, 500)})
ax = sns.barplot(data=col_sum_sorted_df, x=0, y='index')
ax.set_xlabel('index')
ax.set_title('metadata and their presence in no.of datasets for JENSEN')

# Final axis labels; these replace the provisional x label set above.
plt.xlabel("no. of datasets")
plt.ylabel("metadata")
ax.figure.savefig(
    "metadata and their presence in no.of datasets for JENSEN.png")


### GSE id's with no. of metadata present

In [None]:
row_sum = dataset_metadata_df.sum(axis = 1)
row_sum

### Sorting the GSE ids in descending order w.r.t no. of metadata present

In [None]:
row_sum_sorted = row_sum.sort_values(ascending=False)
row_sum_sorted

### 20 GSE id's with most metadata 

In [None]:
row_sum_sorted[:20]

### 20 GSE id's with least metadata

In [None]:
row_sum_sorted[-20:]

### GSE id's with the metadata present

In [None]:
# Map each GSE id to the list of keyword columns flagged 1 in its row.
gse_meta_dict = {}

for gse in dataset_metadata_df.index:
    # Boolean mask over the keyword columns of this row.
    is_present = (dataset_metadata_df.loc[gse] == 1)
    # Indexing the mask with itself keeps only the True entries; their
    # index labels are the keyword names.
    gse_meta_dict[gse] = list(is_present[is_present].index)

In [None]:
gse_meta_dict

In [None]:
row_sum_sorted

### GSE ids and their no. of metadata present

In [None]:
# Two-column frame for plotting: the old index moves into a column
# (selected as 'index' below) next to the counts in column 0.
row_sum_sorted_df = pd.DataFrame(row_sum_sorted).reset_index(level=0)

# Tall canvas so each horizontal bar gets its own row.
sns.set(rc={'figure.figsize': (15, 100)})
ax = sns.barplot(data=row_sum_sorted_df, x=0, y='index')
ax.set_xlabel('index')
ax.set_title('GSE ids and their no. of metadata present for JENSEN')

# Final axis labels; these replace the provisional x label set above.
plt.xlabel("no. of metadata")
plt.ylabel("GSE ids")
ax.figure.savefig(
    "GSE ids and their no. of metadata present for JENSEN.png")

In [None]:
col_sum_sorted[col_sum_sorted <= 30].index

### Filtering out the metadata keywords which are very common

In [None]:
filtered_metadata = dataset_metadata_df[col_sum_sorted[col_sum_sorted <= 30].index]
filtered_metadata

### Getting the cosine similarity for our matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cos_sim_matrix = cosine_similarity(filtered_metadata)
cos_sim_matrix

In [None]:
import sys

# Current NumPy rejects a NaN threshold; sys.maxsize is the supported way to
# print arrays without summarisation.
np.set_printoptions(threshold=sys.maxsize)

print(pd.DataFrame(cos_sim_matrix))

### Shape of the similarity matrix

In [None]:
cos_sim_matrix.shape

### Converting the similarity matrix to a dataframe

In [None]:
cos_sim_matrix_df = pd.DataFrame(cos_sim_matrix)
cos_sim_matrix_df

### Giving the similarity matrix dataframe rows and columns as GSE ids

In [None]:
cos_sim_matrix_df.columns = filtered_metadata.index
cos_sim_matrix_df.index = filtered_metadata.index

In [None]:
cos_sim_matrix_df

In [None]:
cos_sim_matrix_df.shape

In [None]:
#df.loc['y'] = pandas.Series({'a':1, 'b':5, 'c':2, 'd':3})

In [None]:
#rowIndex = df.index[someRowNumber]
#df.loc[rowIndex, 'New Column Title'] = "some value"

In [None]:
#row_nm = dataset_metadata_df.index[1]
#row_nm

In [None]:
#geoids_metadata['GSE111678']

In [None]:
#dataset_metadata_df = pd.DataFrame()

#for i in range(len(geo_ids)):
    #dataset_metadata_df['geo_ids'] = geo_ids[i]
    

In [None]:
#unique_metadata = []

#for val in geoids_metadata.values(): 
    #if val in unique_metadata: 
        #continue 
    #else:
        #unique_metadata.append(val)

#print(unique_metadata)

In [None]:
#geoids_metadata.values()

In [None]:
#summary = get_summary_and_title('GSE97346')

In [None]:
#summary = summary['abstract']
#summary

In [None]:
#summary_jensen = query_jensen_api(summary)
#summary_jensen

In [None]:
#meta_data = annotate_biomedical_entities(summary_jensen)
#meta_data

In [None]:
#summary

In [None]:
#jen = query_jensen_api(summary)
#jen

In [None]:
#import matplotlib.pyplot as plt
#import seaborn as sns

#sns.distplot(dataset_metadata_df['NCBI_Species_Proteins'])

In [None]:
#dataset_metadata_df['NCBI_Species_Proteins']

In [None]:
#dataset_metadata_df.iloc[:, 0]