In [1]:
#pip install bio

In [8]:
import pandas as pd
import glob
import csv
import numpy as np
from Bio import Entrez
from os import listdir 
import os
import re
from matplotlib import pyplot as plt
import random

In [10]:
# Widen pandas' display limits so long pathway names and large result tables
# are rendered without truncation.
_display_options = {
    'display.max_rows': 6000,
    'display.max_columns': 200,
    'display.width': 800,
    'display.max_colwidth': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [19]:
# Stopword vocabularies for the three filtering rounds of text_process.
# Every entry is lower-case with no surrounding whitespace, because tokens are
# lower-cased and whitespace-split before they are compared against these sets.
_STOPWORDS_ROUND1 = frozenset({
    'cell', 'of', 'small', 'in', 'is', 'he', 'linked', 'to', 'an', 'from', 'by', 'on', 'the', 'or', 'like', 'layer', 'that', 'biology', 'peptide',
    'ii', 'groups', 'into', 'type', 'reactome', 'kegg', 'pathway', 'and', 'processing', 'diabetes', 'term',
    'signal', 'during', 'synthesis', 'secretion', 'cross', 'presentation', 'regulated', 'sodium', 'secreted', 'gap',
    'factors', 'pid', 'channel', 'transport', 'activation', 'molecules', 'expression', 'pre', 'absence', 'transmits', 'nucleus',
    'downstream', 'golgi', 'mutants', 'human', 'by', 'hormone', 'pathways', 'biocarta', 'st', 'sa', 'sig', 'long', 'go', 'class', 'other', '3', 'nuclear', 'metabolic',
    'connection', 'chain', 'trans', 'aggregation', 'through', 'mature', 'signals', 'and10', 'phases', 'adhesion', 'exposed', 'aggregation', 'mediated', 'via', 'role', 'second', 'oxygen', 'biological',
    'family', 'containing', 'receptors', 'disease', 'homo', 'highly', 'sapiens', 'diseases', 'associated', 'growth', 'elements', 'medial', 'antennae', 'cytoskeletal', 'rich', 'repeat', 'double', 'strand', 'break',
})
_STOPWORDS_ROUND2 = frozenset({
    'binding', 'protein', 'gr', 'chaperones', 'targetting', 'factor', 'remodelling', 'activity', 'attractive', 'activated', 'active', 'regulation', 'group', 'chemical', 'sensory', 'other', 'process', 'release',
    'species', 'receptor', 'positive', 'derived', 'compound', 'permeable', 'cellular', 'particle', 'organism', 'involved', 'movement', 'termination', 'phosphate',
    'interaction', 'glycosylation', 'environment', 'pathway', 'signaling', 'coupled', 'mrna', 'response', 'negative', 'elevation', 'cleavage', 'xenobiotics', 'cytochrome',
    'modified', 'response', 'left', 'right', 'formation', 'nucleotide', 'gene', 'complex', 'migration', 'transporters', 'death', 'signalling',
    'dependent', 'maintenance', 'process', 'acid', 'cancer', 'n', 'o', 'one', 'homologous', 'non', 'metabolism', 'biosynthesis', 'transcription', 'methionine',
    'tumor', 'necrosis', 'elongation', 'pol', 'splicing', 'carbon', 'pool', 'series', 'glycosphingolipid', 'salt', 'interactions', 'transcriptional', 'white', 'phosphorylation',
    'oxidative', 'ligand', 'noncanonical', 'transcript', 'cytosolic', 'levels', 'cascade', 'events', 'genomic', 'global', 'basal', 'organization', 'junction', 'extension', 'association',
    'life', 'alpha', 'cycle', 'degradation', 'production', 'stabilization', 'proteins', 'amino', 'messengers', 'hydroxylation', 'hormones', 'membrane', 'glucose', 'transendothelial',
})
_STOPWORDS_ROUND3 = frozenset({
    'dna', 'mediated', 'transport', 'replication', 'cytokine', 'rna', 'synapse', 'digestion', 'interferon', 'cysteine', 'anemia', 'nephrin', 'stimulation', 'induced', 'induction', 'mitochondrial', 'stimulates', 'cardiomyopathy',
    'differentiation', 'peptide', 'channels', 'subunit', 'chemokine', 'chemokines', 'activates', 'activated', 'elevation', 'phagocytosis', 'kinases', 'modification', 'post', 'platlet', 'origin', 'neurotransmitter',
    'systemic', 'incision', 'oxidation', 'development', 'early', 'stimulation', 'apoptosis', 'glycoproteins', 'infection', 'heterotrimer', 'targets', 'infection', 'proteasome', 'respiratory', 'system',
})


def text_process(func_word):
    """Reduce a pathway/function name to an ``OR``-joined keyword query.

    Steps, in order:
      1. Replace every run of non-alphanumeric characters with one space.
      2. Delete each space that is followed by digits, together with the digits.
      3. Lower-case, split on whitespace and drop single-character tokens.
      4. Drop round-1 stopwords.
      5. If at least 2 tokens remain: drop round-2 stopwords and remove
         repeated tokens, keeping the first occurrence.
      6. If at least 3 tokens remain: drop round-3 stopwords.

    Parameters
    ----------
    func_word : str
        Raw function name, e.g. ``"KEGG_TYPE_II_DIABETES_MELLITUS"``.

    Returns
    -------
    tuple of (str, str)
        ``(query, func_word)``: the surviving tokens joined with ``" OR "``
        (empty string when nothing survives) and the unmodified input.

    Raises
    ------
    TypeError
        If ``func_word`` is not a string (raised by ``re.sub``).
    """
    cleaned = re.sub(r'[^A-Za-z0-9]+', ' ', func_word)
    # NOTE(review): this also strips leading digits of a mixed token and glues
    # the remainder onto the previous word ("il 17a" -> "ila"); kept as-is
    # because the intended handling of such tokens is unclear -- confirm.
    cleaned = re.sub(r" \d+", "", cleaned)

    # Filtering on token length removes *every* single-character token, even
    # when several of them are adjacent.
    tokens = [tok for tok in cleaned.lower().split() if len(tok) > 1]
    tokens = [tok for tok in tokens if tok not in _STOPWORDS_ROUND1]

    if len(tokens) >= 2:
        tokens = [tok for tok in tokens if tok not in _STOPWORDS_ROUND2]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        tokens = list(dict.fromkeys(tokens))

    if len(tokens) >= 3:
        tokens = [tok for tok in tokens if tok not in _STOPWORDS_ROUND3]

    intact_func = func_word
    return ' OR '.join(tokens), intact_func

# func_name supplies the function name from the main file; row selects the
# row of `data` whose second column holds the gene/disease text.
def make_terms(data, func_name, row):
    """Build the search string ``"<function keywords> AND <gene text>"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose second column (position 1) holds whitespace-separated text.
    func_name : str
        Function name passed to ``text_process``.
    row : int
        Positional row index into ``data``.

    Returns
    -------
    tuple of (str, str, str)
        The combined search string, the keyword query returned by
        ``text_process``, and the unmodified function name.
    """
    func_word, intact_func = text_process(func_name)

    excluded_tokens = {'type'}
    kept_tokens = [
        token
        for token in data.iloc[row, 1].split()
        if token.lower() not in excluded_tokens
    ]
    gene_list = ' '.join(kept_tokens)

    gene_func_terms = ' '.join((func_word, "AND", gene_list))
    return gene_func_terms, func_word, intact_func


def co_occurance(terms):
    """Search PubMed for ``terms`` and report how many records match.

    The search is restricted to publication dates 1990-2022 and at most
    1000 IDs are requested per query.

    Parameters
    ----------
    terms : str
        Query string passed to ``Entrez.esearch`` as ``term``.

    Returns
    -------
    tuple of (int, list)
        The value of the ``"Count"`` field and the entries of ``"IdList"``
        from the parsed search result.
    """
    Entrez.email = "smadhu270@gmail.com"
    handle = Entrez.esearch(
        db="pubmed", term= terms, mindate=1990, maxdate=2022, datetype="pdat", usehistory="y", retmax='1000'
    )
    try:
        search_results = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails; this function is
        # called once per row, so unclosed handles would accumulate.
        handle.close()
    count = int(search_results["Count"])
    pmids = list(search_results["IdList"])
    print(terms)
    print("counts:", count)
    print("pmids:", len(pmids))
    return count, pmids

def mk_mats(data):
    """Create the two empty result tables, keyed by the first column of ``data``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``master_count``: one row per value in the first column of ``data``,
        with columns ``total_count``, ``func_name`` and ``disease_name``.
        ``pmid_mat``: one column per value in the first column of ``data``,
        with 2000 rows labelled 0..1999.
    """
    func_names = data.iloc[:, 0]
    summary_columns = ["total_count", "func_name", "disease_name"]
    pmid_row_labels = list(range(2000))

    master_count = pd.DataFrame(index=func_names, columns=summary_columns)
    pmid_mat = pd.DataFrame(index=pmid_row_labels, columns=func_names)
    return master_count, pmid_mat
def call_func(data):
    """Run the PubMed co-occurrence search for every row of ``data``.

    For each row, the search string is built with ``make_terms`` from the
    function name (first column) and the text in the second column. Rows
    whose search string has fewer than 4 whitespace-separated tokens are
    skipped without querying. Rows with a zero hit count are left empty in
    both result tables.

    Parameters
    ----------
    data : pandas.DataFrame
        First column: function name; second column: text combined with it.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``master_count`` (hit count, function name and second-column text per
        function) and ``pmid_mat`` (returned PubMed IDs, one column per
        function), both created by ``mk_mats``.
    """
    master_count, pmid_mat = mk_mats(data)
    for row in range(data.shape[0]):
        print("row_number:", row)
        func_name = data.iloc[row, 0]
        gene_func_list, _func_word, intact_func = make_terms(data, func_name, row)
        print(gene_func_list)
        # Fewer than 4 tokens means either side of "<keywords> AND <text>"
        # is empty or nearly so; skip rather than send a degenerate query.
        if len(str(gene_func_list).split()) < 4:
            continue

        counts, pmids = co_occurance(gene_func_list)
        if counts == 0:
            continue

        # Single-step .loc[row, col] writes into master_count itself; chained
        # .loc[row].loc[col] assigns to a temporary and can be silently lost.
        master_count.loc[intact_func, "total_count"] = counts
        master_count.loc[intact_func, "func_name"] = func_name
        master_count.loc[intact_func, "disease_name"] = data.iloc[row, 1]

        if pmids:
            # The Series is aligned on pmid_mat's row labels: IDs fill rows
            # 0..len(pmids)-1 and the remaining rows stay NaN.
            pmid_mat.loc[:, func_name] = pd.Series(pmids, dtype=object)
    return master_count, pmid_mat




In [31]:
###############       EXECUTION        ##################################

In [40]:
# Load the complete pathway list and the pathways of interest
# (tab-separated files without a header row).
path_lis = pd.read_csv("all_pathway_list.txt", sep = "\t", header = None, low_memory=False)
norm_pathway = pd.read_csv("bay_norm_path_nlp.txt", sep = "\t", header = None, low_memory=False)
sequence=pd.DataFrame(norm_pathway.iloc[:,0])
df1=path_lis

# Anti-join: outer merge on the shared column(s) with an indicator, then keep
# only the rows found in path_lis but not in norm_pathway ("left_only").
df1 = pd.merge(df1, sequence, how="outer",indicator=True)
df4 = df1.loc[df1["_merge"] == "left_only"].drop("_merge", axis=1)
original_pathway=df4

# Null set: a reproducible random sample of the remaining pathways, the same
# size as norm_pathway, each paired with the same disease label.
random.seed(44)
null_funcs = pd.DataFrame(random.sample(list(original_pathway.iloc[:,0]), norm_pathway.shape[0]))
# One label per sampled pathway, so that no row of null_data ends up with a
# missing func_name or disease_name.
a=pd.DataFrame(['Diabetes mellitus'] * len(null_funcs))
null_data = pd.concat([null_funcs, a],axis=1)
null_data.columns =['func_name', 'disease_name']


In [4]:
# Preview the first rows of the null table (columns: func_name, disease_name).
null_data.head()

In [2]:
# Run the PubMed search for every row of the null table.
# null_freq: per-function hit counts; null_pmid: returned PubMed IDs per function.
null_freq, null_pmid = call_func(null_data)

In [3]:
# Run the PubMed search for every row of norm_pathway.
# term_freq: per-function hit counts.
term_freq, pmid_mat = call_func(norm_pathway)  # pmid_mat: returned PubMed IDs, one column per function

In [44]:
# Write each result table to its CSV file in the working directory.
_csv_outputs = [
    ("bay_norm_pathways_matches.csv", term_freq),
    ("bay_norm_pathways_match_pids.csv", pmid_mat),
    ("bay_norm_null_pathways_matches.csv", null_freq),
    ("bay_norm_null_pathways_match_pids.csv", null_pmid),
]
for _csv_name, _result_table in _csv_outputs:
    _result_table.to_csv(_csv_name)