# Literature search for identified genes
### Aim:
Search the PubMed database for publications related to each identified gene and the cell cycle
<br>
### Output:
DataFrame (DF) of genes with communities enriched in cell cycle terms, exported as a CSV file

In [1]:
# Import packages and DepMap tools
import pandas as pd
import os
from DepMapTools.Mine import MineData
from DepMapTools.Networks import NetworkAnalysis
import networkx as nx

In [2]:
# Create the two DepMapTools helpers used below:
#   md - MineData, used for the publication search
#   na - NetworkAnalysis, used to build the interaction networks
md, na = MineData(), NetworkAnalysis()

In [3]:
# Read the CSV of identified genes from the project root directory (PRD)
PRD = '.'
path = os.path.join(PRD, 'identified_genes_final.csv')
df = pd.read_csv(path)

In [4]:
# Build one list of gene names per cell cycle term by filtering on the
# 'Term' column and taking the matching 'genes' values
g1 = df.loc[df['Term'] == 'G1/S', 'genes'].tolist()
g2 = df.loc[df['Term'] == 'G2/M', 'genes'].tolist()
ma = df.loc[df['Term'] == 'metaphase/anaphase', 'genes'].tolist()

In [5]:
# Generate the interactions for each term's gene list (G1/S, G2/M, meta/ana)
g1_int, g2_int, ma_int = (
    na.generate_interactions(gene_list) for gene_list in (g1, g2, ma)
)

In [6]:
# Turn each term's interactions into a network object
g1_net, g2_net, ma_net = (
    na.network_import(interactions)
    for interactions in (g1_int, g2_int, ma_int)
)

In [7]:
# Strip self-loop edges (an edge joining a node to itself) from every
# network; the graphs are modified in place
for _network in (g1_net, g2_net, ma_net):
    _network.remove_edges_from(nx.selfloop_edges(_network))

In [8]:
# Identify the isolated nodes (degree below 1, i.e. no edges) in each
# cell cycle term network
g1_iso = [node for node, degree in g1_net.degree() if degree < 1]
g2_iso = [node for node, degree in g2_net.degree() if degree < 1]
ma_iso = [node for node, degree in ma_net.degree() if degree < 1]

In [9]:
# Query publications for each term's gene list together with the
# 'cell cycle' search term; the e-mail address is passed through to
# MineData.get_publications (presumably contact details for the PubMed
# service - confirm against DepMapTools)
g1_results, g2_results, ma_results = (
    md.get_publications(gene_list, 'cell cycle', 'rp467@sussex.ac.uk')
    for gene_list in (g1, g2, ma)
)

In [10]:
# Extract the per-term rows of df (gene names plus their remaining
# columns), dropping the now-redundant 'Term' column
g1_score, g2_score, ma_score = (
    df[df['Term'] == term].drop(columns='Term')
    for term in ('G1/S', 'G2/M', 'metaphase/anaphase')
)

In [11]:
# Compile DFs
def _compile_term_df(results, scores, isolated, label):
    """Combine publication results, scores and isolation flags for one term.

    results  -- publication search output for the term's genes; the gene
                names are expected in its index (they become the 'index'
                column after reset_index)
    scores   -- rows of the identified-genes table for this term, with a
                'genes' column used as the merge key
    isolated -- nodes with no edges in the term's network
    label    -- value written to the 'Term' column
    """
    table = (
        pd.DataFrame(results, columns=['gene', 'gene_term'])
        .assign(Term=label)
        .reset_index(level=0)
        # NOTE(review): a right merge keeps every row of `scores`; a gene
        # missing from `results` would end up with NaN in 'index'/'Term'
        # once 'genes' is dropped - confirm that all genes get a result.
        .merge(scores, how='right', left_on='index', right_on='genes')
        .drop(columns='genes')
    )
    # Flag genes that have no edges in the term's network
    table['isolated_node'] = table['index'].isin(isolated)
    return table


# G1/S
df1 = _compile_term_df(g1_results, g1_score, g1_iso, 'G1/S')

# G2/M
df2 = _compile_term_df(g2_results, g2_score, g2_iso, 'G2/M')

# Meta/ana
df3 = _compile_term_df(ma_results, ma_score, ma_iso, 'Meta/Ana')

In [12]:
# Compile final DF
# Stack the three per-term tables into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
final_df = pd.concat([df1, df2, df3], ignore_index=True)
# Give the columns descriptive names for export ('isolated_node' is kept as is)
final_df = final_df.rename(columns={
    'index': 'gene_name',
    'gene': 'gene_pub',
    'gene_term': 'gene_term_pub',
    'Term': 'cell_cycle_term',
})

In [13]:
# Write the compiled table to CSV in the project root, without the row index
PRD = "."
export_path = os.path.join(PRD, 'identified_genes_publications.csv')
final_df.to_csv(export_path, index=False)