In [2]:
import pandas as pd
import plotly.graph_objects as go
import os

### COMBINE SUBCLUSTER FILES

In [26]:
# Folder with one neighbors CSV per SSN cluster. File names are expected to look like
# "<prefix>_cluster<N>.csv" (e.g. "neighbors_cluster5.csv"); the cluster number is
# parsed from the second "_"-separated token below.
# Forward slashes keep the path usable on every OS, and the name no longer shadows
# the builtin `dir`.
neighbors_dir = "data/20230416/neighbors"

cluster_frames = []
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the row
# order of the combined table reproducible.
for filename in sorted(os.listdir(neighbors_dir)):
    f = os.path.join(neighbors_dir, filename)
    if not os.path.isfile(f):
        continue  # skip sub-directories
    temp_df = pd.read_csv(f).rename(columns={'cluster_num': 'Subcluster Number'})
    # "neighbors_cluster5.csv" -> "cluster5" -> "5" (kept as a string)
    cluster_number_long = filename.split("_")[1].split('.')[0]
    cluster_number = cluster_number_long.replace("cluster", "")
    temp_df['SSN Cluster Number'] = cluster_number
    cluster_frames.append(temp_df)

# Concatenate once instead of growing the frame inside the loop (which re-copies
# every previously loaded row on each iteration). An empty folder still yields an
# empty DataFrame, as before.
df = pd.concat(cluster_frames, axis=0) if cluster_frames else pd.DataFrame()

In [30]:
# Display the column labels of the combined per-cluster table held in `df`.
df.columns

Index(['Unnamed: 0', 'sort_key', 'accession', 'id', 'num', 'family',
       'ipro_family', 'start', 'stop', 'rel_start', 'rel_stop', 'direction',
       'type', 'seq_len', 'taxon_id', 'anno_status', 'desc', 'family_desc',
       'ipro_family_desc', 'color', 'gene_key', 'SSN Cluster Number'],
      dtype='object')

In [32]:
# Persist the combined table. index=False: the row index is just the per-file row
# number repeated for every input file, and writing it makes the next read_csv
# produce a spurious "Unnamed: 0" column.
df.to_csv('data/neighbors.csv', index=False)

### Assign tags for transporter, regulator, and EamA neighbors

In [3]:
# Load the two input tables. Forward slashes are used so the same files are found
# on every OS: with a backslash, 'data\neighbors.csv' is a single file name on
# POSIX systems and would not match the 'data/neighbors.csv' written earlier.
attributes = pd.read_csv('data/attributes.csv')
neighbors = pd.read_csv('data/neighbors.csv')

In [6]:
def _desc_mentions(keyword):
    """Return a predicate that is True only for str values containing `keyword`.

    Non-string values (e.g. missing descriptions read as NaN) yield False.
    """
    def _check(value):
        if type(value) == str:
            return keyword in value
        return False
    return _check


# One boolean column per keyword searched (case-sensitively) in the 'desc' text.
for flag_column, keyword in (
    ('transporter', 'transport'),
    ('regulator', 'regulat'),
    ('EamA', 'EamA'),
):
    neighbors[flag_column] = neighbors['desc'].apply(_desc_mentions(keyword))

In [20]:
# Write the tagged table back. Forward slash: portable, and the same file the later
# cells read as './data/neighbors.csv'. index=False: otherwise every write/read
# round-trip adds another "Unnamed: N" column to the file.
neighbors.to_csv('data/neighbors.csv', index=False)

### get pairs of neighbors

In [3]:
# --- Parameters -------------------------------------------------------------
# THRESHOLD: half-width of the window applied to the 'num' column when collecting
# the neighbors of a flag protein (num - THRESHOLD .. num + THRESHOLD).
THRESHOLD = 10
# SSN: value of 'SSN Cluster Number' selecting the cluster to analyse.
SSN = 5

# --- Input tables -----------------------------------------------------------
neighbors = pd.read_csv('./data/neighbors.csv')
attributes = pd.read_csv('./data/attributes.csv')
subcluster_df = pd.read_csv('./data/AlaX_10neighbors.csv')

In [4]:
# Index labels of the `attributes` rows (the "flag" proteins) in the selected SSN cluster.
flag_proteins = attributes.loc[attributes['SSN Cluster Number'] == SSN].index
subcluster_df = subcluster_df.loc[subcluster_df['SSN Cluster Number'] == SSN]

# count_neighbors: family_desc -> number of flag proteins that have at least one
# neighbor with that description inside their window.
count_neighbors = {}
flag_neighbor_frames = []
for flag in flag_proteins:
    taxon = attributes.loc[flag, 'taxon_id']
    flag_num = attributes.loc[flag, 'num']
    # Neighbors = rows with the same taxon_id whose 'num' lies within +/- THRESHOLD
    # of the flag protein's 'num', ignoring rows whose family is the literal 'none'.
    in_window = (
        (neighbors['taxon_id'] == taxon)
        & (neighbors['num'] >= (flag_num - THRESHOLD))
        & (neighbors['num'] <= (flag_num + THRESHOLD))
        & (neighbors['family'] != 'none')
    )
    flag_neighbors = neighbors.loc[in_window]
    flag_neighbor_frames.append(flag_neighbors)
    # Each description counts once per flag protein. Missing descriptions are
    # skipped: a NaN key cannot be concatenated with '_presence' when the top
    # neighbors are turned into column names later on.
    for neighbor in flag_neighbors['family_desc'].dropna().unique():
        count_neighbors[neighbor] = count_neighbors.get(neighbor, 0) + 1

# Concatenate once rather than inside the loop. With no flag proteins, fall back to
# an empty slice of `neighbors` so the columns used by the groupby below still exist.
if flag_neighbor_frames:
    filtered_neighbors = pd.concat(flag_neighbor_frames, axis=0)
else:
    filtered_neighbors = neighbors.iloc[0:0]

# One row per taxon_id with the list of all neighbor descriptions collected for it.
neighbor_list = filtered_neighbors.groupby('taxon_id')['family_desc'].apply(list).reset_index(name="family_desc")

In [5]:
# Rank descriptions by their count, highest first (ties keep their insertion
# order), and keep the five best as {description: count}.
ranked_families = sorted(count_neighbors.items(), key=lambda item: item[1], reverse=True)
top_neighbors = {family: hits for family, hits in ranked_families[:5]}

In [26]:
# For each top neighbor, count which descriptions occur in the same taxon-level
# neighbor lists, keep the 10 most frequent, and line the counts up in one table
# (rows = co-occurring description, one column per top neighbor).
count_secondaryneighbors = pd.DataFrame(columns=['index'])
for neighbor in top_neighbors.keys():
    presence_col = neighbor + '_presence'
    # True where this taxon's description list contains the top neighbor.
    neighbor_list[presence_col] = neighbor_list['family_desc'].apply(
        lambda families: neighbor in families
    )
    temp = neighbor_list.loc[neighbor_list[presence_col]].explode('family_desc')
    # rename_axis('index') pins the key column name: pandas >= 2.0 names the
    # value_counts index after the source column, so a bare reset_index() no
    # longer produces the 'index' column that the merge below joins on.
    temp_secondary = (
        temp['family_desc']
        .value_counts(ascending=False)
        .rename_axis('index')
        .reset_index(name=neighbor)
        .iloc[:10, :]
    )
    count_secondaryneighbors = pd.merge(count_secondaryneighbors, temp_secondary, how="outer", on='index')
count_secondaryneighbors = count_secondaryneighbors.rename(columns={'index': 'pair'}).set_index('pair')

### Abandoned: collecting neighbors for every row of `attributes` (resulting file was too big)

In [4]:
# filtered_neighbors = pd.DataFrame()
# for flag in attributes.index:
#     taxon = attributes.loc[flag, 'taxon_id']
#     flag_num = attributes.loc[flag, 'num']
#     neighbors.loc[(neighbors['taxon_id']==taxon)&(neighbors['num']>=(flag_num-THRESHOLD))&(neighbors['num']<=(flag_num+THRESHOLD))&(neighbors['family']!='none'), 'flag'] = attributes.loc[flag, 'family_desc']
#     flag_neighbors = neighbors.loc[(neighbors['taxon_id']==taxon)&(neighbors['num']>=(flag_num-THRESHOLD))&(neighbors['num']<=(flag_num+THRESHOLD))&(neighbors['family']!='none')]
#     filtered_neighbors = pd.concat([filtered_neighbors, flag_neighbors], axis=0)

In [5]:
# filtered_neighbors.to_csv('filtered_neighbors.csv')

In [None]:
# neighbor_list = filtered_neighbors.groupby('taxon_id')['family_desc'].apply(list).reset_index(name="family_desc")
# neighbor_list['pair'] = neighbor_list['family_desc']
# neighbor_list = neighbor_list.explode('family_desc')
# neighbor_list = neighbor_list.explode('pair')

In [None]:
# top_neighbors = neighbor_list.loc[neighbor_list['family_desc']!=neighbor_list['pair'],['family_desc', 'pair']].value_counts(ascending=False).reset_index(name='count')

### SQLite file

In [2]:
import sqlite3
import pandas as pd

In [25]:

# Export one table of the SQLite file to CSV.
# Forward slashes keep both paths usable on every OS.
sqlite_path = 'data/20230416/SQLITE/25585_96473_230403_IPR018163_43_Cluster5_90_full_ssn_arrow_data_co0_ns10.sqlite'
table = 'neighbors'

# Create a SQL connection to our SQLite database
con = sqlite3.connect(sqlite_path)
try:
    # Exploration queries that can be run with `for row in con.execute(query): print(row)`:
    #   "SELECT * FROM sqlite_master WHERE type='table';"
    #   "SELECT * FROM {} LIMIT 5;".format(table)
    #   'PRAGMA table_info({});'.format(table)
    # `table` is a constant defined above, not user input, so formatting it into
    # the statement is acceptable here.
    df = pd.read_sql_query("SELECT * from {}".format(table), con)
finally:
    # Close the connection even when the query fails.
    con.close()

# index=False: the RangeIndex carries no information and would otherwise come back
# as an "Unnamed: 0" column when this file is read again.
df.to_csv('data/20230416/neighbors/neighbors_cluster5.csv', index=False)

In [42]:
# df.to_csv('data/cluster_degree.csv')

In [None]:
# attributes
# sqlite_sequence
# neighbors
# families
# cluster_degree
# metadata
# cluster_index
# uniref50_cluster_index
# uniref50_cluster_index
# uniref50_index
# uniref90_cluster_index
# uniref90_range
# uniref90_index

### Superseded: earlier per-cluster Pfam neighbor co-occurrence plots

In [14]:
# Load the Pfam-neighbor table and clean it in one chain:
#   1. drop rows without a '# of Pfam Neighbors' value,
#   2. drop rows whose 'Pfam Description' is the literal 'none',
#   3. order rows by ascending 'Co-occurrence'.
df = (
    pd.read_csv("230106_PF04073_Uniref50_60_300max_NeighborPFamperCluster.csv")
    .dropna(subset=['# of Pfam Neighbors'])
    .loc[lambda frame: frame['Pfam Description'] != 'none']
    .sort_values('Co-occurrence', ascending=True)
)

In [15]:
# df.to_csv('cleaned_data.csv')

In [22]:
# Alternative check (disabled): number of distinct 'shared name' values.
# len(df['shared name'].unique())
# Show the first 50 distinct 'SSN Cluster Number' values, in order of first appearance.
df['SSN Cluster Number'].unique()[0:50]

array([ 27,   4,  36,  74,  66,  23,  37,  28,  64,  11,  62,  24,  67,
        49,  54,  43,   9,  90,  34,  86,  89,  75,  78,  52,  96,  26,
         5,   3,  33,  38,  16,  84,  69,  42,  82,  97,  92,  20,  44,
        25,  40, 101,  51,  56,  91,  55,  48,  39,  63,  87], dtype=int64)

In [6]:
# NOTE: `make_bar` is defined in a later cell of this notebook; that cell has to be
# run before this one.
# Same cleaning as when the table was loaded, written without in-place mutation.
df = (
    df.dropna(subset=['# of Pfam Neighbors'])
    .loc[lambda frame: frame['Pfam Description'] != 'none']
    .sort_values('Co-occurrence', ascending=True)
)
# The loop variable is deliberately not called `SSN`: that name is the cluster
# parameter set elsewhere in this notebook and the loop used to overwrite it.
for ssn_cluster in sorted(df['SSN Cluster Number'].unique()):
    subset_df = df.loc[df['SSN Cluster Number'] == ssn_cluster]
    if len(subset_df['shared name'].unique()) >= 15:
        # Rows are sorted by ascending co-occurrence, so the last 15 distinct names
        # are the ones whose first appearance is latest in that ordering.
        top_shared_names = subset_df['shared name'].unique()[-15:]
        subset_df = subset_df.loc[subset_df['shared name'].isin(top_shared_names)]
    temp_fig = make_bar(subset_df)
    # `st` (streamlit) is never imported in this notebook, so st.plotly_chart raised
    # NameError; render the Plotly figure directly instead.
    temp_fig.show()

In [25]:
# Display the column labels of the frame currently bound to `df`.
df.columns

Index(['# of Pfam Neighbors', '# of Queries with Pfam Neighbors',
       '# of Sequences in SSN Cluster',
       '# of Sequences in SSN Cluster with Neighbors', 'Average Distance',
       'Co-occurrence', 'Co-occurrence Ratio',
       'Hub Average and Median Distance', 'Hub Co-occurrence and Ratio',
       'Hub Pfam Neighbors', 'Hub Queries with Pfam Neighbors',
       'Median Distance', 'name', 'node.fillColor', 'node.shape', 'node.size',
       'Pfam', 'Pfam Description', 'Query Accessions',
       'Query-Neighbor Accessions', 'Query-Neighbor Arrangement', 'selected',
       'shared name', 'SSN Cluster Number'],
      dtype='object')

In [26]:
# Show the distinct values found in the 'Hub Average and Median Distance' column.
df['Hub Average and Median Distance'].unique()

array([nan], dtype=object)

In [None]:
def make_bar(subset_df: pd.DataFrame) -> "go.Figure":
    """Build a horizontal bar chart of co-occurrence per Pfam description.

    Parameters
    ----------
    subset_df : pd.DataFrame
        Rows to plot. The columns read are 'Pfam Description', 'Co-occurrence',
        '# of Queries with Pfam Neighbors' (shown as hover text) and
        'SSN Cluster Number'. The title uses the cluster number of the first row,
        so the frame is expected to hold a single cluster.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with one horizontal bar trace. An empty `subset_df` produces a
        figure whose title shows "n/a" as the cluster number instead of raising
        IndexError.
    """
    if subset_df.empty:
        cluster_label = "n/a"
    else:
        cluster_label = str(subset_df['SSN Cluster Number'].iloc[0])

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            # With orientation='h' the bar length comes from x and the bar position
            # from y, so the numeric column goes on x (it was on y before, which
            # also contradicted the "Co-occurrence" x-axis title).
            x=subset_df['Co-occurrence'],
            y=subset_df['Pfam Description'],
            hovertext=subset_df['# of Queries with Pfam Neighbors'],
            hoverinfo="text",
            orientation='h',
        )
    )
    fig.update_layout(
        title="Co-occurrences for 60 AlnmtScore Subcluster " + cluster_label,
        xaxis_title="Co-occurrence",
        # NOTE(review): the y values are 'Pfam Description' while the title says
        # "Shared Name" - confirm which of the two is intended.
        yaxis_title="Shared Name",
    )
    return fig
