In [1]:
import pandas as pd

# Load the BioGRID .tab3 file
datapath = "/Users/sabye/Downloads/Codes/wgRFE-BIN/Data/"
biogrid_data = pd.read_csv(datapath + "BIOGRID-ALL-4.4.241.tab3.txt", sep='\t', low_memory=False)

# Extract the relevant columns
gene_a = biogrid_data['Official Symbol Interactor A']
gene_b = biogrid_data['Official Symbol Interactor B']

# Create a new dataframe with unique genes from both columns 'gene_a' and 'gene_b'
unique_genes = pd.DataFrame(pd.concat([gene_a, gene_b]).unique(), columns=['features'])

# Compute the degree of each node
degree_count = pd.concat([gene_a, gene_b]).value_counts().reset_index()
degree_count.columns = ['features', 'degree']

# Merge the degree count with the unique genes dataframe
unique_genes = unique_genes.merge(degree_count, on='features', how='left')

print("len:", len(unique_genes))

# Display the new dataframe with unique genes and their degrees
print(unique_genes.head())

len: 81115
  features  degree
0   MAP2K4   130.0
1     MYPN    40.0
2    ACVR1   143.0
3    GATA2   226.0
4     RPA2  3580.0


In [2]:
# Read the dataframe from the specified CSV file
ggi_cnv = pd.read_csv("/Users/sabye/Downloads/Codes/wgRFE-BIN/Result/Preprocessed_KIRC_cnv.csv")
ggi_cnv = ggi_cnv.set_index('sample')
ggi_cnv = ggi_cnv.iloc[:, :-1]  # All columns except the last one

# Save the name of features into a column named 'features'
ggi_cnv_features = pd.DataFrame(ggi_cnv.columns, columns=['features'])

# Merge with the ggi_mrna_features DataFrame
ggi_cnv_features = ggi_cnv_features.merge(unique_genes, on='features', how='left')

num_matched = (ggi_cnv_features['degree'] > 0).sum()
print("Number of CNV features with matched genes in BIN:", num_matched)

# Fill NaN values with 0 in case some features don't have a corresponding degree
ggi_cnv_features['degree'] = ggi_cnv_features['degree'].fillna(0)

print("len:", len(ggi_cnv_features))

# Display the new dataframe with features
print(ggi_cnv_features.head())

Number of CNV features with matched genes in BIN: 18575
len: 24776
  features  degree
0    ACAP3    21.0
1   ACTRT2    10.0
2     AGRN   278.0
3  ANKRD65     3.0
4   ATAD3A   402.0


In [3]:
# Read the dataframe from the specified CSV file
ggi_mrna = pd.read_csv("/Users/sabye/Downloads/Codes/wgRFE-BIN/Result/Preprocessed_KIRC_mrna.csv")
ggi_mrna = ggi_mrna.set_index('sample')
ggi_mrna = ggi_mrna.iloc[:, :-1]  # All columns except the last one

# Save the name of features into a column named 'features'
ggi_mrna_features = pd.DataFrame(ggi_mrna.columns, columns=['features'])

# Merge with the ggi_mrna_features DataFrame
ggi_mrna_features = ggi_mrna_features.merge(unique_genes, on='features', how='left')

num_matched = (ggi_mrna_features['degree'] > 0).sum()
print("Number of mrna features with matched genes in BIN:", num_matched)

# Fill NaN values with 0 in case some features don't have a corresponding degree
ggi_mrna_features['degree'] = ggi_mrna_features['degree'].fillna(0)

print("len:", len(ggi_mrna_features))

# Display the new dataframe with features
print(ggi_mrna_features.head())

Number of mrna features with matched genes in BIN: 17267
len: 20238
    features  degree
0  ARHGEF10L    62.0
1      HIF3A    11.0
2      RNF17    12.0
3      RNF10   103.0
4      RNF11   270.0


In [4]:
datapath_out = datapath = "/Users/sabye/Downloads/Codes/wgRFE-BIN/Result/"

In [5]:
max_possible_degree_cnv = len(ggi_cnv_features) - 1  # Maximum possible degree in an undirected graph
ggi_cnv_features['degree_centrality'] = ggi_cnv_features['degree'] / max_possible_degree_cnv

print("len:", len(ggi_cnv_features))
print(ggi_cnv_features.head(3))

ggi_cnv_features.to_csv(datapath_out + "KIRC_cnv_weights_ggi_degCen.csv")

print ("--------------------")

# Find genes with min, max and total degree
min_degree_cnv = ggi_cnv_features['degree'].min()
print ('mix:', min_degree_cnv)

max_degree_cnv = ggi_cnv_features['degree'].max()
print ('max:', max_degree_cnv)

total_degree_cnv = ggi_cnv_features['degree'].sum()
print(f"Total degree of all genes: {total_degree_cnv}") 

len: 24776
  features  degree  degree_centrality
0    ACAP3    21.0           0.000848
1   ACTRT2    10.0           0.000404
2     AGRN   278.0           0.011221
--------------------
mix: 0.0
max: 6030.0
Total degree of all genes: 2690961.0


In [6]:
max_possible_degree_mrna = len(ggi_mrna_features) - 1  # Maximum possible degree in an undirected graph
ggi_mrna_features['degree_centrality'] = ggi_mrna_features['degree'] / max_possible_degree_mrna

print("len:", len(ggi_mrna_features))
print(ggi_mrna_features.head(3))

ggi_mrna_features.to_csv(datapath_out + "KIRC_mrna_weights_ggi_degCen.csv")

print ("--------------------")

# Find genes with min, max and total degree
min_degree_mrna = ggi_mrna_features['degree'].min()
print ('mix:', min_degree_mrna)

max_degree_mrna = ggi_mrna_features['degree'].max()
print ('max:', max_degree_mrna)

total_degree_mrna = ggi_mrna_features['degree'].sum()
print(f"Total degree of all genes: {total_degree_mrna}") 

len: 20238
    features  degree  degree_centrality
0  ARHGEF10L    62.0           0.003064
1      HIF3A    11.0           0.000544
2      RNF17    12.0           0.000593
--------------------
mix: 0.0
max: 6030.0
Total degree of all genes: 2598133.0
