In [7]:
import pandas as pd

data = {
    'Gene': ['Gene1', 'Gene1', 'Gene1', 'Gene1', 'Gene1', 'Gene1', 'Gene2', 'Gene2', 'Gene2', 'Gene2', 'Gene3', 'Gene3', 'Gene3', 'Gene3','Gene3'],
    'TF': ['TF1', 'TF1', 'TF1', 'TF2', 'TF3', 'TF3', 'TF2', 'TF2', 'TF3', 'TF3', 'TF1', 'TF3', 'TF3', 'TF3','TF3'],
    'Start_Position': [100, 105, 130, 110, 105, 110, 200, 205, 210, 215, 300, 305, 310, 350, 355],
    'End_Position': [110, 115, 140, 120, 115, 120, 210, 215, 220, 225, 310, 315, 320, 360,365],
    'Strand': ['+', '+', '+', '-', '+', '-', '+', '+', '-', '-', '-', '-', '+', '-','+']
}
df1 = pd.DataFrame(data)

# Save DataFrame as a TSV file
df1.to_csv("output_data.tsv", sep='\t', index=False)

# Save DataFrame as an Excel file
df1.to_excel("output_data.xlsx", index=False)

In [8]:
#df = pd.read_excel("fimo.xlsx")
df = pd.read_csv("fimo.tsv",delimiter='\t')

def find_overlapping_intervals(start_pos, end_pos):
    if len(start_pos) == 0:
        return []
    
    # Create DataFrame from the input lists
    intervals = pd.DataFrame({
        'Start': start_pos,
        'End': end_pos
    })
    
    # Sort intervals by start position
    intervals = intervals.sort_values(by='Start').reset_index(drop=True)
    
    results = []
    current_start = intervals.loc[0, 'Start']
    current_end = intervals.loc[0, 'End']
    
    for i in range(1, len(intervals)):
        next_start = intervals.loc[i, 'Start']
        next_end = intervals.loc[i, 'End']
        
        if next_start <= current_end:
            # Extend the current interval
            current_end = max(current_end, next_end)
        else:
            # Record the non-overlapping interval
            results.append((current_start, current_end))
            current_start = next_start
            current_end = next_end
    
    # Append the last interval
    results.append((current_start, current_end))
    
    return results

In [9]:
# Initialize output list
output = []

# Loop through each unique gene-TF-strand combination
for (gene, tf, strand), subset_df in df.groupby(['sequence_name', 'motif_alt_id', 'strand']):
    start_positions = subset_df['start'].tolist()
    end_positions = subset_df['stop'].tolist()
    
    # Find overlapping intervals
    overlapping_intervals = find_overlapping_intervals(start_positions, end_positions)
    
    # Add results to output
    for start, end in overlapping_intervals:
        output.append([gene, tf, start, end])

# Convert to DataFrame
output_df = pd.DataFrame(output, columns=['Gene', 'TF', 'Start', 'End'])

# Output to .tsv and .xlsx files
output_df.to_csv('output_data_SS.tsv', sep='\t', index=False)
output_df.to_excel('output_data_SS.xlsx', index=False)

# Print the DataFrame
print("Refined Data:\n", output_df)

Refined Data:
                     Gene        TF  Start   End
0      Glyma.01G043300.1     AGL13    196   213
1      Glyma.01G043300.1     AGL27    887   900
2      Glyma.01G043300.1      AGL6    196   214
3      Glyma.01G043300.1      AGL6    874   892
4      Glyma.01G043300.1     AHL12    601   608
...                  ...       ...    ...   ...
72480  Glyma.20G163200.1  ZmbZIP96   1054  1064
72481  Glyma.20G163200.1   bHLH112   1166  1172
72482  Glyma.20G163200.1   bHLH130   1165  1172
72483  Glyma.20G163200.1  squamosa    102   114
72484  Glyma.20G163200.1  squamosa    120   132

[72485 rows x 4 columns]


In [10]:
#contingency table
df_play = pd.read_csv('output_data_ss.tsv',delimiter='\t')
df_play.fillna('NA',inplace=True)
cont_table_fimo = pd.crosstab(df_play['Gene'],df_play['TF'])#, margins=True)
print(cont_table_fimo)
#print(df_play)
cont_table_fimo.to_csv('cont_table_fimo_ss.tsv',sep='\t')
cont_table_fimo.to_excel('cont_table_fimo_ss.xlsx',engine='openpyxl')

TF                 ABF1  ABF2  ABF3  ABF4  ABI5  ABR1  AG  AGL1  AGL13  AGL15  \
Gene                                                                            
Glyma.01G043300.1     0     0     0     0     0     0   0     0      1      0   
Glyma.01G053800.1     2     2     2     0     2     0   0     0      5      2   
Glyma.01G056800.1     0     2     0     1     1     0   0     0      2      0   
Glyma.01G128100.1     0     0     0     0     0     0   2     0      5      1   
Glyma.01G189100.1     0     0     0     0     0     1   1     0      3      2   
...                 ...   ...   ...   ...   ...   ...  ..   ...    ...    ...   
Glyma.19G221700.1     0     2     0     2     1     0   0     0      1      0   
Glyma.19G254800.1     0     0     0     0     0     0   0     0      1      0   
Glyma.20G028000.1     0     0     0     0     0     0   1     0      2      1   
Glyma.20G030500.1     0     0     0     0     0     0   0     0      1      0   
Glyma.20G163200.1     1     

In [12]:
import numpy as np
from scipy.stats import spearmanr

# Load your contingency table, skipping the first column (gene names)
# Replace 'contingency_table.csv' with your actual file
contingency_table = pd.read_excel('cont_table_fimo_ss.xlsx', index_col=0)

# Drop the first column if it's the gene names column
tf_table = contingency_table.iloc[:, 1:]  # Exclude the first column

# Define the columns of interest
columns_of_interest = ['CAI', 'rENC']

# Prepare to store results
significant_correlations = []

# Iterate through each column of interest
for col in columns_of_interest:
    for tf in tf_table.columns:
        if tf != col:
            # Calculate Spearman's rank correlation and p-value
            corr, p_value = spearmanr(tf_table[col], tf_table[tf])
            
            # Filter based on statistical significance (e.g., p < 0.05)
            if p_value < 0.05:
                significant_correlations.append((col, tf, corr, p_value))

# Convert the results to a DataFrame for better readability
significant_df = pd.DataFrame(significant_correlations, columns=['Column_of_Interest', 'TF', 'Correlation', 'P-Value'])

# Sort by the strength of the correlation
significant_df = significant_df.sort_values(by='Correlation', ascending=False)

# Save the significant correlations to a file
significant_df.to_csv('significant_cai_nc_correlations_spearman.csv', index=False)

# Display the top few results
print(significant_df.head())


    Column_of_Interest      TF  Correlation       P-Value
61                 CAI    rENC     0.513522  1.685391e-13
104               rENC     CAI     0.513522  1.685391e-13
20                 CAI   ERF15     0.247036  8.281423e-04
64                rENC    ASR3     0.235171  1.483650e-03
38                 CAI  NAC047     0.214339  3.861255e-03


In [13]:
import pandas as pd
from scipy.stats import spearmanr

# Load your contingency table, skipping the first column (gene names)
# Replace 'contingency_table.xlsx' with your actual file
contingency_table = pd.read_excel('cont_table_fimo_ss.xlsx', index_col=0)

# Drop the first column if it's the gene names column
tf_table = contingency_table.iloc[:, 1:]  # Exclude the first column

# Define the columns of interest
columns_of_interest = ['rENC', 'CAI']

# Prepare dictionaries to store significant correlations
significant_enc = {}
significant_cai = {}

# Function to calculate significant correlations
def calculate_significant_correlations(column):
    significant = {}
    for tf in tf_table.columns:
        if tf != column:
            corr, p_value = spearmanr(tf_table[column], tf_table[tf])
            if p_value < 0.05:
                significant[tf] = (corr, p_value)
    return significant

# Calculate significant correlations with ENC
significant_enc = calculate_significant_correlations('rENC')

# Calculate significant correlations with CAI
significant_cai = calculate_significant_correlations('CAI')

# Find common TFs that are significant with both ENC and CAI
common_tfs = set(significant_enc.keys()).intersection(set(significant_cai.keys()))

# Prepare the results
common_tfs_results = []

for tf in common_tfs:
    enc_corr, enc_p_value = significant_enc[tf]
    cai_corr, cai_p_value = significant_cai[tf]
    common_tfs_results.append((tf, enc_corr, enc_p_value, cai_corr, cai_p_value))

# Convert the results to a DataFrame for better readability
common_tfs_df = pd.DataFrame(common_tfs_results, columns=['TF', 'ENC_Correlation', 'ENC_P-Value', 'CAI_Correlation', 'CAI_P-Value'])

# Sort by the strength of the correlation with ENC
common_tfs_df = common_tfs_df.sort_values(by='ENC_Correlation', ascending=False)

# Save the results to a file
common_tfs_df.to_csv('common_tfs_with_significant_correlations.csv', index=False)

# Display the results
print(common_tfs_df)


                         TF  ENC_Correlation  ENC_P-Value  CAI_Correlation  \
2                 PK06182.1         0.194959     0.008725         0.155839   
3                    ERF011         0.181976     0.014489         0.191182   
8                    NAC038         0.173325     0.019974         0.174723   
11                     HHO5         0.171394     0.021419         0.169751   
1                    ERF012         0.158967     0.033049         0.177103   
12                     IDD4         0.157717     0.034472         0.193619   
5   estExt_gwp_gw1.C_550163         0.153784     0.039290         0.153570   
9         PHYPADRAFT_182268         0.153784     0.039290         0.153570   
4                      MYB3         0.153063     0.040233         0.152176   
6                     ERF15         0.152237     0.041336         0.247036   
7                    DOF3.2        -0.150992     0.043045        -0.180553   
10                   DOF1.7        -0.167797     0.024352       