In [4]:
import os
import re

import pandas as pd

# Load the raw dataset (tab-separated). The rest of this cell reads the
# "Entry", "Organism", "Sequence" and "Modified residue" columns from it.
df = pd.read_csv("dataset.tsv", sep="\t")

# Function to extract modified residues containing "arginine"
def extract_arginine_residues(residue_info):
    """Pull every arginine-related MOD_RES annotation out of a feature string.

    Parameters
    ----------
    residue_info : object
        Raw content of a "Modified residue" cell. Anything ``pd.isna`` flags
        as missing yields an empty list; any other value is converted with
        ``str`` before being searched.

    Returns
    -------
    list of tuple of (str, str)
        ``(position, note)`` pairs, in order of appearance, for each
        ``MOD_RES <digits>; /note="..."`` annotation whose note contains
        "arginine" (matched case-insensitively). The position is returned as
        the digit string captured from the annotation, not as an int.
    """
    if pd.isna(residue_info):
        return []

    # The note group may not cross a double quote, so each match stays inside
    # a single /note="..." value.
    arginine_note = re.compile(
        r'MOD_RES (\d+); /note="([^"]*arginine[^"]*)"',
        re.IGNORECASE,
    )
    annotation_text = str(residue_info)
    return arginine_note.findall(annotation_text)

# Explode the dataset into one row per arginine modification site.
# Each protein entry can carry several MOD_RES annotations; every match
# returned by extract_arginine_residues becomes its own output row.
output_columns = ["Entry", "Organism", "Sequence", "Modified residue", "Position"]
filtered_rows = []

for _, row in df.iterrows():
    for pos, residue_name in extract_arginine_residues(row["Modified residue"]):
        filtered_rows.append({
            "Entry": row["Entry"],
            "Organism": row["Organism"],
            "Sequence": row["Sequence"],
            "Modified residue": residue_name,  # Full note text, including any "; ..." qualifier
            "Position": int(pos)  # Position as written in the MOD_RES annotation
        })

# Passing the column list keeps the schema intact even when nothing matched,
# so code that indexes new_df by column name does not fail with a KeyError.
new_df = pd.DataFrame(filtered_rows, columns=output_columns)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
output_path = "data/full_arginine_data.tsv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
new_df.to_csv(output_path, sep="\t", index=False)

# Display the first few rows
print(new_df.head())


    Entry              Organism  \
0  A0AV96  Homo sapiens (Human)   
1  A0AV96  Homo sapiens (Human)   
2  A0AV96  Homo sapiens (Human)   
3  A0AV96  Homo sapiens (Human)   
4  A0AV96  Homo sapiens (Human)   

                                            Sequence  \
0  MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...   
1  MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...   
2  MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...   
3  MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...   
4  MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...   

                         Modified residue  Position  
0                  Omega-N-methylarginine       332  
1  Asymmetric dimethylarginine; alternate       394  
2       Omega-N-methylarginine; alternate       394  
3  Asymmetric dimethylarginine; alternate       405  
4       Omega-N-methylarginine; alternate       405  


In [5]:
# Split the arginine-modification table into one DataFrame per organism
# (human, mouse, rat, chimpanzee), dropping ADP-ribosylarginine sites.


def filter_organism(frame, organism, excluded_modification="ADP-ribosylarginine"):
    """Return the rows of `frame` for one organism, minus an excluded modification.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with "Organism" and "Modified residue" columns.
    organism : str
        Exact value of the "Organism" column to keep.
    excluded_modification : str
        Modification name to drop. It is compared case-insensitively against
        the part of the note before the first ";", so notes that carry a
        qualifier (for example "...; alternate") are dropped as well. A plain
        equality test on the whole note would keep them.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the original index labels preserved.
    """
    base_name = frame["Modified residue"].str.split(";").str[0].str.strip().str.lower()
    # Missing notes compare as "not equal", so those rows are kept, as before.
    keep = (frame["Organism"] == organism) & (base_name != excluded_modification.lower())
    return frame[keep]


human_df = filter_organism(new_df, "Homo sapiens (Human)")
mouse_df = filter_organism(new_df, "Mus musculus (Mouse)")
rat_df = filter_organism(new_df, "Rattus norvegicus (Rat)")
chimp_df = filter_organism(new_df, "Pan troglodytes (Chimpanzee)")

# Optionally, check the resulting DataFrames
print("Human DataFrame shape:", human_df.shape)
print("Mouse DataFrame shape:", mouse_df.shape)
print("Rat DataFrame shape:", rat_df.shape)
print("Chimp DataFrame shape:", chimp_df.shape)

Human DataFrame shape: (1403, 5)
Mouse DataFrame shape: (1361, 5)
Rat DataFrame shape: (614, 5)
Chimp DataFrame shape: (69, 5)


In [None]:
import os
import pandas as pd

# Per-organism tables built by the filtering cell: human_df, chimp_df, mouse_df, rat_df.
organism_dfs = {
    "Human": human_df,
    "Chimpanzee": chimp_df,
    "Mouse": mouse_df,
    "Rat": rat_df
}

# Modification types to export. Each is compared with the base name of the
# annotation (the text before the first ";").
modifications = [
    "Asymmetric dimethylarginine",
    "Dimethylated arginine",
    "Omega-N-methylarginine",
    "Symmetric dimethylarginine"
]

# Window length -> number of flanking residues taken on each side of the centre.
window_dict = {
    11: 5,   # 5 left, 5 right => 11-length
    15: 7,   # 7 left, 7 right => 15-length
    25: 12,  # 12 left, 12 right => 25-length
    51: 25   # 25 left, 25 right => 51-length
}


def _base_modification(notes):
    """Return each note's lower-cased modification name without "; ..." qualifiers."""
    return notes.str.split(";").str[0].str.strip().str.lower()


def _extract_window(sequence, center, offset):
    """Return the residues within `offset` of 0-based index `center`.

    The slice is clipped at the sequence ends, so windows centred near a
    terminus are shorter than 2 * offset + 1 (no padding is added).
    """
    return sequence[max(0, center - offset):min(len(sequence), center + offset + 1)]


# Loop over each organism and modification type.
# The loop variable is deliberately not called `df`, so the raw dataset loaded
# in the first cell is not overwritten.
for org, org_df in organism_dfs.items():
    base_names = _base_modification(org_df["Modified residue"])

    for mod in modifications:
        # Compare the base name for equality instead of a substring search:
        # "Asymmetric dimethylarginine" contains "symmetric dimethylarginine",
        # so a case-insensitive `contains` puts every asymmetric site into the
        # symmetric set.
        mod_df = org_df[base_names == mod.lower()]
        if mod_df.empty:
            continue

        # All 1-indexed positions annotated with this modification, per entry.
        # Every one of them is excluded from the negatives, so a known positive
        # site is not labelled negative on another row of the same protein.
        known_sites = {}
        for entry_id, site in zip(mod_df["Entry"], mod_df["Position"]):
            known_sites.setdefault(entry_id, set()).add(int(site))

        pos_data = []
        neg_data = []
        skipped = 0

        # Process each row in the filtered DataFrame.
        for _, row in mod_df.iterrows():
            sequence = row["Sequence"]
            mod_pos = int(row["Position"])  # 1-indexed position of the modified residue.
            entry = row["Entry"]

            # A missing sequence or a position outside it cannot yield a valid
            # window; skip the row instead of crashing or writing empty windows.
            if not isinstance(sequence, str) or not 1 <= mod_pos <= len(sequence):
                skipped += 1
                continue

            mod_index = mod_pos - 1  # Convert to 0-index for slicing.
            annotated = known_sites.get(entry, {mod_pos})

            # Negative candidates: 0-based indices of every "R" that is not an
            # annotated site of this modification in this entry.
            negative_indices = [
                i for i, aa in enumerate(sequence)
                if aa == "R" and (i + 1) not in annotated
            ]

            # Identifying columns shared by the positive and negative rows.
            shared = {
                "Entry": entry,
                "Organism": row["Organism"],
                "Modified residue": row["Modified residue"],
                "Modified_position": mod_pos
            }
            pos_row = dict(shared)
            neg_row = dict(shared)

            for win, offset in window_dict.items():
                pos_row[f"Positive_sequence_{win}"] = _extract_window(sequence, mod_index, offset)
                # Negative windows of one size are stored in a single cell,
                # joined with semicolons.
                neg_row[f"Negative_sequences_{win}"] = ";".join(
                    _extract_window(sequence, i, offset) for i in negative_indices
                )

            pos_data.append(pos_row)
            neg_data.append(neg_row)

        if skipped:
            print(f"Skipped {skipped} {org} {mod} rows with a missing sequence or out-of-range position.")
        if not pos_data:
            # Nothing usable for this combination; do not write column-less files.
            continue

        # Create DataFrames for positive and negative sequences.
        pos_df = pd.DataFrame(pos_data)
        neg_df = pd.DataFrame(neg_data)

        # Build the directory path, e.g., data/Human/Asymmetric_dimethylarginine
        mod_slug = mod.replace(" ", "_")
        folder = os.path.join("data", org, mod_slug)
        os.makedirs(folder, exist_ok=True)

        # Build file names.
        pos_file = os.path.join(folder, f"{org.lower()}_positive_{mod_slug.lower()}_sequences_combined.csv")
        neg_file = os.path.join(folder, f"{org.lower()}_negative_{mod_slug.lower()}_sequences_combined.csv")

        # Save the CSV files.
        pos_df.to_csv(pos_file, index=False)
        neg_df.to_csv(neg_file, index=False)

        print(f"Saved {org} {mod} positive and negative files (combined windows) in separate files.")


Saved Human Asymmetric dimethylarginine positive and negative files (combined windows) in separate files.
Saved Human Dimethylated arginine positive and negative files (combined windows) in separate files.
Saved Human Omega-N-methylarginine positive and negative files (combined windows) in separate files.
Saved Human Symmetric dimethylarginine positive and negative files (combined windows) in separate files.
Saved Chimpanzee Asymmetric dimethylarginine positive and negative files (combined windows) in separate files.
Saved Chimpanzee Dimethylated arginine positive and negative files (combined windows) in separate files.
Saved Chimpanzee Omega-N-methylarginine positive and negative files (combined windows) in separate files.
Saved Chimpanzee Symmetric dimethylarginine positive and negative files (combined windows) in separate files.
Saved Mouse Asymmetric dimethylarginine positive and negative files (combined windows) in separate files.
Saved Mouse Dimethylated arginine positive and neg

In [15]:
import os
import pandas as pd

# Summarise the shape of every CSV stored under data/<organism>/<modification>/
# and write the summary to an Excel workbook.

# Root directory that holds the organism folders.
root_folder = "data"

summary_columns = ["Organism", "Modification", "File", "Rows", "Columns", "File_Path"]
summary_list = []

# os.listdir returns entries in arbitrary order; sorting makes the summary
# reproducible between runs and machines.
for organism in sorted(os.listdir(root_folder)):
    organism_path = os.path.join(root_folder, organism)
    if not os.path.isdir(organism_path):
        continue  # Plain files in the root folder are not organism folders.

    for modification in sorted(os.listdir(organism_path)):
        modification_path = os.path.join(organism_path, modification)
        if not os.path.isdir(modification_path):
            continue

        for file in sorted(os.listdir(modification_path)):
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(modification_path, file)
            try:
                # The frame is not bound to `df`, so the dataset loaded in the
                # first cell is not overwritten.
                rows, cols = pd.read_csv(file_path).shape
            except Exception as exc:
                # Best effort: keep the file in the summary with empty counts,
                # but report the failure instead of hiding it.
                print(f"Could not read {file_path}: {exc}")
                rows, cols = None, None

            summary_list.append({
                "Organism": organism,
                "Modification": modification.replace("_", " "),
                "File": file,
                "Rows": rows,
                "Columns": cols,
                "File_Path": file_path
            })

# An explicit column list keeps the header row even when no CSV was found.
summary_df = pd.DataFrame(summary_list, columns=summary_columns)

# Save the summary DataFrame to an Excel file
output_excel = "summary_file_shapes.xlsx"
summary_df.to_excel(output_excel, index=False)

print(f"Summary of file shapes saved to {output_excel}")


Summary of file shapes saved to summary_file_shapes.xlsx
