In [5]:
import pandas as pd
import os

# Load the CSV file into a DataFrame
file_path = "/Users/shayneskrtic/Desktop/data_copy.csv"
df = pd.read_csv(file_path)

# Drop duplicates based on the 'pdb_id' column
df = df.drop_duplicates(subset=['pdb_id'])

# Get the initial number of rows before dropping empty 'ec_numbers'
initial_rows = df.shape[0]

# Drop rows where 'ec_numbers' column is empty
df = df[df['ec_numbers'].apply(lambda x: len(eval(x)) > 0)]

# Get the final number of rows after dropping empty 'ec_numbers'
final_rows = df.shape[0]

# Calculate the number of dropped rows
dropped_rows = initial_rows - final_rows

print(f"Number of dropped rows: {dropped_rows}")

# Create new file name by appending "_dropped_null_ec" to the original file name
new_file_path = os.path.splitext(file_path)[0] + '_dropped_null_ec.csv'

# Write the DataFrame to a new CSV file
df.to_csv(new_file_path, index=False)

print(f"DataFrame has been written to {new_file_path}")


Number of dropped rows: 1537
DataFrame has been written to /Users/shayneskrtic/Desktop/data_copy_dropped_null_ec.csv


In [3]:
path = "/Users/shayneskrtic/Desktop/data_copy.csv"

In [2]:
import pandas as pd
import ast

def load_and_extract_signatures(csv_file_path):
    df = pd.read_csv(csv_file_path)
    unique_signatures = set()
    
    for motifs in df['motifs']:
        motifs_list = ast.literal_eval(motifs)
        for motif in motifs_list:
            unique_signatures.add(motif['signature_ac'])
    
    unique_signatures_list = sorted(list(unique_signatures))
    return unique_signatures_list


csv_file_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec.csv'
unique_signatures = load_and_extract_signatures(csv_file_path)
print(len(unique_signatures))


392


In [3]:
import pandas as pd
import ast
aa_groups = {
    'Hydrophobic': ['A', 'V', 'I', 'L', 'M', 'F', 'W', 'Y'],
    'Polar': ['S', 'T', 'N', 'Q', 'C'],
    'Basic': ['K', 'R', 'H'],
    'Acidic': ['D', 'E'],
    'Special': ['G', 'P']
}

def group_aa_frequencies(aa_percent, aa_groups):
    grouped_data = {group: 0 for group in aa_groups.keys()}
    for aa, freq in aa_percent.items():
        for group, aas in aa_groups.items():
            if aa in aas:
                grouped_data[group] += freq
                break
    return grouped_data
def process_csv(input_csv_path, output_csv_path):
    df = pd.read_csv(input_csv_path)
    
    hydrophobic = []
    polar = []
    basic = []
    acidic = []
    special = []
    
    # Iterate over each row in the DataFrame
    for aa_percent_str in df['amino_acid_Percent']:
        # Convert the string representation of the dictionary to an actual dictionary
        aa_percent = ast.literal_eval(aa_percent_str)
        
        # Group the frequencies
        grouped_frequencies = group_aa_frequencies(aa_percent, aa_groups)
        
        # Append the grouped frequencies to the respective lists
        hydrophobic.append(grouped_frequencies['Hydrophobic'])
        polar.append(grouped_frequencies['Polar'])
        basic.append(grouped_frequencies['Basic'])
        acidic.append(grouped_frequencies['Acidic'])
        special.append(grouped_frequencies['Special'])
    
    # Add the new columns to the DataFrame
    df['Hydrophobic'] = hydrophobic
    df['Polar'] = polar
    df['Basic'] = basic
    df['Acidic'] = acidic
    df['Special'] = special
    
    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv_path, index=False)

# Example usage
input_csv_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec.csv'
output_csv_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS.csv'
process_csv(input_csv_path, output_csv_path)
print(f"Processed CSV saved to {output_csv_path}")


Processed CSV saved to /Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS.csv


In [4]:
import pandas as pd

def load_and_clean_csv(input_csv_path, output_csv_path):
    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(input_csv_path)
    
    # Drop rows where 'percent_else' column value is less than 0
    df_cleaned = df[df['percent_else'] >= 0]
    
    # Save the cleaned DataFrame to a new CSV file
    df_cleaned.to_csv(output_csv_path, index=False)

# Example usage
input_csv_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS.csv'
output_csv_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible.csv'
load_and_clean_csv(input_csv_path, output_csv_path)
print(f"Cleaned CSV saved to {output_csv_path}")


Cleaned CSV saved to /Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible.csv


In [5]:
import pandas as pd
from Bio.PDB import PDBList, PDBParser
import os

def get_oligomeric_state(pdb_id):
    # Download the PDB file
    pdbl = PDBList()
    pdb_file = pdbl.retrieve_pdb_file(pdb_id, file_format='pdb')
    
    # Parse the PDB file
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_id, pdb_file)
    
    # Get the number of chains
    chains = set()
    for model in structure:
        for chain in model:
            chains.add(chain.id)
    
    # Determine the oligomeric state
    num_chains = len(chains)
    if num_chains == 1:
        oligomeric_state = "Monomer"
    elif num_chains == 2:
        oligomeric_state = "Dimer"
    elif num_chains == 3:
        oligomeric_state = "Trimer"
    elif num_chains == 4:
        oligomeric_state = "Tetramer"
    else:
        oligomeric_state = f"Oligomer with {num_chains} chains"
    
    # Delete the PDB file
    os.remove(pdb_file)
    
    return oligomeric_state

def process_pdb_ids(input_csv_path, output_csv_path):
    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(input_csv_path)
    
    # Initialize a list to store the oligomeric states
    oligomeric_states = []
    
    # Iterate over each PDB ID in the DataFrame
    for pdb_id in df['pdb_id']:
        oligomeric_state = get_oligomeric_state(pdb_id)
        oligomeric_states.append(oligomeric_state)
    
    # Add the new column to the DataFrame
    df['oligomeric_state'] = oligomeric_states
    
    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv_path, index=False)

# Example usage
input_csv_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible.csv'
output_csv_path = '/Users/shayneskrtic/Desktop/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible_Chains.csv'
process_pdb_ids(input_csv_path, output_csv_path)
print(f"Processed CSV saved to {output_csv_path}")


Downloading PDB structure '1v48'...
Downloading PDB structure '4but'...
Downloading PDB structure '7mrr'...
Downloading PDB structure '4hl5'...
Downloading PDB structure '8s69'...
Downloading PDB structure '9f6j'...
Downloading PDB structure '7gph'...
Downloading PDB structure '2gkl'...
Downloading PDB structure '6eqw'...
Downloading PDB structure '8hht'...
Downloading PDB structure '7gfb'...
Downloading PDB structure '3cui'...
Downloading PDB structure '4ecz'...
Downloading PDB structure '3ero'...
Downloading PDB structure '1wop'...
Downloading PDB structure '4l7g'...
Downloading PDB structure '6y6j'...
Downloading PDB structure '7gf7'...
Downloading PDB structure '1u4o'...
Downloading PDB structure '7ghb'...
Downloading PDB structure '2pny'...
Downloading PDB structure '7a16'...
Downloading PDB structure '5rx6'...
Downloading PDB structure '9f6l'...
Downloading PDB structure '3t90'...
Downloading PDB structure '8bsd'...
Downloading PDB structure '8qew'...
Downloading PDB structure '4

In [8]:
import requests
from Bio import PDB

def get_protein_sequence_length(pdb_id):
    try:
        # Download the PDB file
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        response = requests.get(url)
        response.raise_for_status()
        
        # Save the PDB file locally
        pdb_file = f"{pdb_id}.pdb"
        with open(pdb_file, 'wb') as file:
            file.write(response.content)
        
        # Create a PDB parser
        parser = PDB.PDBParser(QUIET=True)
        
        # Parse the structure
        structure = parser.get_structure(pdb_id, pdb_file)
        
        # Get the sequence length
        sequence_length = 1
        for model in structure:
            for chain in model:
                for residue in chain:
                    if PDB.is_aa(residue):
                        sequence_length += 1
        
        return sequence_length
    except requests.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except FileNotFoundError:
        return f"FileNotFoundError: The PDB file for ID '{pdb_id}' could not be found."
    except Exception as e:
        return f"An error occurred: {e}"

# Example usage
pdb_id = "7MRR"  # Replace with your PDB ID
sequence_length = get_protein_sequence_length(pdb_id)
print(f"The sequence length of the protein with PDB ID {pdb_id} is {sequence_length}.")


The sequence length of the protein with PDB ID 7MRR is 310.


In [9]:
import csv
import requests
from Bio import PDB

def get_protein_sequence_length(pdb_id):
    try:
        # Download the PDB file
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        response = requests.get(url)
        response.raise_for_status()
        
        # Save the PDB file locally
        pdb_file = f"{pdb_id}.pdb"
        with open(pdb_file, 'wb') as file:
            file.write(response.content)
        
        # Create a PDB parser
        parser = PDB.PDBParser(QUIET=True)
        
        # Parse the structure
        structure = parser.get_structure(pdb_id, pdb_file)
        
        # Get the sequence length
        sequence_length = 0
        for model in structure:
            for chain in model:
                for residue in chain:
                    if PDB.is_aa(residue):
                        sequence_length += 1
        
        return sequence_length
    except requests.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except FileNotFoundError:
        return f"FileNotFoundError: The PDB file for ID '{pdb_id}' could not be found."
    except Exception as e:
        return f"An error occurred: {e}"

def add_sequence_length_to_csv(input_csv, output_csv):
    with open(input_csv, mode='r') as infile, open(output_csv, mode='w', newline='') as outfile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames + ['sequence_length']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for row in reader:
            pdb_id = row['pdb_id']
            sequence_length = get_protein_sequence_length(pdb_id)
            row['sequence_length'] = sequence_length
            writer.writerow(row)

# Example usage
input_csv = '/Users/shayneskrtic/Desktop/ML_DATA/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible_Chains.csv'  
output_csv = '/Users/shayneskrtic/Desktop/ML_DATA/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible_Chains_betterSeqLength.csv'  # Replace with your desired output CSV file path
add_sequence_length_to_csv(input_csv, output_csv)


In [11]:
import pandas as pd
import ast

def load_and_extract_signatures(csv_file_path):
    df = pd.read_csv(csv_file_path)
    unique_signatures = set()
    
    for motifs in df['motifs'].fillna('[]'):
        motifs_list = ast.literal_eval(motifs)
        for motif in motifs_list:
            unique_signatures.add(motif['signature_ac'])
    
    unique_signatures_list = sorted(list(unique_signatures))
    return unique_signatures_list

def one_hot_encode_motifs(df, unique_signatures):
    # Initialize a DataFrame with zeros
    one_hot_df = pd.DataFrame(0, index=df.index, columns=unique_signatures)
    
    for i, motifs in enumerate(df['motifs'].fillna('[]')):
        motifs_list = ast.literal_eval(motifs)
        for motif in motifs_list:
            signature_ac = motif['signature_ac']
            if signature_ac in one_hot_df.columns:
                one_hot_df.at[i, signature_ac] = 1
    
    return one_hot_df

# Load the original DataFrame
csv_file_path = '/Users/shayneskrtic/Desktop/ML_DATA/data_copy_dropped_null_ec_OHE_AAS_Dropped_impossible_Chains_betterSeqLength_newest.csv'
df = pd.read_csv(csv_file_path)

# Extract unique signatures
unique_signatures = load_and_extract_signatures(csv_file_path)

# One-hot encode the motifs
one_hot_df = one_hot_encode_motifs(df, unique_signatures)

# Merge the one-hot encoded DataFrame with the original DataFrame
result_df = pd.concat([df, one_hot_df], axis=1)

# Save the result to a new CSV file
output_csv_file_path = '/Users/shayneskrtic/Desktop/ML_DATA/data_with_one_hot_encoded_motifs.csv'
result_df.to_csv(output_csv_file_path, index=False)

print(f"One-hot encoded DataFrame saved to {output_csv_file_path}")


One-hot encoded DataFrame saved to /Users/shayneskrtic/Desktop/ML_DATA/data_with_one_hot_encoded_motifs.csv


In [12]:
import pandas as pd
import ast

def extract_first_two_ec_numbers(ec_numbers):
    if pd.isna(ec_numbers):
        return None, None
    ec_list = ast.literal_eval(ec_numbers)
    if not ec_list:
        return None, None
    first_ec = ec_list[0]
    first_two_numbers = first_ec.split('.')[:2]
    return int(first_two_numbers[0]), int(first_two_numbers[1])

def add_ec_number_columns(csv_file_path, output_csv_file_path):
    df = pd.read_csv(csv_file_path)
    
    df[['ec_first', 'ec_second']] = df['ec_numbers'].apply(
        lambda x: pd.Series(extract_first_two_ec_numbers(x))
    )
    
    df.to_csv(output_csv_file_path, index=False)
    print(f"Updated DataFrame saved to {output_csv_file_path}")

# Example usage
csv_file_path = '/Users/shayneskrtic/Desktop/ML_DATA/data_with_one_hot_encoded_motifs.csv'  # Replace with your input CSV file path
output_csv_file_path = '/Users/shayneskrtic/Desktop/ML_DATA/newest_data.csv'  # Replace with your desired output CSV file path
add_ec_number_columns(csv_file_path, output_csv_file_path)


Updated DataFrame saved to /Users/shayneskrtic/Desktop/ML_DATA/newest_data.csv
