In [1]:
import numpy as np
import os

# Sanity check: the proteins listed in the affinity array and the proteins that
# have an alignment file on disk should be exactly the same set.

# All KIBA inputs live under one project-relative directory, so the notebook
# does not depend on a user-specific absolute path.
KIBA_DIR = os.path.join('data', 'kiba')

# Load the numpy array of protein and molecule affinities
protein_affinity_array = np.load(os.path.join(KIBA_DIR, 'affinity_data.npy'))

# Path to the aligned protein files
aligned_protein_path = os.path.join(KIBA_DIR, 'aln')

# Protein IDs that have an alignment: the file name up to the '.aln' suffix
aligned_protein_ids = [
    file.split('.aln')[0]
    for file in os.listdir(aligned_protein_path)
    if file.endswith('.aln')
]

# Protein IDs from the array (taken from its first column)
array_protein_ids = protein_affinity_array[:, 0]

# Proteins in the array but not in the aligned data
proteins_not_in_aligned = np.setdiff1d(array_protein_ids, aligned_protein_ids)

# Proteins in the aligned data but not in the array
proteins_not_in_array = np.setdiff1d(aligned_protein_ids, array_protein_ids)

# Results: both differences are expected to be empty
print("Proteins not in aligned data:", proteins_not_in_aligned)
print("Proteins not in array:", proteins_not_in_array)


Proteins not in aligned data: []
Proteins not in array: []


In [None]:
import os
import numpy as np
import pconsc4
import random
import tensorflow as tf
from keras import backend as K
from concurrent.futures import ProcessPoolExecutor

# Function to predict and save contact map for a single protein
def predict_single_protein(input_file, output_file):
    """Predict the contact map for one alignment file and save it as ``.npy``.

    The array is first written to a temporary file next to ``output_file`` and
    then moved into place with ``os.replace``. ``output_file`` therefore only
    ever exists as a completely written file, which matters because the caller
    skips any protein whose output file already exists.

    Prediction and saving errors are printed, not raised, so one bad input does
    not stop the remaining work. An error while loading the model is not
    caught and propagates to the caller.

    Args:
        input_file: Path of the alignment file passed to ``pconsc4.predict``.
        output_file: Destination path of the saved contact map.

    Returns:
        None.
    """
    # Load the pconsc4 model inside the separate process
    model = pconsc4.get_pconsc4()

    # Same directory as the target so the final rename stays on one filesystem;
    # the pid keeps concurrent workers from sharing a temp name.
    tmp_file = f'{output_file}.{os.getpid()}.tmp'

    try:
        print(f'Processing {input_file}')
        pred = pconsc4.predict(model, input_file)  # Predict the contact map

        # Write through a file handle: np.save would append '.npy' to a bare
        # path that does not already end with it.
        with open(tmp_file, 'wb') as handle:
            np.save(handle, pred['cmap'])
        os.replace(tmp_file, output_file)  # Publish the finished file in one step
        print(f'{output_file} saved.')

    except Exception as e:
        print(f'Error processing {input_file}: {e}')

    finally:
        # Drop a leftover partial file if prediction or saving failed
        try:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        except OSError:
            pass  # best-effort cleanup; must not block the session reset below

        # Clear the Keras session and reset the TensorFlow graph to release memory
        K.clear_session()
        tf.compat.v1.reset_default_graph()

def pconsc4Prediction(aligned_protein_path, contact_map_dir, num_workers=4):
    """Predict contact maps for every ``.aln`` file in a directory, in parallel.

    Each alignment ``<name>.aln`` is handed to ``predict_single_protein`` in a
    worker process and its result is stored as ``<name>.npy`` inside
    ``contact_map_dir``. Alignments whose output file already exists are
    skipped, so an interrupted run can simply be started again.

    Args:
        aligned_protein_path: Directory containing the ``.aln`` alignment files.
        contact_map_dir: Directory for the ``.npy`` outputs; created if missing.
        num_workers: Maximum number of worker processes in the pool.

    Returns:
        None.

    Raises:
        Exception: Whatever a worker task raised, re-raised by ``task.result()``.
    """
    # Ensure the contact maps directory exists (exist_ok avoids a
    # check-then-create race if it appears in between)
    os.makedirs(contact_map_dir, exist_ok=True)

    # Only alignment files are inputs; ignore anything else in the directory
    file_list = [name for name in os.listdir(aligned_protein_path) if name.endswith('.aln')]
    random.shuffle(file_list)  # Randomize the order of processing

    tasks = []
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        for file in file_list:
            input_file = os.path.join(aligned_protein_path, file)
            output_file = os.path.join(contact_map_dir, file.split('.aln')[0] + '.npy')  # Store as .npy file

            if os.path.exists(output_file):
                continue  # Skip if the contact map already exists

            # Submit tasks to process the files in parallel
            tasks.append(executor.submit(predict_single_protein, input_file, output_file))

        # Wait for all tasks to complete; result() re-raises a worker's exception here
        for task in tasks:
            task.result()

# Input alignments and output directory for the KIBA contact maps
aligned_protein_path = "data/kiba/aln"
contact_map_dir = "KibaContactMaps"

# Launch the batch prediction with a pool of four worker processes
pconsc4Prediction(
    aligned_protein_path=aligned_protein_path,
    contact_map_dir=contact_map_dir,
    num_workers=4,
)


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])




























2024-09-28 12:04:16.618209: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-28 12:04:16.618269: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-28 12:04:16.626497: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2799925000 Hz
2024-09-28 12:04:16.627399: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5de8580582b0 executing computations on platform Host. Devices:
2024-09-28 12:04:16.627459: I tensorflow/compiler/xl

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.












Processing data/kiba/aln/Q96L34.aln
Processing data/kiba/aln/Q16513.aln
Processing data/kiba/aln/Q14289.aln
Processing data/kiba/aln/O00329.aln


OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0
OMP: Info #216: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #157: KMP_AFFINITY: 1 available OS procs
OMP: Info #158: KMP_AFFINITY: Uniform topology
OMP: Info #287: KMP_AFFINITY: topology layer "LL cache" is equivalent to "socket".
OMP: Info #287: KMP_AFFINITY: topology layer "L3 cache" is equivalent to "socket".
OMP: Info #287: KMP_AFFINITY: topology layer "L2 cache" is equivalent to "core".
OMP: Info #287: KMP_AFFINITY: topology layer "L1 cache" is equivalent to "core".
OMP: Info #192: KMP_AFFINITY: 1 socket x 1 core/socket x 1 thread/core (1 total cores)
OMP: Info #218: KMP_AFFINITY: OS proc to physical thread map:
OMP: Info #172: KMP_AFFINITY: OS proc 0 maps to socket 0 core 0 thread 0 
OMP: Info #254: KMP_AFFINITY: pid 85516 tid 85516 thread 0 bound to OS proc set 0
OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0
OMP: Info #216: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #157: KMP_AFFINITY: 1 available

In [1]:
# Known-good version: runs on a single CPU core (one subprocess per protein, executed sequentially).
# import os
# import numpy as np
# import pconsc4
# import random
# import tensorflow as tf
# from multiprocessing import Process

# def predict_single_protein(input_file, output_file):
#     # Load the pconsc4 model within the separate process
#     model = pconsc4.get_pconsc4()

#     try:
#         print(f'Processing {input_file}')
#         pred = pconsc4.predict(model, input_file)  # Predict the contact map
#         np.save(output_file, pred['cmap'])  # Save the contact map in .npy format
#         print(f'{output_file} saved.')

#     except Exception as e:
#         print(f'Error processing {input_file}: {e}')
#     finally:
#         # Clear the Keras session to release memory
#         tf.keras.backend.clear_session()
#         tf.compat.v1.reset_default_graph()

# def pconsc4Prediction(aligned_protein_path, contact_map_dir):
#     # Ensure the contact maps directory exists
#     if not os.path.exists(contact_map_dir):
#         os.makedirs(contact_map_dir)

#     # Get list of aligned protein files
#     file_list = os.listdir(aligned_protein_path)
#     random.shuffle(file_list)  # Randomize the order of processing

#     # Iterate through each alignment file and spawn a separate process
#     for file in file_list:
#         input_file = os.path.join(aligned_protein_path, file)
#         output_file = os.path.join(contact_map_dir, file.split('.aln')[0] + '.npy')  # Store as .npy file

#         if os.path.exists(output_file):
#             continue  # Skip if the contact map already exists

#         # Spawn a new process for each prediction
#         p = Process(target=predict_single_protein, args=(input_file, output_file))
#         p.start()
#         p.join()  # Wait for the process to finish before moving to the next file

# # Paths
# aligned_protein_path = "data/kiba/aln"
# contact_map_dir = "KibaContactMaps"

# # Run the prediction
# pconsc4Prediction(aligned_protein_path, contact_map_dir)


In [None]:
# import os
# import numpy as np
# import pconsc4
# import random

# def pconsc4Prediction(aligned_protein_path, contact_map_dir):
#     model = pconsc4.get_pconsc4()  # Load the pconsc4 model
    
#     # Ensure the contact maps directory exists
#     if not os.path.exists(contact_map_dir):
#         os.makedirs(contact_map_dir)
    
#     # Get list of aligned protein files
#     file_list = os.listdir(aligned_protein_path)
#     random.shuffle(file_list)  # Randomize the order of processing
    
#     # Iterate through each alignment file and generate contact maps
#     for file in file_list:
#         input_file = os.path.join(aligned_protein_path, file)
#         output_file = os.path.join(contact_map_dir, file.split('.aln')[0] + '.npy')  # Store as .npy file
        
#         if os.path.exists(output_file):
#             continue  # Skip if the contact map already exists
        
#         try:
#             print(f'Processing {input_file}')
#             pred = pconsc4.predict(model, input_file)  # Predict the contact map
#             np.save(output_file, pred['cmap'])  # Save the contact map in .npy format
#             print(f'{output_file} saved.')
#         except Exception as e:
#             print(f'Error processing {file}: {e}')

# # Paths
# aligned_protein_path = "data/kiba/aln"
# contact_map_dir = "KibaContactMaps"

# # Run the prediction
# pconsc4Prediction(aligned_protein_path, contact_map_dir)


In [None]:
# import os
# import numpy as np
# import pconsc4
# import random
# import tensorflow as tf
# from keras import backend as K

# def pconsc4Prediction(aligned_protein_path, contact_map_dir):
#     model = pconsc4.get_pconsc4()  # Load the pconsc4 model

#     # Ensure the contact maps directory exists
#     if not os.path.exists(contact_map_dir):
#         os.makedirs(contact_map_dir)
    
#     # Get list of aligned protein files
#     file_list = os.listdir(aligned_protein_path)
#     random.shuffle(file_list)  # Randomize the order of processing

#     # Iterate through each alignment file and generate contact maps
#     for file in file_list:
#         input_file = os.path.join(aligned_protein_path, file)
#         output_file = os.path.join(contact_map_dir, file.split('.aln')[0] + '.npy')  # Store as .npy file
        
#         if os.path.exists(output_file):
#             continue  # Skip if the contact map already exists
        
#         try:
#             print(f'Processing {input_file}')
#             pred = pconsc4.predict(model, input_file)  # Predict the contact map
#             np.save(output_file, pred['cmap'])  # Save the contact map in .npy format
#             print(f'{output_file} saved.')
            
#             # Clear Keras session to release memory
#             K.clear_session()
#             tf.compat.v1.reset_default_graph()
#         except Exception as e:
#             print(f'Error processing {file}: {e}')

# # Paths
# aligned_protein_path = "data/kiba/aln"
# contact_map_dir = "KibaContactMaps"

# # Run the prediction
# pconsc4Prediction(aligned_protein_path, contact_map_dir)


In [1]:
# import os
# import numpy as np
# import pconsc4
# import random
# import tensorflow as tf
# from concurrent.futures import ProcessPoolExecutor

# def predict_contact_map(model, input_file, output_file):
#     try:
#         print(f'Processing {input_file}')
        
#         # Predict the contact map
#         pred = pconsc4.predict(model, input_file)
        
#         # Save the contact map in .npy format
#         np.save(output_file, pred['cmap'])
#         print(f'{output_file} saved.')

#     except Exception as e:
#         print(f'Error processing {input_file}: {e}')
#     finally:
#         # Clear TensorFlow session to free up memory after each task
#         tf.keras.backend.clear_session()

# def process_in_parallel(aligned_protein_path, contact_map_dir, num_workers=4):
#     model = pconsc4.get_pconsc4()  # Load the pconsc4 model

#     # Ensure the contact maps directory exists
#     if not os.path.exists(contact_map_dir):
#         os.makedirs(contact_map_dir)

#     # Get list of aligned protein files
#     file_list = os.listdir(aligned_protein_path)
#     random.shuffle(file_list)  # Randomize the order of processing

#     # Use a process pool to handle parallel execution
#     with ProcessPoolExecutor(max_workers=num_workers) as executor:
#         futures = []
#         for file in file_list:
#             input_file = os.path.join(aligned_protein_path, file)
#             output_file = os.path.join(contact_map_dir, file.split('.aln')[0] + '.npy')
            
#             # Skip already processed files
#             if os.path.exists(output_file):
#                 continue

#             # Submit the job to the pool
#             futures.append(executor.submit(predict_contact_map, model, input_file, output_file))

#         # Wait for all futures to complete
#         for future in futures:
#             try:
#                 future.result()  # Ensure any exceptions are caught
#             except Exception as e:
#                 print(f'Error in parallel execution: {e}')

# # Paths
# aligned_protein_path = "/home/saeed/Documents/base paper implementation/data/kiba/aln"
# contact_map_dir = "/home/saeed/Documents/base paper implementation/KibaContactMaps"

# # Run the prediction with parallelism
# process_in_parallel(aligned_protein_path, contact_map_dir, num_workers=4)
