# Prepare Data For Binary Classification

In [1]:
#Import All Libraries Here
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Record Constants
INPUT_TRAIN_IN = "../../../data/train_in.csv"
INPUT_TRAIN_OUT = "../../../data/train_out.csv"
INPUT_TEST_IN = "../../../data/test_in.csv"
INPUT_TEST_OUT = "../../../data/test_out.csv"
INPUT_VALIDATION_IN = "../../../data/valid_in_nucleo.csv"
INPUT_VALIDATION_OUT  = "../../../data/valid_out.csv"

In [3]:
#Read X Varaibles and Y Varaibles

x_train_raw =  pd.read_csv(INPUT_TRAIN_IN, header=None , skiprows=1 )
y_train_raw =  pd.read_csv(INPUT_TRAIN_OUT, header=None , skiprows=1 )

x_test_raw =  pd.read_csv(INPUT_TEST_IN, header=None , skiprows=1 )
y_test_raw =  pd.read_csv(INPUT_TEST_OUT, header=None , skiprows=1)

x_valid_raw =  pd.read_csv(INPUT_VALIDATION_IN, header=None , skiprows=1 )
y_valid_raw =  pd.read_csv(INPUT_VALIDATION_OUT, header=None , skiprows=1 )

x_data = pd.concat([x_train_raw, x_test_raw, x_valid_raw], axis=0, ignore_index=True)
y_data = pd.concat([y_train_raw, y_test_raw, y_valid_raw], axis=0, ignore_index=True)

print(f"Train Shape of X : {x_train_raw.shape} and Tranin Shape of Y : {x_train_raw.shape}")
print(f"Test Shape of X : {x_test_raw.shape} and Test Shape of Y : {y_test_raw.shape}")
print(f"Validation Shape of X : {x_valid_raw.shape} and Validation Shape of Y : {y_valid_raw.shape}")

Train Shape of X : (304661, 1001) and Tranin Shape of Y : (304661, 1001)
Test Shape of X : (1200, 1001) and Test Shape of Y : (1200, 12)
Validation Shape of X : (3599, 1001) and Validation Shape of Y : (3599, 12)


In [4]:
WINDOW_SIZE = 50 # Final Sequence Length would be 101 . Middle Index for Modification data.

middle_index = (x_train_raw.shape[1] // 2) + 1 # This is location for Modified Sequence . Use this as Y Target
STRAT_INEDX =middle_index - WINDOW_SIZE -1
END_INDEX =middle_index + WINDOW_SIZE
x_data_cropped =  x_data.iloc[:,STRAT_INEDX :END_INDEX]


In [31]:
x_data_cropped.shape # Shape should be 101

(309460, 101)

### Filter Data for Each Class

In [13]:
## Filter Dataset to Keep only Target Binary Class

RMs = ['hAm','hCm','hGm','hTm','hm1A','hm5C','hm5U','hm6A','hm6Am','hm7G','hPsi','Atol','NonMoD']

def convert_y_to_original_labels(row):
    label = ""
    for index , n in enumerate(row.tolist()) :
        if n == 1 :
            label = RMs[index]
    if label == '':
        return 'NonMoD'
    return label

def convert_y_for_binary_classification(row):
    label = ""
    for index , n in enumerate(row.tolist()) :
        if n == 1 :
            label = 1
    if label == '':
        return  0 
    return label

def prepare_data_for_binary_classification_for_each_class(x_data , y_data , prediction_class):
    # Convert One Hot Encoded Y to to Original Labels 
    y_original_labels = y_data.apply(convert_y_to_original_labels, axis=1)
    
    # Create a copy of x_data
    x_data_copy = x_data.copy()
    x_data_copy['Label'] = y_original_labels
    
    target_class = [prediction_class, 'NonMoD']      
    selected_rna_data = x_data_copy[x_data_copy['Label'].isin(target_class)] 

    y_filtered = selected_rna_data['Label']
    x_filtered = selected_rna_data.drop('Label', axis=1)

    return x_filtered , y_filtered 


def prepare_data_for_binary_classification_for_overall_class(x_data , y_data ):
    # Convert One Hot Encoded Y to to Original Labels 
    y_original_labels = y_data.apply(convert_y_for_binary_classification, axis=1)
    
    # Create a copy of x_data
    x_data_copy = x_data.copy()
    x_data_copy['Label'] = y_original_labels
    
    y_filtered = x_data_copy['Label']
    x_filtered = x_data_copy.drop('Label', axis=1)

    return x_filtered , y_filtered 

In [37]:
class_list = ['hAm','hCm','hGm','hTm','hm1A','hm5C','hm5U','hm6A','hm6Am','hm7G','hPsi','Atol']
ros = RandomOverSampler(random_state=42)

for target in class_list:
    x_data_filtered , y_data_filtered = prepare_data_for_binary_classification_for_each_class(x_data_cropped , y_data , target)
    x_resampled, y_resampled = ros.fit_resample(x_data_filtered, y_data_filtered)
    df_resampled = pd.concat([x_resampled, pd.Series(y_resampled, name='TARGET')], axis=1)
    file_name = "data/" + target+ "_with_ROS.csv"
    df_resampled.to_csv(file_name, index=False)
    
    

### Create 3 mer numerical encoding for all dataset and persist

In [4]:
concatenated_column= x_data.apply(lambda row: ''.join(map(str, row)), axis=1)
x_data_with_complete_sequence = x_data.assign(Sequence=concatenated_column)

In [8]:
kmer_dict = {}
encoding = 0
k = 3

for sequence in x_data_with_complete_sequence['Sequence']:
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i+k]
        if kmer not in kmer_dict:
            kmer_dict[kmer] = encoding
            encoding +=1

In [10]:
import pickle 

file_name = str(k) + "-mer-dictionary.pkl"
with open(file_name, 'wb') as f:
    pickle.dump(kmer_dict, f)
        
with open(file_name, 'rb') as f:
    loaded_dict = pickle.load(f)

In [11]:
print(f"Total Number of {k} mer Keys  : " , len(loaded_dict))
for key in loaded_dict:
    print(f"Key : {key} and Value : {loaded_dict[key]}")

Total Number of 2 mer Keys  :  24
Key : CC and Value : 0
Key : CA and Value : 1
Key : AT and Value : 2
Key : TA and Value : 3
Key : AC and Value : 4
Key : CT and Value : 5
Key : AG and Value : 6
Key : GT and Value : 7
Key : TC and Value : 8
Key : CG and Value : 9
Key : GG and Value : 10
Key : GC and Value : 11
Key : TG and Value : 12
Key : TT and Value : 13
Key : GA and Value : 14
Key : AA and Value : 15
Key : GN and Value : 16
Key : NC and Value : 17
Key : NN and Value : 18
Key : NG and Value : 19
Key : CN and Value : 20
Key : NA and Value : 21
Key : TN and Value : 22
Key : AN and Value : 23


### Prepare Datset For Binary Classification

In [8]:
WINDOW_SIZE = 50 # Final Sequence Length would be 101 . Middle Index for Modification data.

middle_index = (x_train_raw.shape[1] // 2) + 1 # This is location for Modified Sequence . Use this as Y Target
STRAT_INEDX =middle_index - WINDOW_SIZE -1
END_INDEX =middle_index + WINDOW_SIZE
x_data_cropped =  x_data.iloc[:,STRAT_INEDX :END_INDEX]


In [9]:
x_data_cropped

Unnamed: 0,450,451,452,453,454,455,456,457,458,459,...,541,542,543,544,545,546,547,548,549,550
0,T,T,G,C,C,A,C,A,C,T,...,G,C,A,G,T,A,T,C,T,C
1,T,T,T,G,A,A,A,A,A,A,...,C,T,C,A,T,C,G,T,G,C
2,A,G,A,A,A,C,A,T,T,C,...,T,T,T,C,T,G,T,T,C,A
3,T,T,A,G,T,T,T,T,A,C,...,G,A,A,A,A,A,T,T,T,C
4,C,A,A,C,A,G,A,A,G,T,...,T,A,A,A,A,T,G,T,A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309455,C,C,A,A,A,C,T,C,T,T,...,A,G,G,G,C,A,G,A,G,A
309456,G,A,T,C,C,A,G,T,T,G,...,A,A,C,A,G,G,T,A,A,T
309457,G,C,C,A,G,G,G,C,A,A,...,G,C,A,A,G,C,T,G,A,T
309458,C,T,G,G,G,T,G,C,G,A,...,G,G,C,A,G,A,G,T,C,A


In [14]:
x_data_filtered , y_data_filtered = prepare_data_for_binary_classification_for_overall_class(x_data_cropped , y_data)
df_resampled = pd.concat([x_data_filtered, pd.Series(y_data_filtered, name='TARGET')], axis=1)
file_name = "rna_binary_dataset.csv"
df_resampled.to_csv(file_name, index=False)
