# Create PairWise Training and Test Set

## Strategy

1. Create a common data frame with 8000*2 feature columns (one set of roughly 8000 per sample in the pair), 1 label column, and n rows, one row per pair of samples. Call this **mainDF**
2. Use **sourceDF** to do a stratified random sampling of the samples so we have even distribution of labels. 
3. Append 2 samples to mainDF and add label based on the samples added
4. Labels: Not siblings, siblings
5. The label is again derived from the file name: two samples are siblings if their names match in everything except the primer (e.g. `R10F5`) and the replicate number `n` in `XXXXX-RPn`.

### Imports

In [59]:
import pandas as pd
import numpy as np
from fileFunctionsModule import importCSVMotifFilesAsDfFromDir
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
import time
from datetime import date
import sys
import os.path
import re
from itertools import permutations
import random

### Data Import & Early Processing to a Single DF

In [6]:
# ! Define variables to be used
rerun_data_compilation = False
data_path = ""
data_type = ""
date_current = str(date.today().isoformat())

# Ask the user what kind of data needs to be processed. The answer is trimmed
# and lower-cased before validation so that an input such as "Trimer" selects a
# data path below, instead of passing the check and leaving data_path empty.
while data_type not in ['trimer', 'tetramer']:
    data_type = input('Process Trimer or Tetramer data type?').strip().lower()

# Ask once, after a valid data type was given, whether the data should be
# recompiled from the source csv files. Any answer other than "N" means yes.
rerun_data_compilation = (
    input('Would you like to rerun data compilation? Y or N').strip().upper() != 'N'
)

if data_type == 'trimer':
    data_path = "../03_SourceFiles/01_TrimerSourceFiles"
elif data_type == 'tetramer':
    data_path = "../03_SourceFiles/02_TetramerSourceFiles"

In [7]:
# Either rebuild the combined, labeled data frame from the source csv files or
# load the previously saved copy from disk, depending on the user's answer.
# The processed csv path is built once and shared by the save and load branches.
processed_csv_path = '../03_SourceFiles/03_ProcessedFiles/' + data_type + '-combined-labeled.csv'

if rerun_data_compilation:
    # Import the data, transpose the axes of the dataframe, move the index
    # into a regular column and fill na and nan values with 0 instead.
    data_start = importCSVMotifFilesAsDfFromDir(data_path).transpose().reset_index().fillna(0)

    # Rename index column to sampleName in place
    data_start.rename(columns={'index': 'sampleName'}, inplace=True)

    # Add a binary identifier for a naive vs selected library: 1 is naive and
    # 0 is selected. A sample is flagged naive when 'oo' occurs in characters
    # 10-20 of its name and 'RN0' or 'RN1' occurs in its last six characters.
    data_start['naiveLibrary'] = data_start.apply(
        lambda row: (('oo' in row.sampleName[10:21]) and
                    ('RN0' in row.sampleName[-6:] or 'RN1' in row.sampleName[-6:])),
        axis=1
    )

    # Convert the boolean flag to int so it is stored as 0/1
    data_start['naiveLibrary'] = data_start['naiveLibrary'].astype(int)

    if (input('Would you like to save the resulting DF as a csv for future use or overwrite the previously saved csv? Y or N').upper() == 'Y'):
        data_start.to_csv(processed_csv_path)
        print('DF Structure Saved to file.')
elif os.path.isfile(processed_csv_path):
    try:
        data_start = pd.read_csv(
            processed_csv_path,
            engine='c',
            low_memory=False,
            index_col=0
            )
    except Exception as read_error:
        # Only real errors are handled here; KeyboardInterrupt and SystemExit
        # propagate untouched. The cause is printed so it is not lost.
        print('Something went wrong while trying to read the csv file. Please rerun data compilation or check file manually.')
        print('Reason: ', read_error)
        sys.exit()
    else:
        print('File read succsessful. Data frame loaded into memory.')
else:
    print('CSV File does not exist in the source folder (03_SourceFiles/03_ProcessedFiles). Please re-run compilation')


File read succsessful. Data frame loaded into memory.


## Find # of siblings in data set & separate them out into sibling groups. 

1. Create a naive and a selected set. 
2. Create two dictionaries: one containing groups of naive siblings and one for selected siblings. 
3. Enumerate sets to use for stats later. 

In [8]:
# Split the sample names by library type using the naiveLibrary flag:
# rows where the flag equals 1 go to the naive series, all others to selected.
naive_sampleName_set = data_start.loc[data_start['naiveLibrary'] == 1, 'sampleName']
selected_sampleName_set = data_start.loc[data_start['naiveLibrary'] != 1, 'sampleName']

### Logic of setting up groups:

1. Set the key of the dictionary to 20170808-109OOooNA-JB-3_RN**X**, where X indicates the round. The primer and replicate are omitted.
2. Append each sampleName to the list stored in the dictionary under its matching key. Use regular expressions to create and match the key.

In [29]:
# Define required functions

def find_sibling_groups(data_iterable):
    """
    Group sample names into sibling groups keyed by experiment and round.

    Each sample name is expected to look like:
    20170808-109OOooNA-JB-3__R10F5_RN1RP3
    Its key is the text before the first '__', then '_', then the first
    'RN<digits>' tag found in the name, e.g. 20170808-109OOooNA-JB-3_RN1
    (the primer and replicate info is removed).

    Parameters:
        data_iterable: the sample names. Either an object exposing .items()
            (such as a pandas Series or a dict), in which case its values are
            used, or any plain iterable of names such as a list or a set.

    Returns:
        dict mapping each key to the list of distinct sample names that share
        it, in first-seen order.

    Raises:
        IndexError: if a sample name contains no 'RN<digits>' tag.

    Prints the number of sibling groups found.
    """
    # Compile once instead of re-resolving the pattern for every sample name.
    round_tag = re.compile(r'RN\d{1,}')

    # Mapping-like inputs contribute their values; anything else is iterated as is.
    if hasattr(data_iterable, 'items'):
        sample_names = (name for _, name in data_iterable.items())
    else:
        sample_names = data_iterable

    sibling_group_dict = {}
    for name in sample_names:
        keyID = name.split('__')[0] + '_' + round_tag.findall(name)[0]
        members = sibling_group_dict.setdefault(keyID, [])
        # Keep each sample name only once per group.
        if name not in members:
            members.append(name)
    print('Sibbling Groups Founds: ', len(sibling_group_dict))
    return sibling_group_dict


def remove_single_member_sibling_groups(sibling_group_original, return_extra=False):
    """
    Drop sibling groups that have exactly one member, along with empty groups.

    The provided dictionary is left untouched; a shallow copy is filtered.

    By default only the filtered dictionary is returned. With
    return_extra=True three values are returned:
        the sibling group dict with single-member and empty groups removed
        single_group_value -> the member lists of the removed single groups
        empty_group -> the keys whose member list was empty (unlikely to happen)
    """
    sibling_group = sibling_group_original.copy()

    # Collect what is about to be removed before touching the copy.
    single_group_value = [
        members for members in sibling_group.values() if len(members) == 1
    ]
    empty_group = [key for key, members in sibling_group.items() if len(members) < 1]
    keys_to_drop = [key for key, members in sibling_group.items() if len(members) <= 1]

    for key in keys_to_drop:
        del sibling_group[key]

    if single_group_value:
        print('Single Groups Found: ', len(single_group_value), '\nSibling Groups: ', len(sibling_group))
    if empty_group:
        print("Empty Groups Found")

    if not return_extra:
        return sibling_group
    return sibling_group, single_group_value, empty_group

def remove_mismatching_sibling_groups(sibling_group_original, return_extra=False):
    """
    Drop sibling groups in which a member's round tag disagrees with the key's.

    The round tag is the first 'RN<digits>' match in a string. A group is
    removed as a whole as soon as one of its members carries a different tag
    than its key. The provided dictionary is left untouched; a shallow copy is
    filtered.

    By default only the filtered dictionary is returned. With
    return_extra=True three values are returned:
        sibling_group -> the filtered dictionary
        mismatching_groups -> dictionary of the removed groups (entire groups)
        mismatching_group_values -> the mismatching members, without their keys
    """
    round_pattern = r'RN\d{1,}'

    sibling_group = sibling_group_original.copy()
    mismatching_groups = {}
    mismatching_group_values = []

    for key, members in sibling_group.items():
        expected_round = re.findall(round_pattern, key)[0]
        offenders = [
            member for member in members
            if re.findall(round_pattern, member)[0] != expected_round
        ]
        if offenders:
            mismatching_groups[key] = members
            mismatching_group_values.extend(offenders)

    for key in mismatching_groups:
        del sibling_group[key]

    if mismatching_groups:
        print('Mismatches Found, rerun with return_extra=True as an argument to get mismatching values')

    if not return_extra:
        return sibling_group
    return sibling_group, mismatching_groups, mismatching_group_values

def find_average_sibling_members(sibling_group):
    """
    Return the mean number of members per sibling group.

    Parameters:
        sibling_group: dict mapping a group key to its list of members.

    Returns:
        float: total member count divided by the number of groups, or 0.0 when
        the dictionary is empty (instead of raising ZeroDivisionError).
    """
    if not sibling_group:
        return 0.0
    total_members = sum(len(members) for members in sibling_group.values())
    return total_members / len(sibling_group)

In [33]:
#### Create sibling groups:
# For each library: group the sample names, drop single-member groups, then
# drop groups whose members disagree with the round tag of their key.
naive_sibling_group_dict = find_sibling_groups(naive_sampleName_set)
naive_sibling_group_dict = remove_single_member_sibling_groups(naive_sibling_group_dict)
naive_sibling_group_dict = remove_mismatching_sibling_groups(naive_sibling_group_dict)

selected_sibling_group_dict = find_sibling_groups(selected_sampleName_set)
selected_sibling_group_dict = remove_single_member_sibling_groups(selected_sibling_group_dict)
selected_sibling_group_dict = remove_mismatching_sibling_groups(selected_sibling_group_dict)

Sibbling Groups Founds:  255
Single Groups Found:  53 
Sibling Groups:  202
Sibbling Groups Founds:  448
Single Groups Found:  73 
Sibling Groups:  375


In [35]:
#### Merge the naive and the selected sibling dictionaries into one.
# Start from a copy of the naive groups, then add the selected groups; an entry
# from the selected dictionary wins if a key appears in both.
combined_sibling_group_dict = dict(naive_sibling_group_dict)
combined_sibling_group_dict.update(selected_sibling_group_dict)

In [14]:
# Extra for stats only
# labels_bar_siblings = ['Naive SG', 'Selected SG', 'Naive Singles', 'Selected Singles']
# data_bar_siblings = [
#     len(naive_sibling_groups), 
#     len(selected_sibling_groups),
#     len(n_s_g),
#     len(s_s_g)
#     ]
# plt.bar(labels_bar_siblings, data_bar_siblings)
# plt.title('Number of Sibling Groups')
# plt.xlabel('Singles Indicate Sibling Groups with 1 Member',fontdict={
#     'weight':'normal',
#     'size':'13'
# },  labelpad=10)

# plt.savefig(str(data_type+'-sibling-groups-distribution-'+ date_current + '.png'), bbox_inches='tight')

In [20]:
# Report the mean group size, first for the naive and then for the selected
# sibling groups, one value per line.
print(
    find_average_sibling_members(naive_sibling_group_dict),
    find_average_sibling_members(selected_sibling_group_dict),
    sep='\n',
)

3.7475247524752477
4.037333333333334


In [75]:
def create_sibling_list(sibling_dict):
    """
    Expand every sibling group into all ordered pairs of its members.

    Parameters:
        sibling_dict: dict mapping a group key to its list of members.

    Returns:
        (list_left, list_right): two lists of equal length where position i of
        each list holds the first and the second element of the i-th ordered
        pair. Pairs are produced group by group, in the order given by
        itertools.permutations(members, 2).
    """
    ordered_pairs = [
        pair
        for members in sibling_dict.values()
        for pair in permutations(members, 2)
    ]
    list_left = [first for first, _ in ordered_pairs]
    list_right = [second for _, second in ordered_pairs]
    return list_left, list_right

def generate_df_from_permutations(df, sibling_dict, sibling_label):
    """
    Build a pair-wise data frame with one row per (left, right) sample pair.

    Parameters:
        df: data frame with a 'sampleName' column plus the per-sample columns.
            The merge is validated as many-to-one, so each sampleName may
            appear only once in df.
        sibling_dict: when sibling_label is truthy, a dict of group key ->
            member list that is expanded with create_sibling_list; otherwise an
            iterable of (left, right) sample-name pairs.
        sibling_label: value written to the 'sibling' column of every row.

    Returns:
        Data frame holding the columns of df twice, prefixed 'l_' for the left
        sample and 'r_' for the right sample, followed by 'sibling' and by
        'c_sampleName' (left name + '_^_' + right name).

    Prints the shape of the resulting data frame.
    """
    if sibling_label:
        names_left, names_right = create_sibling_list(sibling_dict)
    else:
        names_left, names_right = zip(*sibling_dict)

    def _lookup_side(sample_names, prefix):
        # Attach the columns of df to each name, keeping the order of the names.
        names_frame = pd.DataFrame(sample_names, columns=['sampleName'], index=None)
        merged = pd.merge(names_frame, df, how='left', on='sampleName', sort=False, validate='m:1')
        return merged.add_prefix(prefix)

    left_side = _lookup_side(names_left, 'l_')
    right_side = _lookup_side(names_right, 'r_')

    df_combined = pd.concat([left_side, right_side], axis=1)
    df_combined['sibling'] = sibling_label
    df_combined['c_sampleName'] = df_combined['l_sampleName'] + '_^_' + df_combined['r_sampleName']
    print('DF Created Successfully. Df shape: \n', df_combined.shape)
    return df_combined

def generate_random_non_sibling_df(df, sibling_dict, sample_size=10000, r_seed=42):
    """
    Build a pair-wise data frame from randomly sampled non-sibling pairs.

    All ordered pairs of distinct samples found in sibling_dict are generated,
    the pairs that belong to the same sibling group are removed, and
    sample_size of the remaining pairs are drawn without replacement.

    Parameters:
        df: data frame passed on to generate_df_from_permutations.
        sibling_dict: dict mapping a group key to its list of members.
        sample_size: number of non-sibling pairs to draw.
        r_seed: seed given to random.seed before sampling.

    Returns:
        The data frame produced by generate_df_from_permutations for the
        sampled pairs, labelled 0.

    Raises:
        ValueError: from random.sample if sample_size is larger than the number
        of non-sibling pairs.

    Prints the pair counts and the seed used.
    """
    # Flatten the member lists of all groups into the set of distinct samples.
    all_samples = {name for members in sibling_dict.values() for name in members}
    all_permutations = set(permutations(all_samples, 2))
    print('All Possible Permutations:', len(all_permutations))

    list_left, list_right = create_sibling_list(sibling_dict)
    permutations_siblings = set(zip(list_left, list_right))
    print('Sibling Permutations: ', len(permutations_siblings))

    non_sibling_pairs = all_permutations - permutations_siblings
    print('Non-Sibling Permutations: ', len(non_sibling_pairs))

    random.seed(r_seed)
    print('Sampling Seed Used: ', r_seed)
    # random.sample requires a sequence (sets are rejected since Python 3.11).
    # Sorting also fixes the population order, which for a set of strings can
    # differ between interpreter runs, so the same seed yields the same pairs.
    sampled_pairs = random.sample(sorted(non_sibling_pairs), sample_size)

    return generate_df_from_permutations(df, sampled_pairs, 0)



In [76]:
# Build the sibling pairs (label 1) from every ordered pair within each group.
sibling_df = generate_df_from_permutations(
    df=data_start,
    sibling_dict=combined_sibling_group_dict,
    sibling_label=1,
)

DF Created Successfully. Df shape: 
 (10588, 16006)


In [77]:
# Draw 12000 non-sibling pairs (label 0) with a fixed seed.
non_sibling_df = generate_random_non_sibling_df(
    df=data_start,
    sibling_dict=combined_sibling_group_dict,
    sample_size=12000,
    r_seed=42,
)

All Possible Permutations: 5155170
Sibling Permutations:  10588
Non-Sibling Permutations:  5144582
Sampling Seed Used:  42
DF Created Successfully. Df shape: 
 (12000, 16006)


In [79]:
# Preview only the name and label columns of the non-sibling pairs.
non_sibling_df.loc[:, ['l_sampleName', 'r_sampleName', 'sibling', 'c_sampleName']]

Unnamed: 0,l_sampleName,r_sampleName,sibling,c_sampleName
0,20171128-71NYsaVH-VG-3__R1F2_RN2RP2,20150922-07OOcsUD-OO-3__R7F20_RN3RP1,0,20171128-71NYsaVH-VG-3__R1F2_RN2RP2_^_20150922...
1,20161105-13OOicXZ-JW-3__R3F2_RN2RP1,20160419-63OOknAB-CL-3__R5F7_RN1RP3,0,20161105-13OOicXZ-JW-3__R3F2_RN2RP1_^_20160419...
2,20170228-22OOooNA-HD-3__R2F3_RN1RP4,20170829-71MLbcHE-DI-3__R1F15_RN1RP1,0,20170228-22OOooNA-HD-3__R2F3_RN1RP4_^_20170829...
3,20150819-07OObwUD-OO-3__R5F18_RN2RP2,20170829-71ZJbcHE-DI-3__R2F17_RN1RP1,0,20150819-07OObwUD-OO-3__R5F18_RN2RP2_^_2017082...
4,20170808-90OOooNA-HD-3__R3F8_RN1RP3,20170601-46OOooNA-HD-3__R4F5_RN1RP1,0,20170808-90OOooNA-HD-3__R3F8_RN1RP3_^_20170601...
...,...,...,...,...
11995,20171128-22WIooVY-VV-3__R7F4_RN1RP1,20170404-29OOooBA-CL-3__R4F20_RN1RP2,0,20171128-22WIooVY-VV-3__R7F4_RN1RP1_^_20170404...
11996,20150819-13OOooUD-OO-3__R10F2_RN0RP1,20150707-07PCknDA-OO-3__R4F14_RN0RP0,0,20150819-13OOooUD-OO-3__R10F2_RN0RP1_^_2015070...
11997,20170504-25OOooLI-HD-3__R4F3_RN1RP2,20140701-18XCsaDA-OO-3__R7F6_RN1RP2,0,20170504-25OOooLI-HD-3__R4F3_RN1RP2_^_20140701...
11998,20170829-111OOooOS-NE-3__R9F10_RN1RP3,20150707-07GAcsUD-OO-3__R2F7_RN2RP3,0,20170829-111OOooOS-NE-3__R9F10_RN1RP3_^_201507...


In [81]:
# Preview only the name and label columns of the sibling pairs.
sibling_df.loc[:, ['l_sampleName', 'r_sampleName', 'sibling', 'c_sampleName']]

Unnamed: 0,l_sampleName,r_sampleName,sibling,c_sampleName
0,20170808-109OOooNA-JB-3__R10F5_RN1RP3,20170808-109OOooNA-JB-3__R10F6_RN1RP4,1,20170808-109OOooNA-JB-3__R10F5_RN1RP3_^_201708...
1,20170808-109OOooNA-JB-3__R10F5_RN1RP3,20170808-109OOooNA-JB-3__R9F1_RN1RP1,1,20170808-109OOooNA-JB-3__R10F5_RN1RP3_^_201708...
2,20170808-109OOooNA-JB-3__R10F5_RN1RP3,20170808-109OOooNA-JB-3__R9F2_RN1RP2,1,20170808-109OOooNA-JB-3__R10F5_RN1RP3_^_201708...
3,20170808-109OOooNA-JB-3__R10F6_RN1RP4,20170808-109OOooNA-JB-3__R10F5_RN1RP3,1,20170808-109OOooNA-JB-3__R10F6_RN1RP4_^_201708...
4,20170808-109OOooNA-JB-3__R10F6_RN1RP4,20170808-109OOooNA-JB-3__R9F1_RN1RP1,1,20170808-109OOooNA-JB-3__R10F6_RN1RP4_^_201708...
...,...,...,...,...
10583,20170202-71MZdcEB-DF-3__R7F1_RN1RP1,20170202-71MZdcEB-DF-3__R7F3_RN1RP3,1,20170202-71MZdcEB-DF-3__R7F1_RN1RP1_^_20170202...
10584,20170202-71MZdcEB-DF-3__R7F2_RN1RP2,20170202-71MZdcEB-DF-3__R7F1_RN1RP1,1,20170202-71MZdcEB-DF-3__R7F2_RN1RP2_^_20170202...
10585,20170202-71MZdcEB-DF-3__R7F2_RN1RP2,20170202-71MZdcEB-DF-3__R7F3_RN1RP3,1,20170202-71MZdcEB-DF-3__R7F2_RN1RP2_^_20170202...
10586,20170202-71MZdcEB-DF-3__R7F3_RN1RP3,20170202-71MZdcEB-DF-3__R7F1_RN1RP1,1,20170202-71MZdcEB-DF-3__R7F3_RN1RP3_^_20170202...


In [82]:
# Stack the non-sibling rows on top of the sibling rows and renumber the index.
combined_df = pd.concat(
    objs=[non_sibling_df, sibling_df],
    axis=0,
    ignore_index=True,
)

In [95]:
# Save the pair-wise data frame next to the other processed files.
combined_df.to_csv(
    '../03_SourceFiles/03_ProcessedFiles/' + data_type + '-pair-wise-df.csv'
)