# Create PairWise Training and Test Sets
Strategy: 

1. Create a common data frame with 8000*2 feature columns (two samples side by side), 1 label column, and n rows of sample pairs. Call this **mainDF**.
2. Use **sourceDF** to do a stratified random sampling of the samples so that the labels are evenly distributed.
3. Append 2 samples to mainDF as one row and add a label based on the two samples added.
4. Labels: Not Siblings, Siblings.
5. The label is again derived from the file name: two samples are siblings if their names (XXXXX-RPn) match in everything except n.



In [2]:
import pandas as pd
from fileFunctionsModule import importCSVMotifFilesAsDfFromDir
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
import time
from datetime import date
import sys
import os.path

In [3]:
# ! Define variables to be used
rerun_data_compilation = False
data_path = ""
data_type = ""
date_current = str(date.today().isoformat())

# Source directory for each supported data type, keyed by the normalised
# (lower-case) data type name.
source_dirs = {
    'trimer': "../03_SourceFiles/01_TrimerSourceFiles",
    'tetramer': "../03_SourceFiles/02_TetramerSourceFiles",
}

# Ask the user what kind of data needs to be processed. The answer is stripped
# and lower-cased BEFORE it is stored, so that answers such as 'Trimer' or
# 'TETRAMER' select the right source directory (and produce a consistently
# named csv file later) instead of silently leaving data_path empty.
while data_type not in source_dirs:
    data_type = input('Process Trimer or Tetramer data type?').strip().lower()

# Ask once, after a valid data type was given, whether the data should be
# recompiled from the source csv files. Any answer other than 'N' means yes.
rerun_data_compilation = (
    input('Would you like to rerun data compilation? Y or N').strip().upper() != 'N'
)

data_path = source_dirs[data_type]


In [4]:
# Checks if user wants to rerun data compilation. If not, the previously saved
# labelled csv is loaded from disk instead.

# Location of the cached, labelled data frame for the chosen data type.
processed_csv_path = str('../03_SourceFiles/03_ProcessedFiles/'+data_type+'-combined-labeled.csv')


def _is_naive_library(sample_name):
    """Return True when a sample name denotes a naive library.

    A name is treated as naive when characters 10-20 of the name contain 'oo'
    and its last six characters contain 'RN0' or 'RN1'.
    """
    tail = sample_name[-6:]
    return ('oo' in sample_name[10:21]) and ('RN0' in tail or 'RN1' in tail)


if rerun_data_compilation:
    # Import the source csv files, transpose the frame, turn the index into a
    # regular column named sampleName, and fill na / nan values with 0.
    data_start = (
        importCSVMotifFilesAsDfFromDir(data_path)
        .transpose()
        .reset_index()
        .fillna(0)
        .rename(columns={'index': 'sampleName'})
    )

    # Add a binary identifier for a naive vs selected library,
    # i.e. 1 is naive and 0 is selected (stored as int so it is readable).
    data_start['naiveLibrary'] = data_start['sampleName'].map(_is_naive_library).astype(int)

    if (input('Would you like to save the resulting DF as a csv for future use or overwrite the previously saved csv? Y or N').upper() == 'Y'):
        data_start.to_csv(processed_csv_path)
        print('DF Structure Saved to file.')
else:
    # Fail loudly when the cached file is missing: merely printing a message
    # would leave data_start undefined and break later cells with a NameError.
    if not os.path.isfile(processed_csv_path):
        raise FileNotFoundError('CSV File does not exist in the source folder (03_SourceFiles/03_ProcessedFiles). Please re-run compilation')

    try:
        data_start = pd.read_csv(
            processed_csv_path,
            engine='c',
            low_memory=False,
            index_col=0
        )
    except Exception as err:
        # Keep the original cause attached instead of swallowing it.
        raise RuntimeError('Something went wrong while trying to read the csv file. Please rerun data compilation or check file manually.') from err
    print('File read succsessful. Data frame loaded into memory.')


File read succsessful. Data frame loaded into memory.


## Find the number of siblings in the data set and separate them into sibling groups

1. Create a naive set and a selected set.
2. Create two dictionaries: one containing groups of naive siblings and one containing groups of selected siblings.
3. Enumerate the sets to use for stats later.

In [5]:
# Split the sample names by library type: rows flagged naiveLibrary == 1 are
# naive, every other row is treated as selected.
is_naive = data_start['naiveLibrary'] == 1
naive_set = data_start.loc[is_naive, 'sampleName']
selected_set = data_start.loc[~is_naive, 'sampleName']

In [27]:
def _group_by_sample_prefix(sample_names):
    """Group sample names by the part of the name before the first '__'.

    Parameters
    ----------
    sample_names : iterable of str
        Sample names such as '20170808-109OOooNA-JB-3__R10F5_RN1RP3'.

    Returns
    -------
    dict
        Maps each prefix to the list of distinct sample names sharing it,
        in the order in which they were first seen.
    """
    groups = {}
    for name in sample_names:
        members = groups.setdefault(name.split('__')[0], [])
        # Skip names that were already recorded for this prefix.
        if name not in members:
            members.append(name)
    return groups


# The same grouping rule is applied to both the naive and the selected samples.
naive_sibbling_groups = _group_by_sample_prefix(naive_set)
selected_sibbling_groups = _group_by_sample_prefix(selected_set)


In [28]:
# Preview: print at most the first ten naive sibling groups.
group_iter = iter(naive_sibbling_groups.items())
counter = 0
while counter < 10:
    entry = next(group_iter, None)
    if entry is None:
        # Fewer than ten groups exist; nothing left to show.
        break
    key, value = entry
    print(key, '->', value, '\n')
    counter += 1

20170808-109OOooNA-JB-3 -> ['20170808-109OOooNA-JB-3__R10F5_RN1RP3', '20170808-109OOooNA-JB-3__R10F6_RN1RP4', '20170808-109OOooNA-JB-3__R9F1_RN1RP1', '20170808-109OOooNA-JB-3__R9F2_RN1RP2'] 

20170202-07OOooPA-VT-3 -> ['20170202-07OOooPA-VT-3__R9F7_RN1RP1', '20170202-07OOooPA-VT-3__R9F8_RN1RP2', '20170202-07OOooPA-VT-3__R9F9_RN1RP3'] 

20170808-94OOooNA-HD-3 -> ['20170808-94OOooNA-HD-3__R3F20_RN1RP1', '20170808-94OOooNA-HD-3__R4F1_RN1RP2', '20170808-94OOooNA-HD-3__R4F2_RN1RP3'] 

20170713-32OOooNA-HD-3 -> ['20170713-32OOooNA-HD-3__R10F12_RN1RP1', '20170713-32OOooNA-HD-3__R10F13_RN1RP2', '20170713-32OOooNA-HD-3__R4F8_RN1RP3', '20170713-32OOooNA-HD-3__R4F9_RN1RP4'] 

20180222-57NYooVH-VT-3 -> ['20180222-57NYooVH-VT-3__R7F1_RN1RP2', '20180222-57NYooVH-VT-3__R7F2_RN1RP3', '20180222-57NYooVH-VT-3__R6F20_RN1RP1'] 

20170808-92OOooNA-HD-3 -> ['20170808-92OOooNA-HD-3__R3F12_RN1RP1', '20170808-92OOooNA-HD-3__R3F13_RN1RP2', '20170808-92OOooNA-HD-3__R3F19_RN1RP3'] 

20170601-46OOooNA-HD-3 -> ['20

In [42]:
def _round_label(sample_name):
    """Return the round token (e.g. 'RN1') of a sample name.

    The token is read from the text after the last underscore, which has the
    form 'RN<round>RP<replicate>' in the sample names seen in this notebook.
    Parsing the token instead of slicing fixed character offsets keeps the
    result correct when the round or replicate number has more than one digit.
    """
    return sample_name.rsplit('_', 1)[-1].split('RP')[0]


# Collect every selected group whose members do not all share one round token.
mismatched_groups = []
for key, value in selected_sibbling_groups.items():
    round_labels = {_round_label(sample_name) for sample_name in value}
    if len(round_labels) > 1:
        print('Incorrect value at', key)
        mismatched_groups.append(key)

Incorrect value at 20140701-07OOdcUD-OO-3
Incorrect value at 20150707-07DGplUD-OO-3
Incorrect value at 20150819-17DSdcUD-OO-3
Incorrect value at 20161215-07OOsaPA-VT-3
Incorrect value at 20150201-7OOhpUD-OO-3
Incorrect value at 20170808-98OOooBM-MD-3
Incorrect value at 20171106-13OOicXP-JW-3
Incorrect value at 20150922-07AAbwUD-OO-3
Incorrect value at 20150707-07GAcsUD-OO-3
Incorrect value at 20150707-07OOcsUD-OO-3
Incorrect value at 20170808-13OOooBM-MD-3
Incorrect value at 20170404-06WIsaPA-VT-3
Incorrect value at 20150819-17DScaUD-OO-3
Incorrect value at 20171106-22WIsaSP-VT-3
Incorrect value at 20150819-07OObsUD-OO-3
Incorrect value at 20170614-71WIsaNE-VT-3
Incorrect value at 20170713-13OOooBM-MD-3
Incorrect value at 20150922-07OOcsUD-OO-3
Incorrect value at 20160216-13DGgaUD-OO-3
Incorrect value at 20161215-07NXsaPA-VT-3
Incorrect value at 20150201-7RHhnUD-OO-3
Incorrect value at 20140701-07MSdcUD-OO-3
Incorrect value at 20160602-13OOicPA-JW-3
Incorrect value at 20150819-07OObwUD

In [44]:
# Show the members of every flagged group, one group per paragraph.
for group_key in mismatched_groups:
    group_members = selected_sibbling_groups[group_key]
    print(group_members, '\n')

OO-3__R9F1_RN3RP2'] 

['20170808-13OOooBM-MD-3__R2F16_RN2RP2', '20170808-13OOooBM-MD-3__R2F17_RN3RP3', '20170808-13OOooBM-MD-3__R2F18_RN4RP4', '20170808-13OOooBM-MD-3__R3F14_RN5RP5', '20170808-13OOooBM-MD-3__R3F15_RN6RP6'] 

['20170404-06WIsaPA-VT-3__R8F15_RN1RP1', '20170404-06WIsaPA-VT-3__R8F16_RN1RP2', '20170404-06WIsaPA-VT-3__R8F17_RN1RP3', '20170404-06WIsaPA-VT-3__R8F18_RN2RP1', '20170404-06WIsaPA-VT-3__R9F6_RN2RP2', '20170404-06WIsaPA-VT-3__R9F7_RN2RP3'] 

['20150819-17DScaUD-OO-3__R3F18_RN1RP1', '20150819-17DScaUD-OO-3__R3F19_RN1RP2', '20150819-17DScaUD-OO-3__R3F20_RN1RP3', '20150819-17DScaUD-OO-3__R4F13_RN2RP1', '20150819-17DScaUD-OO-3__R4F14_RN2RP2', '20150819-17DScaUD-OO-3__R4F15_RN2RP3'] 

['20171106-22WIsaSP-VT-3__R4F1_RN1RP1', '20171106-22WIsaSP-VT-3__R4F14_RN1RP1', '20171106-22WIsaSP-VT-3__R4F15_RN1RP2', '20171106-22WIsaSP-VT-3__R4F16_RN1RP3', '20171106-22WIsaSP-VT-3__R4F17_RN2RP1', '20171106-22WIsaSP-VT-3__R4F2_RN1RP2', '20171106-22WIsaSP-VT-3__R4F3_RN1RP3', '20171106-22W