# Create PairWise Training and Test Sets
Strategy: 

1. Create a common data frame with 8000*2 feature columns (the features of two samples side by side), 1 label column, and n rows of sample pairs. Call this **mainDF**
2. Use **sourceDF** to do a stratified random sampling of the samples so we have even distribution of labels. 
3. Append 2 samples to mainDF and add label based on the samples added
4. Labels: Not Siblings, Siblings
5. The label is again derived from the file name: two samples are siblings if their names match in everything except the trailing replicate number n, i.e. XXXXX-RPn.



In [1]:
import pandas as pd
from fileFunctionsModule import importCSVMotifFilesAsDfFromDir
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
import time
from datetime import date
import sys
import os.path

In [2]:
# ! Define variables to be used
rerun_data_compilation = False
data_path = ""
data_type = ""
date_current = date.today().isoformat()

# Source directory for each supported data type. The keys are lower case, and
# the user's answer is normalised to lower case before it is looked up.
data_paths_by_type = {
    'trimer': "../03_SourceFiles/01_TrimerSourceFiles",
    'tetramer': "../03_SourceFiles/02_TetramerSourceFiles",
}

# Ask user what kind of data needs to be processed. The answer is stored
# normalised (stripped, lower case): previously "Trimer" passed the loop check
# but failed the case-sensitive path lookup, leaving data_path empty.
while data_type not in data_paths_by_type:
    data_type = input('Process Trimer or Tetramer data type?').strip().lower()

# Ask once (not on every retry of the data-type prompt) whether the data should
# be recompiled from the source csv files. Anything other than 'N' means yes.
rerun_data_compilation = (
    input('Would you like to rerun data compilation? Y or N').strip().upper() != 'N'
)

data_path = data_paths_by_type[data_type]


In [13]:
# Checks if user wants to rerun data compilation. If not, the previously saved
# labeled csv file is loaded instead.
processed_csv_path = '../03_SourceFiles/03_ProcessedFiles/' + data_type + '-combined-labeled.csv'

if rerun_data_compilation:
    # Import the source csv files, transpose the frame, move the old index
    # into a column named 'sampleName', and replace missing values with 0.
    data_start = (
        importCSVMotifFilesAsDfFromDir(data_path)
        .transpose()
        .reset_index()
        .fillna(0)
        .rename(columns={'index': 'sampleName'})
    )

    # Binary identifier for a naive vs selected library (1 = naive, 0 = selected).
    # A sample counts as naive when characters 10-20 of its name contain 'oo'
    # and its last six characters contain 'RN0' or 'RN1'.
    sample_names = data_start['sampleName']
    name_tail = sample_names.str[-6:]
    is_naive = (
        sample_names.str[10:21].str.contains('oo', regex=False)
        & (
            name_tail.str.contains('RN0', regex=False)
            | name_tail.str.contains('RN1', regex=False)
        )
    )
    # Store the flag as int so it is readable in the saved csv.
    data_start['naiveLibrary'] = is_naive.astype(int)

    if (input('Would you like to save the resulting DF as a csv for future use or overwrite the previously saved csv? Y or N').upper() == 'Y'):
        data_start.to_csv(processed_csv_path)
        print('DF Structure Saved to file.')
else:
    # Fail fast: continuing without the file would leave data_start undefined
    # and surface later as an unrelated NameError.
    if not os.path.isfile(processed_csv_path):
        raise FileNotFoundError(
            'CSV File does not exist in the source folder (03_SourceFiles/03_ProcessedFiles). Please re-run compilation'
        )
    try:
        data_start = pd.read_csv(
            processed_csv_path,
            engine='c',
            low_memory=False,
            index_col=0
        )
    except Exception as err:
        # Keep the original cause attached instead of hiding it behind a bare
        # except and sys.exit(), which discards the traceback in a notebook.
        raise RuntimeError(
            'Something went wrong while trying to read the csv file. Please rerun data compilation or check file manually.'
        ) from err
    print('File read succsessful. Data frame loaded into memory.')


File read succsessful. Data frame loaded into memory.


## Find # of siblings in data set

1. Naive sibling sets
2. Selected sibling sets

In [19]:
# Split the sample names by the naiveLibrary flag: rows flagged 1 go to
# naive_set, every other row goes to selected_set.
is_naive_row = data_start['naiveLibrary'] == 1
naive_set = data_start.loc[is_naive_row, 'sampleName']
selected_set = data_start.loc[~is_naive_row, 'sampleName']

In [16]:
# Display the sample names of the rows flagged naiveLibrary == 1.
naive_set

1       20170808-109OOooNA-JB-3__R10F5_RN1RP3
2       20170808-109OOooNA-JB-3__R10F6_RN1RP4
3        20170808-109OOooNA-JB-3__R9F1_RN1RP1
4        20170808-109OOooNA-JB-3__R9F2_RN1RP2
5         20170202-07OOooPA-VT-3__R9F7_RN1RP1
                        ...                  
2378      20160816-13ADooPA-YC-3__R5F4_RN0RP3
2389      20170228-23ARooOO-HD-3__R5F5_RN1RP1
2390      20170228-23ARooOO-HD-3__R8F1_RN1RP2
2391      20170228-23ARooOO-HD-3__R8F2_RN1RP3
2392      20170228-23ARooOO-HD-3__R8F3_RN1RP4
Name: sampleName, Length: 810, dtype: object

In [25]:
# Display the sample names of the rows not flagged as naive library.
selected_set

0          20160602-07ABsaGN-OO-3__R2F3_RN1RP1
18        20160216-63BNsaUD-OO-3__R4F16_RN1RP1
19        20160216-63BNsaUD-OO-3__R4F17_RN1RP2
20        20160216-63BNsaUD-OO-3__R4F18_RN1RP3
21       20181108-218TSnoAB-YW-3__R17F8_RN2RP1
                         ...                  
2388    20181108-219TSnoAB-YW-3__R17F15_RN3RP1
2393    20171106-130OOicXP-JW-3__R10F15_RN6RP4
2394       20170202-71MZdcEB-DF-3__R7F1_RN1RP1
2395       20170202-71MZdcEB-DF-3__R7F2_RN1RP2
2396       20170202-71MZdcEB-DF-3__R7F3_RN1RP3
Name: sampleName, Length: 1587, dtype: object

In [26]:
# Display the naive sample names sorted by value; sort_values() returns a new
# Series, so naive_set itself keeps its original order.
naive_set.sort_values()


1466     20140612-01OOooUD-OO-3__R1F1_RN1RP1
510      20140612-02OOooUD-OO-3__R1F2_RN1RP1
606      20140612-03OOooUD-OO-3__R1F3_RN1RP1
1456    20140612-04OOooUD-OO-3__R1F19_RN1RP2
1457    20140612-04OOooUD-OO-3__R1F20_RN1RP3
                        ...                 
2109    20180809-71ZJooNA-DV-3__R10F1_RN1RP1
2110    20180809-71ZJooNA-DV-3__R10F2_RN1RP2
2111    20180809-71ZJooNA-DV-3__R10F3_RN1RP3
809     20181108-16OOooPA-YW-3__R17F1_RN1RP1
1785    20181108-16TSooPA-YW-3__R17F2_RN1RP1
Name: sampleName, Length: 810, dtype: object

In [41]:
# Group the selected sample names by their name minus the final character
# (the replicate digit n in ...RPn). Each key maps to the distinct full names
# sharing that prefix, in first-seen order.
# NOTE(review): the captured output below shows only single-member groups;
# the R#F# part of the name seems to differ between replicates, so this key
# may be too strict to find siblings. Confirm the intended sibling key.
unique_vals_dict = {}
for sample_name in selected_set:
    group_members = unique_vals_dict.setdefault(sample_name[:-1], [])
    if sample_name not in group_members:
        group_members.append(sample_name)

In [42]:
# One column per sibling-group key, one row per group member. Each list is
# wrapped in a Series so pandas pads shorter groups with NaN. Passing the raw
# dict of lists raises ValueError as soon as two groups differ in size.
pd_dict = pd.DataFrame(
    {group_key: pd.Series(members) for group_key, members in unique_vals_dict.items()}
)

In [44]:
# Display the full prefix -> sample-names mapping built from selected_set.
unique_vals_dict

1-7MLdcOO-SM-3__R3F6_RN3RP': ['20150201-7MLdcOO-SM-3__R3F6_RN3RP3'],
 '20150201-7MLdcOO-SM-3__R3F7_RN3RP': ['20150201-7MLdcOO-SM-3__R3F7_RN3RP1'],
 '20150201-7MLdcOO-SM-3__R3F8_RN3RP': ['20150201-7MLdcOO-SM-3__R3F8_RN3RP2'],
 '20150201-7MLdcOO-SM-3__R3F9_RN3RP': ['20150201-7MLdcOO-SM-3__R3F9_RN3RP3'],
 '20171106-71ZJgoHE-DF-3__R2F2_RN1RP': ['20171106-71ZJgoHE-DF-3__R2F2_RN1RP1'],
 '20171106-71ZJgoHE-DF-3__R2F3_RN1RP': ['20171106-71ZJgoHE-DF-3__R2F3_RN1RP1'],
 '20180522-06NYsaVH-VT-3__R1F8_RN1RP': ['20180522-06NYsaVH-VT-3__R1F8_RN1RP1'],
 '20180522-06NYsaVH-VT-3__R1F9_RN1RP': ['20180522-06NYsaVH-VT-3__R1F9_RN1RP2'],
 '20180522-06NYsaVH-VT-3__R1F10_RN1RP': ['20180522-06NYsaVH-VT-3__R1F10_RN1RP3'],
 '20171106-71MLbcAD-DI-3__R1F10_RN1RP': ['20171106-71MLbcAD-DI-3__R1F10_RN1RP1'],
 '20171106-71MLbcAD-DI-3__R1F14_RN1RP': ['20171106-71MLbcAD-DI-3__R1F14_RN1RP1'],
 '20161215-07WIsaPA-VT-3__R10F10_RN1RP': ['20161215-07WIsaPA-VT-3__R10F10_RN1RP1'],
 '20161215-07WIsaPA-VT-3__R10F13_RN1RP': ['2016

In [43]:
# Display the DataFrame built from unique_vals_dict (one column per prefix key).
pd_dict

Unnamed: 0,20160602-07ABsaGN-OO-3__R2F3_RN1RP,20160216-63BNsaUD-OO-3__R4F16_RN1RP,20160216-63BNsaUD-OO-3__R4F17_RN1RP,20160216-63BNsaUD-OO-3__R4F18_RN1RP,20181108-218TSnoAB-YW-3__R17F8_RN2RP,20170202-71OObcHE-DF-3__R6F16_RN1RP,20170202-71OObcHE-DF-3__R6F17_RN1RP,20170202-71OObcHE-DF-3__R6F18_RN1RP,20160816-63OOknRI-CL-3__R7F14_RN1RP,20160816-63OOknRI-CL-3__R7F15_RN1RP,...,20150201-07SBbwOO-SM-3__R2F1_RN2RP,20150201-07SBbwOO-SM-3__R2F18_RN3RP,20150201-07SBbwOO-SM-3__R2F19_RN3RP,20150201-07SBbwOO-SM-3__R2F2_RN2RP,20150201-07SBbwOO-SM-3__R2F20_RN3RP,20181108-219TSnoAB-YW-3__R17F15_RN3RP,20171106-130OOicXP-JW-3__R10F15_RN6RP,20170202-71MZdcEB-DF-3__R7F1_RN1RP,20170202-71MZdcEB-DF-3__R7F2_RN1RP,20170202-71MZdcEB-DF-3__R7F3_RN1RP
0,20160602-07ABsaGN-OO-3__R2F3_RN1RP1,20160216-63BNsaUD-OO-3__R4F16_RN1RP1,20160216-63BNsaUD-OO-3__R4F17_RN1RP2,20160216-63BNsaUD-OO-3__R4F18_RN1RP3,20181108-218TSnoAB-YW-3__R17F8_RN2RP1,20170202-71OObcHE-DF-3__R6F16_RN1RP1,20170202-71OObcHE-DF-3__R6F17_RN1RP2,20170202-71OObcHE-DF-3__R6F18_RN1RP3,20160816-63OOknRI-CL-3__R7F14_RN1RP1,20160816-63OOknRI-CL-3__R7F15_RN1RP2,...,20150201-07SBbwOO-SM-3__R2F1_RN2RP2,20150201-07SBbwOO-SM-3__R2F18_RN3RP1,20150201-07SBbwOO-SM-3__R2F19_RN3RP2,20150201-07SBbwOO-SM-3__R2F2_RN2RP3,20150201-07SBbwOO-SM-3__R2F20_RN3RP3,20181108-219TSnoAB-YW-3__R17F15_RN3RP1,20171106-130OOicXP-JW-3__R10F15_RN6RP4,20170202-71MZdcEB-DF-3__R7F1_RN1RP1,20170202-71MZdcEB-DF-3__R7F2_RN1RP2,20170202-71MZdcEB-DF-3__R7F3_RN1RP3
