# Create sample data
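Master notebook for generating the sample data for *E. coli* and *M. tuberculosis*: each organism's Excel workbook is read into a dictionary, and both dictionaries are saved together in a single pickle file.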

## Import packages

In [1]:
import pickle
import pandas as pd
import openpyxl

## Format data for *E. coli*

In [7]:
# Path to the E. coli workbook
file_path = './ecoli_sample_data.xlsx'

# Key: pair each entry of the 'Label' column with its 'Code' (sheet 'key')
df = pd.read_excel(file_path, sheet_name='key', engine='openpyxl')
labels = df['Label'].tolist()
codes = df['Code'].tolist()
key = dict(zip(labels, codes))

# Interactions: split the rows of the 'interactions' sheet by 'Subset'
df = pd.read_excel(file_path, sheet_name='interactions', engine='openpyxl')
scores = df['Score'].tolist()
drug_cols = [col for col in df.columns if 'Drug' in col]
train_ixns, train_scores, test_ixns, test_scores = [], [], [], []
# Each subset name points at the (interactions, scores) lists it fills
buckets = {
    'train': (train_ixns, train_scores),
    'test': (test_ixns, test_scores),
}
for ix, row in df[drug_cols].iterrows():
    bucket = buckets.get(df['Subset'][ix])
    if bucket is None:
        # Rows with any other subset value are left out
        continue
    ixns, ixn_scores = bucket
    # Keep only the non-missing drug entries of this row
    ixns.append(row.dropna().tolist())
    ixn_scores.append(scores[ix])
train = {'interactions': train_ixns, 'scores': train_scores}
test = {'interactions': test_ixns, 'scores': test_scores}

# Profiles: one list of values per column, excluding 'index' and 'Gene'
df = pd.read_excel(file_path, sheet_name='profiles', engine='openpyxl')
feature_names = df['Gene'].tolist()
profile_table = df.drop(columns=['index', 'Gene'], errors='ignore')
profiles = profile_table.to_dict('list')

# Bundle everything into the E. coli output dictionary
ecoli_data = {
    'key': key,
    'profiles': profiles,
    'feature_names': feature_names,
    'train': train,
    'test': test,
}

## Format data for *M. tuberculosis*

In [8]:
# Path to the M. tuberculosis workbook
file_path = './mtb_sample_data.xlsx'

# Key: pair each entry of the 'Label' column with its 'Code' (sheet 'key')
df = pd.read_excel(file_path, sheet_name='key', engine='openpyxl')
labels = df['Label'].tolist()
codes = df['Code'].tolist()
key = dict(zip(labels, codes))

# Interactions: split the rows of the 'interactions' sheet by 'Subset'
df = pd.read_excel(file_path, sheet_name='interactions', engine='openpyxl')
scores = df['Score'].tolist()
drug_cols = [col for col in df.columns if 'Drug' in col]
train_ixns, train_scores = [], []
test_ixns, test_scores = [], []
cli_ixns, cli_scores = [], []
# Each subset name points at the (interactions, scores) lists it fills
buckets = {
    'train': (train_ixns, train_scores),
    'test': (test_ixns, test_scores),
    'clinical': (cli_ixns, cli_scores),
}
for ix, row in df[drug_cols].iterrows():
    bucket = buckets.get(df['Subset'][ix])
    if bucket is None:
        # Rows with any other subset value are left out
        continue
    ixns, ixn_scores = bucket
    # Keep only the non-missing drug entries of this row
    ixns.append(row.dropna().tolist())
    ixn_scores.append(scores[ix])
train = {'interactions': train_ixns, 'scores': train_scores}
test = {'interactions': test_ixns, 'scores': test_scores}
clinical = {'interactions': cli_ixns, 'scores': cli_scores}

# Profiles: one list of values per column, excluding 'index' and 'Gene'
df = pd.read_excel(file_path, sheet_name='profiles', engine='openpyxl')
feature_names = df['Gene'].tolist()
profile_table = df.drop(columns=['index', 'Gene'], errors='ignore')
profiles = profile_table.to_dict('list')

# Bundle everything into the M. tuberculosis output dictionary
mtb_data = {
    'key': key,
    'profiles': profiles,
    'feature_names': feature_names,
    'train': train,
    'test': test,
    'clinical': clinical,
}

## Save data and test saved object

In [9]:
# Pickle both datasets as a two-item list: [E. coli, M. tuberculosis]
with open('./../indigopy/sample_data.pkl', 'wb') as f:
    pickle.dump(
        [ecoli_data, mtb_data],
        f,
    )

In [10]:
# Reload the pickle file and spot-check it against the in-memory data
with open('./../indigopy/sample_data.pkl', 'rb') as f:
    loaded = pickle.load(f)
data1, data2 = loaded

# The first E. coli training interaction must match the reloaded copy
first_train = ecoli_data['train']['interactions'][0]
assert first_train == data1['train']['interactions'][0]

# The first M. tuberculosis clinical interaction must match the reloaded copy
first_clinical = mtb_data['clinical']['interactions'][0]
assert first_clinical == data2['clinical']['interactions'][0]