# About
The procedures outlined in this notebook prepare datasets for our GA-based feature selection algorithm. Each dataset will be splited into training (80%) and testing (20%) folds.
* The origional datasets are stored in /data folder 
* The processed datasets are stored in /res/train_test folder
* The py files relevant to splitting data are in the /scr/dataPreparation folder


# Parameter configurations

In [1]:
import os
import sys
sys.path.append('scr/dataPreparation')
import scr_split_dataset as split
import pandas as pd

In [2]:
train_pct=0.8
print(type(train_pct))# the fraction of training fold
seed=25 # random seed for the split

<class 'float'>


In [3]:
# Procedures

## GAMETES datasets

In [4]:
# Process GAMETES datasets
# Make new directory for result and get data information
! mkdir res
! mkdir res/GAMETES
input_path = 'data/GAMETES'
train_path = 'res/GAMETES'
filenames = os.listdir(input_path)

# Check the file type
for file in filenames:
    if file.split('.')[-1] == 'tsv':
        df = pd.read_csv(input_path + '/' + file, sep='\t')
    else:
        continue

    x = df.iloc[:, 0:-1]
    y = df.iloc[:, -1].values
    y = split.attribute_tran(y)

    # Split dataset into train and test datasets
    train, test = split.train_test(file, x, y, train_pct, seed)

    # Save data into the corresponding path
    split.save_data(train, test, file ,train_path)

GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1.tsv: succeed
GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001.tsv: succeed
GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1.tsv: succeed
GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv: succeed
GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001.tsv: succeed
GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1.tsv: succeed


## GEO datasets

In [5]:
# Process GEO dataset
! mkdir res/GEO_datasets
! unzip -o data/GEO' 'datasets/\*.zip -d data/GEO' 'datasets

input_path = 'data/GEO datasets'
train_path = 'res/GEO_datasets'
filenames = os.listdir(input_path)

! rm -r data/GEO' 'datasets/__MACOSX

# Check the file type
for file in filenames:
    if file.split('.')[-1] == 'csv':
        df = pd.read_csv(input_path + '/' + file, sep=',')
    else:
        continue

    x = df.iloc[:, 0:-1]
    y = df.iloc[:, -1].values
    y = split.attribute_tran(y)

    # Split dataset into train and test datasets
    train, test = split.train_test(file, x, y, train_pct, seed)

    # Save data into the corresponding path
    split.save_data(train, test, file ,train_path)

Archive:  data/GEO datasets/Lung_GSE19804_processed.csv.zip
  inflating: data/GEO datasets/Lung_GSE19804_processed.csv  
  inflating: data/GEO datasets/__MACOSX/._Lung_GSE19804_processed.csv  

Archive:  data/GEO datasets/Renal_GSE53757_processed.csv.zip
  inflating: data/GEO datasets/Renal_GSE53757_processed.csv  
  inflating: data/GEO datasets/__MACOSX/._Renal_GSE53757_processed.csv  

Archive:  data/GEO datasets/Liver_GSE14520_U133A_processed.csv.zip
  inflating: data/GEO datasets/Liver_GSE14520_U133A_processed.csv  
  inflating: data/GEO datasets/__MACOSX/._Liver_GSE14520_U133A_processed.csv  

Archive:  data/GEO datasets/Liver_GSE76427_processed.csv.zip
  inflating: data/GEO datasets/Liver_GSE76427_processed.csv  
  inflating: data/GEO datasets/__MACOSX/._Liver_GSE76427_processed.csv  

Archive:  data/GEO datasets/Breast_GSE70947_processed.csv.zip
  inflating: data/GEO datasets/Breast_GSE70947_processed.csv  
  inflating: data/GEO datasets/__MACOSX/._Breast_GSE70

## GWAS dataset

In [6]:
# Process GWAS dataset 
# use pandas-plink package to I/O plink files