In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from lib.read_data import dataset,Datasets
from math import copysign

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# feature extractors
from sklearn.decomposition import PCA
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
# from sklearn.linear_model import ElasticNet
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
# finetuning
from sklearn.model_selection import GridSearchCV
# validation
from sklearn import metrics
from sklearn.metrics import confusion_matrix


In [21]:
def read_data_set(data_table,test_size=0.25,normalization=True):
    '''
    convert a pandas dataframe data table into Datasets(dataset,dataset)
    '''
    train, test = train_test_split(data_table,test_size=0.25)
    train_x = train[[col for col in train.columns
                     if col not in ['zscore','direction','cpgName']]]
    features = train_x.columns
    if normalization:
        minMaxScaler = preprocessing.MinMaxScaler()
        train_x = minMaxScaler.fit_transform(train_x)
        test_x = minMaxScaler.fit_transform(test[[col for col in 
                                                  train.columns
                      if col not in ['zscore','direction','cpgName']]])
    else:
        train_x = np.array(train_x)
        test_x = np.array(test[[col for col in train.columns
                      if col not in ['zscore','direction','cpgName']]])
    train_y = np.array(train['direction'],dtype=np.int8)
    test_y = np.array(test['direction'],dtype=np.int8)

    return Datasets(train=dataset(train_x,train_y),
                    test=dataset(test_x,test_y))

In [22]:
if __name__=='__main__':

    def load_bonderWestraData(path):
        data = pd.read_csv(bonder_path,sep=',')
        # print(data.head())
        def binarize(row):
            if row > 0:
                return 1
            else:
                return 0
        data['direction'] = data['zscore'].apply(binarize)
        dataset = read_data_set(data)
        return dataset


    bonder_path = 'data/bonder_withzscore.csv'
    westra_allFeat_path = 'data/westra_all_with_zscore.csv.csv'
    westra_bonderFeat_path = 'data/westra_bonderfeat_with_zscore.csv'

    bonder = load_bonderWestraData(bonder_path)
    westra_allFeat = load_bonderWestraData(westra_allFeat_path)
    westra_bonderFeat = load_bonderWestraData(westra_bonderFeat_path)

    print('Bonder dataset loaded.',bonder.train.values.shape)
    print('Westra with all features dataset loaded.',bonder.train.values.shape)
    print('Westra with bonder features dataset loaded.',
           bonder.train.values.shape)

Bonder dataset loaded. (2131, 8)
Westra with all features dataset loaded. (2131, 8)
Westra with bonder features dataset loaded. (2131, 8)


In [25]:
bonder.train.values[:,:]

array([[4.52657515e-01, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.52252117e-04],
       [1.88313974e-01, 1.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 3.28335049e-05],
       [2.18233017e-01, 4.64285714e-01, 0.00000000e+00, ...,
        8.57142857e-01, 3.21428571e-01, 1.08897791e-03],
       ...,
       [4.31186202e-01, 7.14285714e-02, 0.00000000e+00, ...,
        7.14285714e-02, 0.00000000e+00, 1.31583490e-03],
       [4.25202394e-01, 3.57142857e-02, 0.00000000e+00, ...,
        7.14285714e-02, 0.00000000e+00, 1.90579182e-03],
       [3.30165435e-01, 4.64285714e-01, 0.00000000e+00, ...,
        8.92857143e-01, 0.00000000e+00, 1.77770092e-02]])