In [2]:
# only run if you are in Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os, sys
import glob
import pandas as pd
import numpy as np

# proj_path = '/content/drive/MyDrive/STAT685/685_project_repo/DATA'
proj_path = 'G:/My Drive/STAT685/685_project_repo/DATA'
# proj_path = '/Users/rayhinton/OneDrive/OneDrive - AMNH/misc-other/school/stat685/project/685_project_repo/DATA'

dataset_names = ['8702', '4302']
d_choice = dataset_names[0]

filename = f'OvarianDataset{d_choice}_intensities.csv'

data_paths = {
    '8702': {
        'root': 'OvarianDataset8-7-02',
        'control': ['Control'],
        'case': ['Ovarian Cancer']
    },
    '4302': {
        'root': os.path.join('OvarianDataset4-3-02', 'Ovarian Data WCX2 CSV'),
        'control': [os.path.join('control', f'Group {i}') for i in ['C', 'D']],
        'case': [os.path.join('cancer', f'Group {i}') for i in ['A', 'B']]
    }
}

control_paths = [os.path.join(proj_path, data_paths[d_choice]['root'], control_path) for control_path in data_paths[d_choice]['control']]
control_files = [filename for control_path in control_paths for filename in glob.glob(os.path.join(control_path, "*.csv"))]
print(len(control_files))
print(control_files[0:5])

case_paths = [os.path.join(proj_path, data_paths[d_choice]['root'], case_path) for case_path in data_paths[d_choice]['case']]
case_files = [filename for case_path in case_paths for filename in glob.glob(os.path.join(case_path, "*.csv"))]
print(len(case_files))
print(case_files[0:5])

91
['G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Control\\Control daf-0181.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Control\\Control daf-0182.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Control\\Control daf-0183.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Control\\Control daf-0184.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Control\\Control daf-0185.csv']
162
['G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Ovarian Cancer\\Ovarian Cancer daf-0601.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Ovarian Cancer\\Ovarian Cancer daf-0602.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Ovarian Cancer\\Ovarian Cancer daf-0604.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDataset8-7-02\\Ovarian Cancer\\Ovarian Cancer daf-0605.csv', 'G:/My Drive/STAT685/685_project_repo/DATA\\OvarianDa

In [None]:
# /content/drive/MyDrive/STAT685/project/DATA/OvarianDataset4-3-02/Ovarian Data WCX2 CSV/control/Group C

In [4]:
# read in the raw intensity values

X_dir_sample = np.genfromtxt(case_files[0], delimiter = ',', skip_header = 1, usecols = (1))

# read and build Case
X_dir_case = np.zeros((len(case_files), X_dir_sample.shape[0]))

for i in range(len(case_files)):
  X_i = np.genfromtxt(case_files[i], delimiter = ',', skip_header = 1, usecols = (1))
  X_dir_case[i, :] = X_i

print(f'{X_dir_case.shape=}')

# read and build Control
X_dir_control = np.zeros((len(control_files), X_dir_sample.shape[0]))

for i in range(len(control_files)):
  X_i = np.genfromtxt(control_files[i], delimiter = ',', skip_header = 1, usecols = (1))
  X_dir_control[i, :] = X_i

print(f'{X_dir_control.shape=}')

# combine into one array
X_dir = np.concatenate((X_dir_case, X_dir_control), axis = 0)

# make y, outcome variable
y = np.concatenate((np.ones((len(X_dir_case), )),
                    np.zeros((len(X_dir_control), ))),
                   axis = 0)
y = y.astype(np.int8)

# create column names and combine it all into one DataFrame
dir_colnames = [f'int_{i}' for i in range(X_dir_sample.shape[0])]

X_dir_df = pd.DataFrame(X_dir, columns = dir_colnames)
X_dir_df['y'] = y

X_dir_case.shape=(162, 15154)
X_dir_control.shape=(91, 15154)


In [5]:
print(f'{np.sum(y) = }, {X_dir_case.shape = }')

np.sum(y) = 162, X_dir_case.shape = (162, 15154)


In [6]:
# confirm that some values match
sample_cases = [len(case_files) - 1 - j for j in range(5)]
sample_cases.extend(range(5))

for i in sample_cases:
    print(i)
    print(np.genfromtxt(case_files[i], delimiter = ',', skip_header = 1, usecols = (1))[0:5])
    print(X_dir_df.iloc[i, 0:5])

161
[4.1850176 4.1769734 4.1166415 4.1809955 4.0945199]
int_0    4.185018
int_1    4.176973
int_2    4.116642
int_3    4.180995
int_4    4.094520
Name: 161, dtype: float64
160
[4.1568627 4.1568627 4.066365  4.1689291 4.100553 ]
int_0    4.156863
int_1    4.156863
int_2    4.066365
int_3    4.168929
int_4    4.100553
Name: 160, dtype: float64
159
[4.1608849 4.1729512 4.0904977 4.1729512 4.0623429]
int_0    4.160885
int_1    4.172951
int_2    4.090498
int_3    4.172951
int_4    4.062343
Name: 159, dtype: float64
158
[4.1608849 4.1427853 4.0844646 4.1789844 4.0643539]
int_0    4.160885
int_1    4.142785
int_2    4.084465
int_3    4.178984
int_4    4.064354
Name: 158, dtype: float64
157
[4.1568627 4.1085973 4.0764203 4.1890397 4.0864756]
int_0    4.156863
int_1    4.108597
int_2    4.076420
int_3    4.189040
int_4    4.086476
Name: 157, dtype: float64
0
[4.100553  4.1206637 4.0361991 4.1246858 4.0261438]
int_0    4.100553
int_1    4.120664
int_2    4.036199
int_3    4.124686
int_4    4.026

In [7]:
# confirm that some values match
sample_controls = [len(control_files) - 1 - j for j in range(5)]
sample_controls.extend(range(5))

for i in sample_controls:
    print(i)
    print(np.genfromtxt(control_files[i], delimiter = ',', skip_header = 1, usecols = (1))[0:5])
    print(X_dir_df.iloc[i + len(case_files), 0:5])

90
[4.166918  4.1729512 4.0744093 4.1689291 4.0744093]
int_0    4.166918
int_1    4.172951
int_2    4.074409
int_3    4.168929
int_4    4.074409
Name: 252, dtype: float64
89
[4.1347411 4.0784314 4.0462544 4.1025641 4.0241327]
int_0    4.134741
int_1    4.078431
int_2    4.046254
int_3    4.102564
int_4    4.024133
Name: 251, dtype: float64
88
[4.1126194 4.1065862 4.0884867 4.1085973 4.0080442]
int_0    4.112619
int_1    4.106586
int_2    4.088487
int_3    4.108597
int_4    4.008044
Name: 250, dtype: float64
87
[4.1126194 4.0844646 4.0844646 4.0904977 4.0180995]
int_0    4.112619
int_1    4.084465
int_2    4.084465
int_3    4.090498
int_4    4.018099
Name: 249, dtype: float64
86
[4.1146305 4.0884867 4.0482655 4.1347411 4.0361991]
int_0    4.114630
int_1    4.088487
int_2    4.048266
int_3    4.134741
int_4    4.036199
Name: 248, dtype: float64
0
[4.1689291 4.13273   4.0965309 4.1548517 4.0542986]
int_0    4.168929
int_1    4.132730
int_2    4.096531
int_3    4.154852
int_4    4.054299
N

In [38]:
# Save to a file

X_dir_df.to_csv(os.path.join(proj_path, filename), index = False)

In [8]:
# Read data from the written file, to confirm

int_df = pd.read_csv(os.path.join(proj_path, filename))

int_df.head()

int_colnames = int_df.columns

X_int = int_df.iloc[:, :-1].to_numpy()
y_int = int_df.iloc[:, -1].ravel()
print(X_int.shape)
print(y_int.shape)

(253, 15154)
(253,)


In [9]:
print(X_dir_df.shape)
print(X_dir.shape)
print(y.shape)

print(int_df.shape)
print(X_int.shape)
print(y_int.shape)

(253, 15155)
(253, 15154)
(253,)
(253, 15155)
(253, 15154)
(253,)


In [10]:
print(X_dir_df.mean())
print(int_df.mean())

print('Sum of equal means: ' + str(np.sum(X_dir_df.mean() == int_df.mean())))

int_0        4.155893
int_1        4.132929
int_2        4.083519
int_3        4.153318
int_4        4.053122
               ...   
int_15150    4.095643
int_15151    4.095643
int_15152    4.095643
int_15153    4.095643
y            0.640316
Length: 15155, dtype: float64
int_0        4.155893
int_1        4.132929
int_2        4.083519
int_3        4.153318
int_4        4.053122
               ...   
int_15150    4.095643
int_15151    4.095643
int_15152    4.095643
int_15153    4.095643
y            0.640316
Length: 15155, dtype: float64
Sum of equal means: 3574


In [13]:
# why do I get value errors in the Fisher criterion function?
# is it because there are intensity columns with 0 variance?

# split the check by case and control

y_check = 1
var_thresh = 10**-3

X_vars = np.var(X_dir[y == 1, :], axis = 0)

print(np.sort(X_vars)[:10])

print(f'{len(y) = }, {len(X_vars <= var_thresh) = }')
low_var_cols = X_dir_df.iloc[:, :-1].loc[y == y_check, X_vars <= var_thresh].columns
print(len(low_var_cols))
print(low_var_cols)

[0.00038767 0.00075411 0.00079854 0.00081715 0.00081767 0.00082439
 0.00084505 0.00084848 0.00085089 0.00086431]
len(y) = 253, len(X_vars <= var_thresh) = 15154
134
Index(['int_5', 'int_6', 'int_8', 'int_11', 'int_16', 'int_26', 'int_34',
       'int_35', 'int_48', 'int_50',
       ...
       'int_15144', 'int_15145', 'int_15146', 'int_15147', 'int_15148',
       'int_15149', 'int_15150', 'int_15151', 'int_15152', 'int_15153'],
      dtype='object', length=134)
