In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_pseudo, load_nucleotides

# Display tweaks for rendered tables: floats are shown with 3 decimal places and
# cell text is clipped to 10 characters (long sequence strings render as `ATGAGT...`).
pd.set_option('display.precision', 3)
pd.set_option('display.max_colwidth', 10)

%matplotlib inline

In [39]:
# Load the records table returned by load_pseudo(); %time reports the load duration.
# The bare `records` expression on the last line renders the frame as the cell output.
%time records = load_pseudo()
records

CPU times: user 4.9 s, sys: 106 ms, total: 5 s
Wall time: 5.01 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,carb,toby
0,TA151,ATGAGT...,31842,6.588,ATGAGT...,28410,5.878,True,False
1,IC1,ATGAGT...,46071,9.532,ATGAGT...,34714,7.182,False,False
2,A237,ATGAGT...,44514,9.210,ATGAGT...,35933,7.434,True,False
3,5920,ATGAGT...,49497,10.241,ATGAGT...,36873,7.629,,
4,LiA96,ATGAGT...,44067,9.117,ATGAGT...,34454,7.128,False,False
...,...,...,...,...,...,...,...,...,...
117,JD318,------...,77766,16.090,ATGAGT...,39108,8.091,False,False
118,Jp238,------...,43062,8.909,ATGAGT...,32466,6.717,False,False
119,Jp1303,------...,44151,9.135,ATGAGT...,32792,6.785,False,False
120,JD304,------...,75465,15.613,ATGAGT...,38729,8.013,False,False


In [3]:
# Load the concatenated FASTA through load_nucleotides (timed). The result `n` is
# tallied column by column further down to obtain the per-column value counts.
%time n = load_nucleotides('../data/pseudo/concatenated.fasta')

CPU times: user 46.6 s, sys: 1.07 s, total: 47.7 s
Wall time: 44.2 s


In [4]:
# Same loader applied to a second FASTA file (timed).
# NOTE(review): judging by the file name this is presumably the naively imputed
# counterpart of concatenated.fasta -- confirm.
%time n_i = load_nucleotides('../data/pseudo/concatenated_naive_impute.fasta')

CPU times: user 46.2 s, sys: 968 ms, total: 47.2 s
Wall time: 41.7 s


# Feature selection

In [157]:
# Boolean row mask: True only where both the 'toby' and the 'carb' label are non-null.
mask = records[['toby', 'carb']].notna().all(axis=1)

## Variance threshold

In [317]:
from sklearn.feature_selection import VarianceThreshold

# Variance filter shared by the cells below: once fitted, it keeps only the columns
# whose variance exceeds 0.01.
selector = VarianceThreshold(threshold=0.01)

In [254]:
# Rough (non-rigorous) reference point for choosing a variance cut-off: the variance of
# a 122-entry column in which every entry holds code 4 except the first two, which hold
# code 3. The result is ~0.016.
# NOTE(review): the selectors in this notebook are built with 0.01 and 0.008, not 0.016 --
# confirm which cut-off is intended.
a = 4
b = 3
arr = np.full((122, 1), float(a))
arr[:2] = b
np.var(arr)

0.01612469766191885

In [255]:
# Translation table turning the gap symbol and the four bases into digit characters:
# '-' -> '0', 'A' -> '1', 'C' -> '2', 'T' -> '3', 'G' -> '4'.
forward = str.maketrans('-ACTG', '01234')


def transformation(str):
    """Encode a sequence string as a list of integer codes.

    Each character is first mapped through ``forward`` and the resulting digit
    characters are converted to ``int``. Characters that are not in ``'-ACTG'``
    are left untouched by the translation, so any such character that is not
    itself a digit makes ``int`` raise ``ValueError``.

    Note: the parameter is named ``str`` and therefore hides the builtin inside
    this function; the name is kept so existing callers are unaffected.

    Parameters
    ----------
    str : the sequence text to encode.

    Returns
    -------
    list of int, one code per input character (empty list for an empty string).
    """
    digits = str.translate(forward)
    return list(map(int, digits))

In [8]:
def translate(encoding, threshold=0.008):
    """Apply a freshly fitted variance filter to ``encoding``.

    A new ``VarianceThreshold`` is created on every call, so the notebook-level
    ``selector`` object is neither used nor modified by this function.

    Parameters
    ----------
    encoding : input accepted by ``VarianceThreshold.fit_transform``
        (presumably the integer-coded sequence matrix, one row per sample --
        TODO confirm against callers).
    threshold : float, default 0.008
        Variance cut-off passed to ``VarianceThreshold``. The default reproduces
        the value that used to be hard-coded here; note that it differs from the
        0.01 used for the notebook-level ``selector``.

    Returns
    -------
    tuple ``(encoding_new, selector)``
        The filtered data and the fitted ``VarianceThreshold`` instance (use its
        ``get_support()`` to see which columns were kept).
    """
    variance_filter = VarianceThreshold(threshold)
    encoding_new = variance_filter.fit_transform(encoding)
    return encoding_new, variance_filter

In [9]:
# Encode every 'sequence' string with transformation() and stack the resulting lists
# into a DataFrame: one row per record, one column per list element (timed).
%time encoding = pd.DataFrame(records['sequence'].apply(transformation).to_list())

CPU times: user 51.8 s, sys: 1.16 s, total: 52.9 s
Wall time: 46.3 s


In [318]:
# Fit the shared variance filter on `encoding`; encoding_new holds the retained columns only.
encoding_new = selector.fit_transform(encoding)
# Boolean Series over the input columns: True where the filter kept the column.
# Must be read right after the fit above, because `selector` is refitted on other data later.
v = pd.Series(selector.get_support())
# Tally of kept (True) vs. dropped (False) columns.
v.value_counts()

True     250032
False    233301
dtype: int64

In [12]:
# Same encoding step as for 'sequence', applied to the 'sequence_i' column (timed).
%time encoding_i = pd.DataFrame(records['sequence_i'].apply(transformation).to_list())

CPU times: user 49.9 s, sys: 1.03 s, total: 50.9 s
Wall time: 43.9 s


In [260]:
# Refit the *same* `selector` object on `encoding_i`; from here on its state (and
# get_support()) describes encoding_i, no longer `encoding`.
encoding_i_new = selector.fit_transform(encoding_i)
# Boolean Series over the input columns: True where the filter kept the column.
v_i = pd.Series(selector.get_support())
# Tally of kept (True) vs. dropped (False) columns.
v_i.value_counts()

False    378209
True     105124
dtype: int64

## SNP counts

In [169]:
# For each column of `n`, count how often every distinct value occurs
# (pd.Series.value_counts applied column-wise). Timed: this is the slow step of the notebook.
%time snp_counts = n.apply(pd.Series.value_counts, axis=0)

CPU times: user 5min 54s, sys: 5.64 s, total: 6min
Wall time: 5min 58s


In [345]:
# Per column: the count of its most frequent value.
m = snp_counts.max(axis=0)
# Tally of columns that both pass the variance filter (v) and whose most frequent
# value occurs fewer than 121 times.
# NOTE(review): 121 is a magic number, presumably tied to the number of rows in `n` -- confirm.
(v & m.lt(121)).value_counts()

False    279232
True     204101
dtype: int64

In [272]:
# Column-wise value counts for `n_i`, computed the same way as snp_counts (timed).
%time snp_counts_i = n_i.apply(pd.Series.value_counts, axis=0)

CPU times: user 5min 49s, sys: 5.13 s, total: 5min 55s
Wall time: 5min 53s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,483323,483324,483325,483326,483327,483328,483329,483330,483331,483332
-,,,,,,,,,,,...,,,,,,,,,,
A,122.0,,,122.0,,,,122.0,,,...,,,,,,120.0,122.0,,,122.0
C,,,,,2.0,,,,,89.0,...,,122.0,,2.0,,2.0,,,,
G,,,122.0,,120.0,,122.0,,,,...,122.0,,,120.0,122.0,,,,122.0,
T,,122.0,,,,122.0,,,122.0,33.0,...,,,122.0,,,,,122.0,,


In [348]:
# Per column of the n_i counts: the count of its most frequent value.
m_i = snp_counts_i.max(axis=0)
# Tally of columns that both pass the variance filter (v_i) and whose most frequent
# value occurs fewer than 121 times.
# NOTE(review): 121 is a magic number, presumably tied to the number of rows in `n_i` -- confirm.
(v_i & m_i.lt(121)).value_counts()

False    391544
True      91789
dtype: int64

In [382]:
# Final feature matrices as NumPy arrays: rows restricted to `mask`, columns restricted
# to those passing both the variance filter and the "most frequent value < 121" filter.
# This rebinds encoding_new / encoding_i_new, replacing the selector outputs assigned earlier.
encoding_new = encoding.to_numpy()[mask][:, v & m.lt(121)]
encoding_i_new = encoding_i.to_numpy()[mask][:, v_i & m_i.lt(121)]

In [383]:
# Persist the filtered matrices built from 'sequence' and 'sequence_i' respectively.
np.save(file='../data/pseudo/preprocess/o_n_v.npy', arr=encoding_new)
np.save(file='../data/pseudo/preprocess/i_n_v.npy', arr=encoding_i_new)

## $\chi^2$

In [146]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [386]:
# Keep the 125016 columns of encoding_new with the highest chi-squared score against
# the 'toby' label (cast to 32-bit ints, masked rows only).
# NOTE(review): k is half of 250032, the number of columns kept by the variance filter
# alone; encoding_new was since rebuilt with the additional count filter and has fewer
# columns, so k is no longer "half of the features" -- confirm this is intended.
kbest = (
    SelectKBest(score_func=chi2, k=250032 // 2)
    .fit_transform(encoding_new, records.loc[mask, 'toby'].astype('i4'))
)
kbest.shape

(119, 125016)

In [387]:
# Keep the 52562 columns of encoding_i_new with the highest chi-squared score against
# the 'toby' label (cast to 32-bit ints, masked rows only).
# NOTE(review): k is half of 105124, the number of columns kept by the variance filter
# alone; encoding_i_new was since rebuilt with the additional count filter and has fewer
# columns, so k is no longer "half of the features" -- confirm this is intended.
kbest_i = (
    SelectKBest(score_func=chi2, k=105124 // 2)
    .fit_transform(encoding_i_new, records.loc[mask, 'toby'].astype('i4'))
)
kbest_i.shape

(119, 52562)

In [388]:
# Persist the chi-squared-selected matrices built from 'sequence' and 'sequence_i' respectively.
np.save(file='../data/pseudo/preprocess/o_n_x.npy', arr=kbest)
np.save(file='../data/pseudo/preprocess/i_n_x.npy', arr=kbest_i)