In [948]:
import numpy as np
import pandas as pd
import biom
import qiime2 as q2
import q2_sample_classifier
import pickle
from qiime2 import Artifact
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.externals import joblib
from scipy.sparse import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

## Input train data

In [497]:
train_datafile='/Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434.biom'
train_sample_metadata='/Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434_map.txt'
train_feature_metadata=''
train_target_field='age'
train_prefix='AGP'

## Input test data

In [36]:
test_datafile='/Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.biom' #gut_4575_rare_sp.csv
test_sample_metadata = '/Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/sample-metadata.tsv' #'10283_20191126-092828.txt'
test_feature_metadata='' #'skin_taxonomy.txt'
test_prefix='SAGE'
test_target_field = 'agevisit' #

In [8]:
train_table=biom.load_table(train_datafile)
train_metadata=pd.read_csv(train_sample_metadata, sep='\t')

In [37]:
test_table=biom.load_table(test_datafile)
test_metadata=pd.read_csv(test_sample_metadata, sep='\t')

In [13]:
train_df=train_table.to_dataframe(dense=False)

In [38]:
test_df=test_table.to_dataframe(dense=False)

In [167]:
train_X=train_df.T
train_X.shape

(4434, 65694)

In [305]:
test_X=test_df.T
test_X.shape

(184, 52955)

In [169]:
train_y=train_metadata[train_target_field]

In [170]:
test_y=test_metadata[test_target_field]

In [171]:
egr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=500, n_jobs=4)
#egr.fit(train_X, train_y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [943]:
pipe = Pipeline([('egr', RandomForestRegressor(max_depth=2, random_state=0, n_estimators=500, n_jobs=4))])
pipe.fit(train_X, train_y)

Pipeline(memory=None,
         steps=[('egr',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=2, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=500, n_jobs=4,
                                       oob_score=False, random_state=0,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [944]:
pred_y=pipe.predict(train_X)

In [951]:
R_squared=r2_score(train_y, pred_y)
mse=mean_squared_error(train_y, pred_y)
rmse=np.sqrt(mse)
mae=mean_absolute_error(train_y, pred_y)
print('R-squared: ', R_squared)
print('MSE: ', mse)
print('RMSE: ', rmse)
print('MAE: ', mae)

R-squared:  0.019028518323003674
MSE:  223.78499305980282
RMSE:  14.959444944910317
MAE:  12.650015515968576


In [952]:
# save the model to disk
filename = 'gut_sklearn_pipeline.pkl'
pickle.dump(pipe, open(filename, 'wb'))

## Load the pipeline object

In [953]:
# load the model from disk
loaded_pipe = pickle.load(open(filename, 'rb'))
result = loaded_pipe.score(test_X_, test_y)
print(result)

-0.8724027808581561


In [867]:
type(loaded_pipe)

sklearn.pipeline.Pipeline

In [954]:
loaded_pipe.predict(test_X_)

array([47.56843373, 47.40438979, 46.94094645, 47.82804331, 47.36287618,
       47.71140506, 47.38642674, 47.62473285, 47.66568304, 47.52114343,
       47.4154415 , 47.55576699, 47.37603269, 47.60593572, 46.91646905,
       47.49882432, 47.53479183, 47.17908984, 47.54739777, 47.46629715,
       47.10204694, 47.09372961, 47.03575925, 47.1678897 , 47.39045407,
       47.29945922, 47.68961226, 47.44147476, 47.03926096, 47.81370401,
       47.52359066, 47.16572771, 47.93289059, 47.51071429, 47.44345221,
       47.74952509, 47.21831519, 47.72383569, 47.53069438, 47.13893806,
       47.64629492, 47.71303912, 47.65330492, 47.53615209, 47.50258904,
       47.6543161 , 47.4805571 , 47.30778159, 47.62184946, 47.52278063,
       47.61055994, 47.48696219, 46.88701775, 47.10776698, 47.10691762,
       47.70463667, 47.59378653, 49.98482952, 47.7239362 , 47.4504889 ,
       47.50650425, 47.49489759, 47.39325571, 47.24204347, 47.33689337,
       47.58741216, 46.90318047, 47.93536992, 47.42320471, 47.36

In [811]:
#y_pred = egr.predict(test_X)

### Adding preprocessing steps for the test table

### Q: the test data contains 150-nt sequence features while the train data contains 100-nt sequence features.
### Solution: chop the 150-nt sequences into 100 nt ones

In [532]:
def chop_seq_feature_to_nt(x, start=0, end=100):
    '''
    Parameters
    -------
        x: pd.DataFrame 
        A table that contains sequence-like features in the columns
    Return
    -------
        x_dedup: pd.DataFrame
        A table that contain sequence-like features with desired length
    Examples
    -------
    x=pd.DataFrame({'atcttc':[1, 3, 1, 3], 'ttcttc':[1, 3, 3, 1], 
                    'aatttc':[2, 5, 3, 1], 'ttcttc':[2, 5, 3, 1],
                    'aattcc':[2, 5, 3, 1], 'aatatc':[2, 0, 0, 1]})

    '''
    ids=x.columns.tolist()
    new_ids=[i[start:end] for i in ids]
    x.columns=new_ids
    def checkIfDuplicates(listOfElems):
        ''' Check if given list contains any duplicates '''
        if len(listOfElems) == len(set(listOfElems)):
            return False
        else:
            return True
    if(checkIfDuplicates(new_ids)):
        x_dedup=x.sum(axis=1, level=0)
    else:
        x_dedup=x
    return x_dedup

In [533]:
chop_seq_feature_to_nt(x, start=0, end=3)

Unnamed: 0,atc,ttc,aat
0,2,1,6
1,5,3,10
2,3,3,6
3,1,1,3


In [534]:
test_X=chop_seq_feature_to_nt(test_X, start=0, end=100)
test_X.shape

(184, 39634)

In [188]:
def union_feature_table(train_X, test_X):
    '''
    Parameters
    ----------
    train data : pd.DataFrame
        train data table
    test data : pd.DataFrame
        test data table.
    Returns
    -------
    pd.DataFrame
        A padded test data table
    Examples
    -------
    a=pd.DataFrame({'atc':[1, 3, 1, 3], 'ttc':[1, 3, 3, 1], 'aat':[2, 5, 3, 1], 'ttc':[2, 5, 3, 1]})
    b=pd.DataFrame({'atc':[1, 3, 1, 0], 'tta':[0, 3, 1, 0], 'aaa':[2, 1, 3, 1]})
    feature_padding(a, b)

    '''
    train_feature_ids=train_X.columns.values.tolist()
    test_feature_ids=test_X.columns.values.tolist()
    train_X_uniq_f=list(set(train_feature_ids)-set(test_feature_ids))
    test_X_uniq_f=list(set(test_feature_ids)-set(train_feature_ids))
    test_zero_matrix = np.zeros(shape=(test_X.shape[0], len(train_X_uniq_f)))
    train_zero_matrix = np.zeros(shape=(train_X.shape[0], len(test_X_uniq_f)))
    test_padding_matrix=pd.DataFrame(test_zero_matrix, columns = train_X_uniq_f)
    train_padding_matrix=pd.DataFrame(train_zero_matrix, columns = test_X_uniq_f)

    new_test_X=pd.concat([test_X, test_padding_matrix], axis=1).sort_index(axis=1)
    new_train_X=pd.concat([train_X, train_padding_matrix], axis=1).sort_index(axis=1)
    return new_test_X, new_train_X


In [189]:
a_, b_ = union_feature_table(a, b)
display(a_, b_)

Unnamed: 0,A,B,C,D,E,R
0,1,0.0,0.0,0,2,0.0
1,3,0.0,0.0,3,1,0.0
2,1,0.0,0.0,1,3,0.0
3,0,0.0,0.0,0,1,0.0


Unnamed: 0,A,B,C,D,E,R
0,3,1,5,0.0,0.0,5
1,3,3,5,0.0,0.0,5
2,1,3,3,0.0,0.0,3
3,1,1,1,0.0,0.0,1


In [502]:
def pad_feature(a, b):
    '''
    Parameters
    ----------
    a : pd.DataFrame
        train data table
    b : pd.DataFrame
        test data table.
    Returns
    -------
    pd.DataFrame
        A test table with equal number of
        feature as the train table.
    Examples
    -------
    a=pd.DataFrame({'ttc':[1, 3, 1, 3], 'atc':[1, 3, 3, 1], 'aat':[2, 5, 3, 1], 'tac':[2, 5, 3, 1]})
    b=pd.DataFrame({'atc':[1, 3, 1, 0], 'ttc':[0, 3, 1, 0], 'aaa':[2, 1, 3, 1]})
    feature_padding(a, b)
    
    A	B	C	R
    0	1	0.0	0.0	0.0
    1	3	0.0	0.0	0.0
    2	1	0.0	0.0	0.0
    3	0	0.0	0.0	0.0
    
    '''
    print("The shape of train data: ", a.shape)
    a_feature_ids=a.columns.values.tolist()
    b_feature_ids=b.columns.values.tolist()
    print("The number of features in the original test data: ", len(b_feature_ids))
    a_uniq_f=list(set(a_feature_ids)-set(b_feature_ids))
    ab_shared_f=set(a_feature_ids).intersection(set(b_feature_ids))
    print("The number of features with all zeros in the new test data: ", len(a_uniq_f))
    print("The number of shared features kept in the new test data: ", len(ab_shared_f))
    b_padding_matrix = pd.DataFrame(0, index=b.index, columns=a_uniq_f)
    new_b=pd.concat([b[ab_shared_f], b_padding_matrix], axis=1)
    #print(new_b.shape)
    new_b=new_b[a_feature_ids]
    # only keep feature ids in the train table
    
    print("The shape of new test data: ", new_b.shape)
    return new_b


In [501]:
a=pd.DataFrame({'ttc':[1, 3, 1, 3], 'atc':[1, 3, 3, 1], 'aat':[2, 5, 3, 1], 'tac':[2, 5, 3, 1]})
b=pd.DataFrame({'atc':[1, 3, 1, 0], 'ttc':[0, 3, 1, 0], 'aaa':[2, 1, 3, 1]})
b_=pad_feature(a, b)
b_

The shape of train data:  (4, 4)
The number of features in the original test data:  3
The number of features with all zeros in the new test data:  2
The number of shared features kept in the new test data:  2
(4, 4)
The shape of new test data:  (4, 4)


Unnamed: 0,ttc,atc,aat,tac
0,0,1,0,0
1,3,3,0,0
2,1,1,0,0
3,0,0,0,0


### age prediction on the test dataset after feature padding

In [376]:
train_X.shape

(4434, 65694)

In [377]:
test_X.shape

(184, 52955)

In [535]:
test_X_=pad_feature(train_X, test_X)
#test_X_=pd.SparseDataFrame(test_X_)

The shape of train data:  (4434, 65694)
The number of features in the original test data:  39634
The number of features with all zeros in the new test data:  54293
The number of shared features kept in the new test data:  11401
The shape of new test data:  (184, 65694)


In [315]:
test_X_.shape

(184, 76443)

In [496]:
y_pred = egr.predict(test_X_)
y_pred 

array([47.04889674, 47.04889674, 47.04889674, 47.48043568, 48.16709794,
       47.94284281, 47.04889674, 49.62464162, 47.04889674, 48.16709794,
       47.04889674, 47.48043568, 47.04889674, 47.48043568, 46.43493596,
       47.04889674, 47.04889674, 46.43493596, 48.16709794, 47.04889674,
       47.04889674, 47.04889674, 48.16709794, 47.04889674, 47.04889674,
       47.04889674, 47.48043568, 47.04889674, 46.43493596, 47.48043568,
       47.48043568, 46.43493596, 47.48043568, 48.16709794, 48.16709794,
       48.16709794, 48.16709794, 47.48043568, 48.16709794, 47.48043568,
       47.48043568, 47.48043568, 47.48043568, 47.48043568, 47.48043568,
       47.04889674, 48.16709794, 47.48043568, 47.04889674, 47.04889674,
       47.48043568, 47.04889674, 47.04889674, 46.43493596, 49.74806727,
       47.48043568, 47.04889674, 47.04889674, 50.12888204, 49.66748061,
       47.04889674, 48.16709794, 47.48043568, 46.43493596, 47.48043568,
       47.48043568, 47.04889674, 47.48043568, 47.04889674, 47.04

### qiime2

In [506]:
import q2_sample_classifier

In [504]:
! qiime info

[32mSystem versions[0m
Python version: 3.6.7
QIIME 2 release: 2019.7
QIIME 2 version: 2019.7.0
q2cli version: 2019.7.0
[32m
Installed plugins[0m
alignment: 2019.7.0
breakaway: 0+untagged.71.g503723a
composition: 2019.7.0
cutadapt: 2019.7.0
dada2: 2019.7.0
deblur: 2019.7.0
deicode: 0.2.4
demux: 2019.7.0
diversity: 2019.7.0
emperor: 2019.7.0
feature-classifier: 2019.7.0
feature-table: 2019.7.0
fragment-insertion: 2019.7.0
gneiss: 2019.7.0
longitudinal: 2019.7.0
metadata: 2019.7.0
mmvec: 1.0.0
phylogeny: 2019.7.0
quality-control: 2019.7.0
quality-filter: 2019.7.0
qurro: 0.4.0
sample-classifier: 2019.7.1
songbird: 0.9.0
taxa: 2019.7.0
types: 2019.7.0
vsearch: 2019.7.0
[32m
Application config directory[0m
/Users/huangshi/anaconda3/envs/qiime2-2019.7/var/q2cli[0m
[32m
Getting help[0m
To get help with QIIME 2, visit https://qiime2.org[0m


In [965]:
from q2_sample_classifier._transformer import _a
from q2_sample_classifier._transformer import _b
from qiime2.plugins.sample_classifier.pipelines import regress_samples

In [None]:
class SampleEstimatorDirFmt(model.DirectoryFormat):
    version_info = model.File('sklearn_version.json', format=JSONFormat)
    sklearn_pipeline = model.File('sklearn_pipeline.tar', format=PickleFormat)

In [801]:
def _b(data: Pipeline) -> SampleEstimatorDirFmt:
    sklearn_pipeline = PickleFormat()
    with tarfile.open(str(sklearn_pipeline), 'w') as tar:
        tmpdir = model.DirectoryFormat()
        pf = os.path.join(str(tmpdir), 'sklearn_pipeline.pkl')
        for fn in joblib.dump(data, pf):
            tar.add(fn, os.path.basename(fn))
            os.unlink(fn)

    dirfmt = SampleEstimatorDirFmt()
    dirfmt.version_info.write_data(
        {'sklearn-version': sklearn.__version__}, dict)
    dirfmt.sklearn_pipeline.write_data(sklearn_pipeline, PickleFormat)

    return dirfmt

### Save scikit-learn pipeline (tar) and version info (json) on my local disk

In [956]:
with tarfile.open('gut_sklearn_pipeline/sklearn_pipeline.tar', 'w') as tar:
    tmpdir = '.'
    pf = os.path.join(str(tmpdir), 'sklearn_pipeline.pkl')
    print(pf)
    for fn in joblib.dump(pipe, pf):
        tar.add(fn, os.path.basename(fn))
        os.unlink(fn)

./sklearn_pipeline.pkl


In [958]:
with open('gut_sklearn_pipeline/sklearn_version.json', 'w') as outfile:
    json.dump({'sklearn-version': sklearn.__version__}, outfile)

In [959]:
! qiime tools import \
--type 'SampleEstimator[Regressor]' \
--input-path ./gut_sklearn_pipeline \
--output-path gut_sklearn_pipeline.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mImported ./gut_sklearn_pipeline as SampleEstimatorDirFmt to gut_sklearn_pipeline.qza[0m


In [969]:
gut_sklearn_pipeline_q2 = q2.Artifact.load('gut_sklearn_pipeline.qza')
gut_sklearn_pipeline_q2

<artifact: SampleEstimator[Regressor] uuid: 10304669-110e-4c69-bf48-ea8f96d57f6f>

In [908]:
pipe

Pipeline(memory=None,
         steps=[('egr',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=2, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=500, n_jobs=None,
                                       oob_score=False, random_state=0,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [918]:
regress_samples

<pipeline qiime2.plugins.sample_classifier.pipelines.regress_samples>

In [919]:
#regress_samples(train_table, train_metadata[train_target_field])

##  Microbiome age prediction using q2_sample_classifier.classify.predict_regression

In [1040]:
! qiime tools import \
--type 'FeatureTable[Frequency]'\
--input-path /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434.biom \
--output-path /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mImported /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434.biom as BIOMV210DirFmt to /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434.qza[0m


In [901]:
test_X_mat=test_X_.T.to_numpy()

In [914]:
test_X_table_ = biom.Table(test_X_mat, test_X_.columns, test_X_.index)
test_X_table_


65694 x 184 <class 'biom.table.Table'> with 50844 nonzero entries (0% dense)

In [1034]:
with biom.util.biom_open('/Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.biom', 'w') as f:
    test_X_table_.to_hdf5(f, "example")

In [1035]:
! qiime tools import \
--type 'FeatureTable[Frequency]'\
--input-path /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.biom \
--output-path /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mImported /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.biom as BIOMV210DirFmt to /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.qza[0m


In [1038]:
! qiime sample-classifier predict-regression \
--i-table /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.qza \
--i-sample-estimator  gut_sklearn_pipeline.qza \
--o-predictions gut_sklearn_pipeline_prediction.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[31m[1mPlugin error from sample-classifier:

  float() argument must be a string or a number, not 'dict'

Debug info has been saved to /var/folders/kr/2t02j7m97sz_xbpgqf_yl4kr0000gn/T/qiime2-q2cli-err-k_3s4jnl.log[0m


### Q2 API

In [915]:
pipe

Pipeline(memory=None,
         steps=[('egr',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=2, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=500, n_jobs=None,
                                       oob_score=False, random_state=0,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [911]:
predict_regression

<function q2_sample_classifier.classify.predict_regression(table:biom.table.Table, sample_estimator:sklearn.pipeline.Pipeline, n_jobs:int=1) -> pandas.core.series.Series>

In [973]:
predict_regression(test_X_table_, pipe)

TypeError: float() argument must be a string or a number, not 'dict'

In [979]:
! qiime tools import --show-importable-types

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
DeblurStats
DistanceMatrix
EMPPairedEndSequences
EMPSingleEndSequences
ErrorCorrectionDetails
FeatureData[AlignedSequence]
FeatureData[Conditional]
FeatureData[Differential]
FeatureData[Importance]
FeatureData[PairedEndSequence]
FeatureData[Sequence]
FeatureData[Taxonomy]
FeatureTable[Balance]
FeatureTable[Composition]
FeatureTable[Frequency]
FeatureTable[PercentileNormalized]
FeatureTable[PresenceAbsence]
FeatureTable[RelativeFrequency]
Hierarchy
MultiplexedPairedEndBarcodeInSequence
MultiplexedSingleEndBarcodeInSequence
PCoAResults
Phylogeny[Rooted]
Phylogeny[Unrooted]
Placements
QualityFilterStats
RawSequences
SampleData[AlphaDiversity]
SampleData[BooleanSeries]
SampleData[Classi

In [1054]:
! cd /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Qiime2_MicrobiomeAgePrediction

! ls .

Age.crossRF_clf.ranger.R
Age.crossRF_reg.ranger.R
[34mFigures[m[m
[34mInput[m[m
Microbiome Age prediction for new datasets.ipynb
[34mOutput[m[m
[34mQiime2_MicrobiomeAgePrediction[m[m
README.md
[34mR_MicrobiomeAgePrediction[m[m
age_distribution.png


In [None]:
! qiime sample-classifier regress-samples \
  --i-table /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Qiime2_MicrobiomeAgePrediction/.qza \
  --m-metadata-file ecam-metadata.tsv \
  --m-metadata-column month \
  --p-estimator RandomForestRegressor \
  --p-n-estimators 20 \
  --p-random-state 123 \
  --output-dir ecam-regressor

In [1049]:
! qiime sample-classifier regress-samples

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Usage: [34mqiime sample-classifier regress-samples[0m [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

[1mInputs[0m:
  [34m[4m--i-table[0m ARTIFAC

In [1016]:
! qiime sample-classifier fit-regressor \
--i-table Input/gut_data/gut_4434.qza \
--m-metadata-file Input/gut_data/gut_4434_map.txt \
--p-cv 5\
--m-metadata-column age \
--p-n-jobs 4 \
--p-parameter-tuning True \
--o-sample-estimator Output/gut_4434.regressor.qza \
--o-feature-importance Output/gut_4434.feature-importance.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved SampleEstimator[Regressor] to: Output/gut_4434.regressor.qza[0m
[32mSaved FeatureData[Importance] to: gut_4434.feature-importance.qza[0m


In [1043]:
! qiime sample-classifier predict-regression \
--i-table /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Input/gut_data/gut_4434.qza \
--i-sample-estimator  /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Qiime2_MicrobiomeAgePrediction/gut_4434.regressor.qza \
--o-predictions /Users/huangshi/MyProjects/CMI-IBM/age-prediction/Qiime2_MicrobiomeAgePrediction/gut_4434.regressor_prediction.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved SampleData[RegressorPredictions] to: gut_4434.regressor_prediction.qza[0m


In [1045]:
!qiime sample-classifier scatterplot \
  --i-predictions gut_4434.regressor_prediction.qza \
  --m-truth-file Input/gut_data/gut_4434_map.txt \
  --m-truth-column age \
  --o-visualization gut_4434.scatter.qzv

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved Visualization to: gut_4434.scatter.qzv[0m


In [1046]:
! qiime sample-classifier predict-regression \
--i-table /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/data/feature-table.padding_gut_4434.qza \
--i-sample-estimator  Output/gut_4434.regressor.qza \
--o-predictions gut_4434-SAGE.regressor_prediction.qza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved SampleData[RegressorPredictions] to: gut_4434-SAGE.regressor_prediction.qza[0m


In [1047]:
!qiime sample-classifier scatterplot \
  --i-predictions gut_4434-SAGE.regressor_prediction.qza \
  --m-truth-file /Users/huangshi/MyProjects/CMI-IBM/Datasets/SAGE/sample-metadata.tsv \
  --m-truth-column agevisit \
  --o-visualization SAGE.scatter.qzv

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved Visualization to: SAGE.scatter.qzv[0m


In [1024]:
gut_sklearn_regressor_q2 = q2.Artifact.load('Output/gut_4434.regressor.qza')

In [1042]:
gut_sklearn_regressor_predictions= q2.Artifact.load('gut_4434.regressor_prediction.qza')

In [None]:
gut_sklearn_regressor_predictions.view(view_type='')

In [1026]:
predict_regression(test_X_table_, gut_sklearn_regressor_q2)

AttributeError: 'Artifact' object has no attribute 'get_params'