In [2]:
import numpy as np
import pandas as pd
import feyn
import sklearn.model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import f_classif
from scipy.stats import pointbiserialr


This version of feyn and the QLattice is available for academic, personal, and non-commercial use. By using the community version of this software you agree to the terms and conditions which can be found at `https://abzu.ai/privacy`.


In [3]:
#Load data set
data =  pd.read_csv('C:/Users/riyak/OneDrive/Documents/NASA_2022_Summer/QL Ready Data/QL_GLDS104_Methyl^MRNAseq.csv')
data

Unnamed: 0,Treatment,ENSMUSG00000113626,ENSMUSG00000110162,ENSMUSG00000045215,ENSMUSG00000085301,ENSMUSG00000026734,ENSMUSG00000097241,ENSMUSG00000107724,ENSMUSG00000111544,ENSMUSG00000034573,...,ENSMUSG00000112734_RNA-seq,ENSMUSG00000114196_RNA-seq,ENSMUSG00000116114_RNA-seq,ENSMUSG00000116656_RNA-seq,ENSMUSG00000116725_RNA-seq,ENSMUSG00000116851_RNA-seq,ENSMUSG00000117310_RNA-seq,ENSMUSG00000117545_RNA-seq,ENSMUSG00000118332_RNA-seq,ENSMUSG00000118383_RNA-seq
0,1,0.755343,0.72185,0.780815,0.708512,0.654772,0.706823,0.906832,0.838259,0.723763,...,8.485787,8.429594,6.941024,10.039852,6.730906,7.764235,13.33237,9.224914,11.382675,8.930691
1,1,0.772216,0.721509,0.77441,0.715839,0.677542,0.751631,0.925455,0.86113,0.720216,...,8.68211,8.627871,7.049384,10.040283,6.780464,7.553072,13.228448,9.126525,11.423001,8.942703
2,1,0.760143,0.759885,0.766649,0.704012,0.681387,0.721888,0.912857,0.860676,0.724975,...,8.238998,8.327797,7.153846,9.862945,6.783814,7.616695,13.207462,8.993847,11.551848,8.796899
3,1,0.746479,0.781561,0.766315,0.716535,0.664805,0.738743,0.939297,0.879868,0.722738,...,8.296385,8.381055,6.923701,9.931099,6.594703,8.200239,13.092402,8.753577,11.428009,8.90507
4,1,0.759619,0.766614,0.765966,0.714433,0.663422,0.707736,0.92955,0.871503,0.710516,...,8.082772,8.716437,7.009062,10.35689,6.790497,7.59653,13.153932,8.457618,11.507192,8.963571
5,1,0.765239,0.715026,0.780118,0.71265,0.667402,0.708685,0.908277,0.886603,0.717203,...,8.315773,8.588713,6.951939,10.032072,6.683158,7.855334,13.32119,8.686237,11.774283,8.893084
6,0,0.770421,0.760375,0.783456,0.727156,0.661015,0.733296,0.944771,0.885449,0.720343,...,7.785353,9.024014,7.580535,10.347728,7.436379,7.151843,13.765087,8.184308,12.113236,9.117377
7,0,0.741404,0.754255,0.766469,0.717731,0.670908,0.729202,0.930693,0.871145,0.717327,...,7.541227,9.399885,7.560817,11.089381,7.31696,6.928587,13.866007,8.020834,12.288919,9.912607
8,0,0.77347,0.784512,0.77006,0.712157,0.668352,0.736842,0.93135,0.845191,0.710484,...,7.746957,9.314143,7.440791,10.845202,7.229329,7.087395,13.933837,8.301332,12.132041,9.644087
9,0,0.747198,0.783058,0.761592,0.714055,0.644201,0.719477,0.896127,0.848395,0.706831,...,7.704403,9.472558,7.377847,11.006183,7.180896,7.08357,13.936322,8.043197,11.846512,9.759722


In [12]:
#Record categorical data types in dataset
stypes = {}
target = "Treatment"
for f in data.columns:
    if data[f].dtype =='object':
        stypes[f] = 'c'

In [24]:
average_120 = []
average_12 = []
features = []
ql = feyn.QLattice()
for i in range(0,11):
    
    #train: leave one out cross-val
    train = data.drop(data.iloc[[i], :].index[0])
    
    # Instantiate a QLattice
    feyn.QLattice(random_seed = i)
    
    # Setting semantic types
    stypes = stypes

    # Set number of epochs
    n_epochs = 20

    # Initialize the list of models
    models = []
    
    # Sample and fit
    for epoch in range(n_epochs):
    
        # Sample models (no data here yet)
        models += ql.sample_models(
            input_names=train.columns, 
            output_name=target, 
            kind='classification', 
            stypes=stypes,
            max_complexity=4
        )
    
        # Fit the models with train data
        models = feyn.fit_models(models, train, loss_function='squared_error')
    
        # Remove worst performing models
        models = feyn.prune_models(models)
        
        from feyn.filters import ExcludeFunctions
        
        # Update QLattice with the models sorted by loss
        ql.update(models)
        
    
    # Find the 10 best models
    best_models = feyn.get_diverse_models(models=models, n=10)
    
    for j in range(10):
        feyn.show_model(best_models[j], update_display=False)
        average_120.append(best_models[j].accuracy_score(data))
        features.extend(best_models[j].features)
    best_model = models[0]
    average_12.append(best_model.accuracy_score(data))
    
    


In [25]:
from statistics import mean
mean(average_120)


1.0

In [26]:
mean(average_12)

1.0

In [27]:
features

['ENSMUSG00000021838_RNA-seq',
 'ENSMUSG00000039509_RNA-seq',
 'ENSMUSG00000083012_RNA-seq',
 'ENSMUSG00000025453_RNA-seq',
 'ENSMUSG00000029469_RNA-seq',
 'ENSMUSG00000010461_RNA-seq',
 'ENSMUSG00000078566_RNA-seq',
 'ENSMUSG00000038349_RNA-seq',
 'ENSMUSG00000107724',
 'ENSMUSG00000028841_RNA-seq',
 'ENSMUSG00000072929',
 'ENSMUSG00000031111_RNA-seq',
 'ENSMUSG00000041594_RNA-seq',
 'ENSMUSG00000083012_RNA-seq',
 'ENSMUSG00000029469_RNA-seq',
 'ENSMUSG00000040883_RNA-seq',
 'ENSMUSG00000083012_RNA-seq',
 'ENSMUSG00000086080',
 'ENSMUSG00000054277_RNA-seq',
 'ENSMUSG00000028841_RNA-seq',
 'ENSMUSG00000002020_RNA-seq',
 'ENSMUSG00000033161_RNA-seq',
 'ENSMUSG00000117310_RNA-seq',
 'ENSMUSG00000041153_RNA-seq',
 'ENSMUSG00000031990_RNA-seq',
 'ENSMUSG00000038763_RNA-seq',
 'ENSMUSG00000083012_RNA-seq',
 'ENSMUSG00000028373_RNA-seq',
 'ENSMUSG00000041153_RNA-seq',
 'ENSMUSG00000036585_RNA-seq',
 'ENSMUSG00000032114_RNA-seq',
 'ENSMUSG00000041143_RNA-seq',
 'ENSMUSG00000041594_RNA-seq',
 

In [28]:
#finding unique features

array = np.array(features)
unique, counts = np.unique(array, return_counts=True)
result = np.column_stack((unique, counts))
print(result)
len(result)

[['ENSMUSG00000002020_RNA-seq' '2']
 ['ENSMUSG00000002980_RNA-seq' '3']
 ['ENSMUSG00000004043_RNA-seq' '1']
 ['ENSMUSG00000005958' '1']
 ['ENSMUSG00000008658_RNA-seq' '1']
 ['ENSMUSG00000010461_RNA-seq' '1']
 ['ENSMUSG00000018334_RNA-seq' '1']
 ['ENSMUSG00000019768_RNA-seq' '1']
 ['ENSMUSG00000021481_RNA-seq' '1']
 ['ENSMUSG00000021559_RNA-seq' '1']
 ['ENSMUSG00000021759_RNA-seq' '1']
 ['ENSMUSG00000021760_RNA-seq' '1']
 ['ENSMUSG00000021838_RNA-seq' '2']
 ['ENSMUSG00000022464_RNA-seq' '1']
 ['ENSMUSG00000022808_RNA-seq' '1']
 ['ENSMUSG00000022893_RNA-seq' '2']
 ['ENSMUSG00000024617_RNA-seq' '1']
 ['ENSMUSG00000025453_RNA-seq' '1']
 ['ENSMUSG00000026864_RNA-seq' '2']
 ['ENSMUSG00000027068_RNA-seq' '1']
 ['ENSMUSG00000027253_RNA-seq' '6']
 ['ENSMUSG00000027784_RNA-seq' '1']
 ['ENSMUSG00000028328_RNA-seq' '3']
 ['ENSMUSG00000028373_RNA-seq' '1']
 ['ENSMUSG00000028841_RNA-seq' '2']
 ['ENSMUSG00000028974_RNA-seq' '1']
 ['ENSMUSG00000029469_RNA-seq' '2']
 ['ENSMUSG00000029862_RNA-seq' '1']


82