In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
np.random.seed(1000)

In [3]:
TF_SNP_count = pd.read_csv("../../../data/product/tf_snp_count.csv")

In [4]:
expression_matrix = pd.read_csv("../../../data/expression_matrix.txt", delimiter='\t')

In [5]:
target_expression = pd.read_csv("../expression_matrix_tf_targets_only.csv", header=0)
target_expression_matrix = target_expression.set_index('CONDITION')

In [6]:
#genotype_marix = pd.read_csv("../matrix_genotypes.csv")

In [7]:
TF = TF_SNP_count.iloc[0:49,]

In [8]:
TF_expression_matrix = expression_matrix.iloc[TF['ID'],].T.drop('ID').apply(lambda x: x.fillna(x.mean()), axis=0)

In [9]:
TF_RF = RandomForestRegressor(max_depth=10, n_estimators=2000, n_jobs=-1)

In [10]:
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(TF_expression_matrix)

10

In [11]:
RF_score = []
for train_index, test_index in kf.split(TF_expression_matrix):
    X_train, X_test = TF_expression_matrix.iloc[train_index], TF_expression_matrix.iloc[test_index]
    y_train, y_test = target_expression_matrix.iloc[train_index], target_expression_matrix.iloc[test_index]
    TF_RF.fit(X_train, y_train)
    print(TF_RF.score(X_test,y_test))
    RF_score.append(TF_RF.score(X_test,y_test))


0.1318970908703989
0.051126628213334406
0.07293258027861217
0.18459438973113002
0.1247737988126436
0.03150094872774459
0.18493124406250847
0.08333879357816455
0.0010552044183511744
0.10602827207779808


In [19]:
sum(RF_score)/10

0.0972178950770686

In [16]:
# Train new regressor with unique SNPs within TFs added as input
#
#

In [12]:
np.random.seed(1000)

In [13]:
matrix_genotypes_hotcoded_snps = pd.read_csv("../matrix_genotypes_hotcoded_snps_collapsed.csv")

In [14]:
genotypes_hotcoded_snps = matrix_genotypes_hotcoded_snps.set_index('CONDITION')

In [15]:
TF_SNP = pd.concat([TF_expression_matrix, genotypes_hotcoded_snps], axis=1).reindex(TF_expression_matrix.index)
print(TF_expression_matrix.index)
print(TF_SNP.index)

Index(['1_1_d', '1_3_d', '1_4_d', '1_5_c', '2_2_d', '2_3_d', '2_4_a', '2_5_d',
       '2_6_d', '3_1_d',
       ...
       '22_4_d', '22_5_d', '23_3_d', '23_5_d', '24_1_d', '25_1_d', '25_3_d',
       '25_4_d', '26_1_d', '26_2_d'],
      dtype='object', length=112)
Index(['1_1_d', '1_3_d', '1_4_d', '1_5_c', '2_2_d', '2_3_d', '2_4_a', '2_5_d',
       '2_6_d', '3_1_d',
       ...
       '22_4_d', '22_5_d', '23_3_d', '23_5_d', '24_1_d', '25_1_d', '25_3_d',
       '25_4_d', '26_1_d', '26_2_d'],
      dtype='object', length=112)


In [16]:
TF_SNP_RF = RandomForestRegressor(max_depth=10, n_estimators=2000, n_jobs=-1)

In [17]:
#kf2 = KFold(n_splits=10, shuffle=True)
#kf2.get_n_splits(TF_SNP)
kf2 = kf

In [18]:
TF_SNP_RF_score = []

importances = []
for train_index2, test_index2 in kf2.split(TF_SNP):
    X_train2, X_test2 = TF_SNP.iloc[train_index2], TF_SNP.iloc[test_index2]
    y_train2, y_test2 = target_expression_matrix.iloc[train_index2], target_expression_matrix.iloc[test_index2]
    TF_SNP_RF.fit(X_train2, y_train2)
    print(TF_SNP_RF.score(X_test2,y_test2))
    TF_SNP_RF_score.append(TF_SNP_RF.score(X_test2,y_test2))
    importances.append(TF_SNP_RF.feature_importances_)

0.12584596469969042
0.0445774845008279
0.07238889290263249
0.18216710293217922
0.12838042668687555
0.031868143415531805
0.1763274656051975
0.08765567508125846
-0.00015184226222712466
0.10058222957086614


In [20]:
sum(TF_SNP_RF_score)/10

0.09496415431328324

In [21]:
df = pd.DataFrame(data=importances, columns=TF_SNP.columns.values)

df.to_csv('rf_importances.csv', index=False)

In [22]:
df

Unnamed: 0,5513,78,4414,336,4442,64,1677,2394,2283,1925,...,SNP_2464_2,SNP_2516_0,SNP_2516_1,SNP_2516_2,SNP_81_0,SNP_81_1,SNP_81_2,SNP_1643_0,SNP_1643_1,SNP_1643_2
0,0.013135,0.012125,0.015394,0.011213,0.016325,0.013867,0.011185,0.010275,0.012572,0.016274,...,0.000245,0.002319,0.00241,0.000124,0.002245,0.002137,0.000511,0.002204,0.002185,0.003142
1,0.014232,0.011359,0.021595,0.010525,0.016246,0.013462,0.012946,0.011482,0.013518,0.016387,...,0.000178,0.002137,0.002187,0.000148,0.002448,0.003154,0.000393,0.002229,0.002061,0.001868
2,0.013544,0.012103,0.017741,0.012094,0.01587,0.014391,0.012534,0.009937,0.015123,0.015721,...,0.000211,0.002244,0.002288,0.00013,0.002878,0.002446,0.000374,0.002048,0.002438,0.001714
3,0.013847,0.012545,0.015872,0.011949,0.016493,0.013771,0.013841,0.010467,0.014662,0.012808,...,0.000193,0.002273,0.002145,0.000124,0.00249,0.002566,0.000516,0.002188,0.002258,0.002147
4,0.010348,0.012927,0.014764,0.012095,0.013901,0.014478,0.012647,0.011731,0.012873,0.015781,...,0.0,0.002312,0.002153,0.0,0.002421,0.002595,0.000543,0.002216,0.002278,0.001218
5,0.013016,0.011839,0.015482,0.011442,0.015425,0.01127,0.012724,0.011116,0.012855,0.014667,...,0.000223,0.002117,0.002337,0.000172,0.002588,0.002086,0.00057,0.002001,0.002284,0.002762
6,0.014364,0.012521,0.01639,0.012819,0.015011,0.013701,0.010983,0.012244,0.013479,0.013346,...,0.000116,0.002287,0.002293,0.000116,0.00264,0.002867,0.000601,0.00218,0.002367,0.002094
7,0.018483,0.012108,0.017222,0.011164,0.014944,0.014411,0.011947,0.010071,0.013295,0.017153,...,0.000206,0.002214,0.002077,0.000127,0.002208,0.00238,0.000398,0.002232,0.002265,0.002043
8,0.010763,0.011987,0.014513,0.011174,0.015174,0.01186,0.010082,0.010029,0.012718,0.015372,...,0.000304,0.002279,0.002634,0.000143,0.002816,0.002597,0.000448,0.002022,0.002058,0.001111
9,0.016055,0.012645,0.015113,0.010829,0.015536,0.013683,0.011074,0.010696,0.012516,0.015329,...,0.000218,0.002559,0.00253,0.000143,0.002188,0.002483,0.000457,0.001956,0.001984,0.003611


In [23]:
# Train new regressor with only SNPs found to be important (25) within TFs added as input
#
#

In [24]:
np.random.seed(1000)

In [25]:
genotypes_hotcoded_snps_important = pd.read_csv("../matrix_genotypes_hotcoded_snps_only_important.csv").set_index('CONDITION')

In [26]:
TF_SNP_important = pd.concat([TF_expression_matrix, genotypes_hotcoded_snps_important], axis=1).reindex(TF_expression_matrix.index)
print(TF_SNP_important.index)
print(TF_expression_matrix.index)

Index(['1_1_d', '1_3_d', '1_4_d', '1_5_c', '2_2_d', '2_3_d', '2_4_a', '2_5_d',
       '2_6_d', '3_1_d',
       ...
       '22_4_d', '22_5_d', '23_3_d', '23_5_d', '24_1_d', '25_1_d', '25_3_d',
       '25_4_d', '26_1_d', '26_2_d'],
      dtype='object', length=112)
Index(['1_1_d', '1_3_d', '1_4_d', '1_5_c', '2_2_d', '2_3_d', '2_4_a', '2_5_d',
       '2_6_d', '3_1_d',
       ...
       '22_4_d', '22_5_d', '23_3_d', '23_5_d', '24_1_d', '25_1_d', '25_3_d',
       '25_4_d', '26_1_d', '26_2_d'],
      dtype='object', length=112)


In [27]:
TF_SNP_important_RF = RandomForestRegressor(max_depth=10, n_estimators=2000, n_jobs=-1)

In [28]:
#kf3 = KFold(n_splits=10, shuffle=True)
#kf3.get_n_splits(TF_SNP_important)
kf3=kf

In [29]:
TF_SNP_important_RF_score = []

for train_index3, test_index3 in kf3.split(TF_SNP_important):
    X_train3, X_test3 = TF_SNP_important.iloc[train_index3], TF_SNP_important.iloc[test_index3]
    y_train3, y_test3 = target_expression_matrix.iloc[train_index3], target_expression_matrix.iloc[test_index3]
    TF_SNP_important_RF.fit(X_train3, y_train3)
    print(TF_SNP_important_RF.score(X_test3,y_test3))
    TF_SNP_important_RF_score.append(TF_SNP_important_RF.score(X_test3,y_test3))

0.1253099546934164
0.05406039738297309
0.07460014429380975
0.1822697478080574
0.12827642505334025
0.031036280222506197
0.18227537699125795
0.08276009141919154
0.0032847040750725296
0.10667577662159858


In [31]:
sum(TF_SNP_important_RF_score)/10

0.09705488985612234