In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
import pandas as pd
from sklearn.feature_selection import chi2, mutual_info_classif
import numpy as np

# Loading Data

In [43]:
tr_fhb = pd.read_pickle('../Dataset/tr_fhb.pkl')
test_fhb = pd.read_pickle('../Dataset/test_fhb.pkl')
vali_fhb = pd.read_pickle('../Dataset/vali_fhb.pkl')

In [44]:
RANDOM_STATE = 5

In [45]:
germplasm_hw_tr = pd.read_pickle('../Dataset/hw_encoded_tr.pkl')
germplasm_hw_test = pd.read_pickle('../Dataset/hw_encoded_test.pkl')
germplasm_hw_val = pd.read_pickle('../Dataset/hw_encoded_val.pkl')

In [46]:
germplasm_sm_tr = pd.read_pickle('../Dataset/sm_encoded_tr.pkl')
germplasm_sm_test = pd.read_pickle('../Dataset/sm_encoded_test.pkl')
germplasm_sm_val = pd.read_pickle('../Dataset/sm_encoded_val.pkl')

# Feature selection

In [47]:
mut_info_hw = mutual_info_classif(germplasm_hw_tr.to_numpy(dtype=np.float), tr_fhb.to_numpy())
print(mut_info_hw[:10])

[0.0335133  0.         0.         0.00061058 0.00174838 0.03300673
 0.00313995 0.         0.00743667 0.00261085]


In [48]:
mut_info_hw_selected = mut_info_hw > 0
mut_info_hw_selected_ind = np.where(mut_info_hw > 0)
print(np.sum(mut_info_hw_selected))

15043


In [49]:
mut_info_sm = mutual_info_classif(germplasm_sm_tr.to_numpy(dtype=np.float), tr_fhb.to_numpy())
print(mut_info_sm[:10])

[0.         0.01718892 0.01566394 0.         0.         0.
 0.         0.01448701 0.0064845  0.04270139]


In [50]:
mut_info_sm_selected = mut_info_sm > 0
mut_info_sm_selected_ind = np.where(mut_info_hw > 0)
print(np.sum(mut_info_sm_selected))

15214


In [51]:
common_feat = np.intersect1d(mut_info_sm_selected_ind[0], mut_info_hw_selected_ind[0])

In [52]:
print(len(common_feat))

15043


# Hardy Weinberg Equilibrium

In [53]:
germplasm_hw_tr = germplasm_hw_tr.iloc[:,mut_info_hw_selected_ind[0]]

In [54]:
germplasm_hw_tr.head(5)

Name,BK_05,BK_13,BK_14,BOPA1_10012-1239,BOPA1_1007-651,BOPA1_10207-1024,BOPA1_10248-954,BOPA1_10318-572,BOPA1_10321-364,BOPA1_1038-754,...,SCRI_RS_95857,SCRI_RS_9648,SCRI_RS_9736,SCRI_RS_97418,SCRI_RS_98248,SCRI_RS_98293,SCRI_RS_98443,SCRI_RS_99344,SCRI_RS_99798,SCRI_RS_9991
G49,0.093364,0.779188,0.768328,0.694444,0.444444,0.08963,0.812224,0.486549,0.812224,0.604938,...,0.497373,0.432184,0.715173,0.390625,0.828999,0.874571,0.064053,0.458964,0.388698,0.0625
G340,0.482253,0.779188,0.768328,0.694444,0.111111,0.490865,0.812224,0.486549,0.812224,0.604938,...,0.497373,0.432184,0.715173,0.390625,0.008011,0.874571,0.55788,0.104026,0.388698,0.5625
G161,0.482253,0.207057,0.21643,0.694444,0.444444,0.419505,0.812224,0.486549,0.812224,0.345679,...,0.497373,0.11737,0.261012,0.46875,0.828999,0.874571,0.55788,0.458964,0.388698,0.0625
G175,0.482253,0.779188,0.768328,0.694444,0.444444,0.490865,0.812224,0.091488,0.812224,0.604938,...,0.086879,0.432184,0.715173,0.390625,0.828999,0.874571,0.55788,0.458964,0.388698,0.5625
G215,0.093364,0.779188,0.768328,0.027778,0.111111,0.490865,0.009755,0.486549,0.812224,0.604938,...,0.497373,0.432184,0.715173,0.390625,0.828999,0.874571,0.064053,0.458964,0.141785,0.5625


In [55]:
germplasm_hw_test = germplasm_hw_test.iloc[:, mut_info_hw_selected_ind[0]]

In [56]:
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
classifier.fit(germplasm_hw_tr.to_numpy(), tr_fhb.to_numpy())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [57]:
predicted = classifier.predict(germplasm_hw_test.to_numpy())

In [58]:
print('Classification report for Hardy Weinberg Equilibrium Features')
report = classification_report(test_fhb.to_numpy(), predicted)
print(report)
print()
conf_mat = confusion_matrix(test_fhb.to_numpy(), predicted)
print('Confusion matrix')
print(conf_mat)

Classification report for Hardy Weinberg Equilibrium Features
              precision    recall  f1-score   support

           1       0.75      0.50      0.60         6
           2       0.70      0.83      0.76        23
           3       0.67      0.55      0.60        11

    accuracy                           0.70        40
   macro avg       0.71      0.62      0.65        40
weighted avg       0.70      0.70      0.69        40


Confusion matrix
[[ 3  3  0]
 [ 1 19  3]
 [ 0  5  6]]


# Simple -1, 0, 1 encoding

In [59]:
germplasm_sm_tr = germplasm_sm_tr.iloc[:, mut_info_sm_selected_ind[0]]
germplasm_sm_test = germplasm_sm_test.iloc[:, mut_info_sm_selected_ind[0]]

In [60]:
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
classifier.fit(germplasm_sm_tr.to_numpy(), tr_fhb.to_numpy())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [61]:
predicted = classifier.predict(germplasm_sm_test.to_numpy())

In [62]:
print('Classification report for -1, 0, 1 Features')
report = classification_report(test_fhb.to_numpy(), predicted)
print(report)
print()
conf_mat = confusion_matrix(test_fhb.to_numpy(), predicted)
print('Confusion matrix')
print(conf_mat)

Classification report for -1, 0, 1 Features
              precision    recall  f1-score   support

           1       1.00      0.50      0.67         6
           2       0.67      0.87      0.75        23
           3       0.57      0.36      0.44        11

    accuracy                           0.68        40
   macro avg       0.75      0.58      0.62        40
weighted avg       0.69      0.68      0.66        40


Confusion matrix
[[ 3  3  0]
 [ 0 20  3]
 [ 0  7  4]]
