# Medulloblastoma miRNA Classifier
### Author: Shehbeel Arif

#### Preclinical Laboratory Research Unit

#### The Center for Data Driven Discovery in Biomedicine (D3b)
#### Children's Hospital of Philadelphia

In [19]:
# Basic Library for data pre-processing and handling
import pandas as pd

# Library used to split data into training and testing sets
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

# Library for measuring accuracy of ML models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Machine Learning model used
from sklearn.ensemble import RandomForestClassifier

import plotly.express as px

In [32]:
data_dir = "/Users/arifs2/OneDrive - Children's Hospital of Philadelphia/OpenPBTA miRNA Projects/OpenPBTA_miRNA_medulloblastoma/data/"

# Load in the medulloblastoma sub-data from PBTA dataset
df = pd.read_csv(data_dir + "mb_pbta_tdata.csv", 
                 #index_col="genes", 
                 #header=0
                )

# Drop Unknown subtypes
df = df[df['Subtype'] != 'Unknown']

# Split the dataset into features and labels
X = df.iloc[:,2:]
y = df['Subtype']

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(X[:5])
print(y[:5])

[[-0.56419696 -0.52401514 -1.01514139 ...  0.02850724 -0.72342765
  -1.08643658]
 [ 2.11373585 -0.7183024  -0.24805053 ... -0.69412465  2.46336588
   2.31569195]
 [ 0.03089922  0.37456341  1.1883526  ...  0.09608775  0.06070451
  -0.83745337]
 [ 1.2806012   0.39884932  1.63339528 ...  2.90564075  0.13107535
   0.87021376]
 [-0.62370657 -0.79116012 -1.03663458 ... -0.28114231 -1.53771874
  -1.3216638 ]]
0    Group3
1       SHH
2       SHH
4    Group4
5    Group4
Name: Subtype, dtype: object


In [33]:
# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(19, 2083) (7, 2083) (19,) (7,)


# Base model

In [34]:
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')

Accuracy: 0.7142857142857143


In [35]:
# Calculate a confusion matrix
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=df['Subtype'].unique().tolist(),
                y=df['Subtype'].unique().tolist()
                )
disp.show()

# XGBoost

In [6]:
import xgboost as xgb

In [7]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y

array([0, 2, 2, 1, 1, 1, 2, 0, 0, 2, 3, 0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1,
       3, 2, 2, 1])

In [8]:
# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [9]:
xgbc = xgb.XGBClassifier(random_state=0)
xgbc.fit(X_train, y_train)

In [10]:
# Make predictions using random forest classifier
xgbc_y_pred = xgbc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, xgbc_y_pred)}')

Accuracy: 1.0


In [11]:
# Calculate a confusion matrix
cm = confusion_matrix(y_test, xgbc_y_pred)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=df['Subtype'].unique().tolist(),
                y=df['Subtype'].unique().tolist()
                )
disp.show()

In [12]:
# What are the most important features?
feature_list = df.columns
feature_list = feature_list.drop('Sample_ID')
feature_list = feature_list.drop('Subtype')

imp_features = pd.Series(xgbc.feature_importances_, index=feature_list)

imp_genes = imp_features.sort_values(ascending=False).to_frame().reset_index()
imp_genes.columns = ["features", "importance"]

imp_genes_fil = imp_genes[~(imp_genes == 0.000000).any(axis=1)]
imp_genes_fil

Unnamed: 0,features,importance
0,miR-18a-3p,0.220897
1,miR-155-5p,0.179644
2,miR-1263,0.147943
3,let-7a-2-3p,0.04618
4,miR-1,0.044957
5,let-7f-2-3p,0.042382
6,miR-6878-5p,0.035884
7,miR-183-3p,0.031866
8,miR-1275,0.029042
9,miR-125b-5p,0.026226


# Testing ML on new miRNAs

In [13]:
# Create new dataframe using only 
xgb_mirnas = imp_genes_fil['features'].to_list()
X_xgb = df[xgb_mirnas]

In [14]:
X_xgb

Unnamed: 0,miR-18a-3p,miR-155-5p,miR-1263,let-7a-2-3p,miR-1,let-7f-2-3p,miR-6878-5p,miR-183-3p,miR-1275,miR-125b-5p,...,miR-181a-2-3p,miR-34b-5p,miR-106a-5p,miR-561-3p,miR-4789-3p,miR-504-5p,miR-548ag,miR-3657,miR-342-5p,miR-632
0,775,304,319,13,249,51,21,64,1234,107626,...,201,25,14641,91,2,8,0,5,399,102
1,1332,1159,5,103,649,70,23,54,803,180684,...,565,239,17542,46,24,53,0,5,418,89
2,782,658,0,33,60,58,9,0,1415,118812,...,610,45,32098,81,11,9,0,1,560,38
4,110,98,1,75,34,73,5,0,1804,546801,...,40,51,4174,32,18,197,1,0,239,12
5,51,178,2,11,151,37,50,6,1690,178977,...,51,26,2250,25,9,43,0,1,243,69
6,493,102,0,3,31,53,0,40,1052,215053,...,442,32,15277,59,21,231,0,1,531,35
7,308,307,0,34,48,61,4,0,544,83823,...,1430,28,17900,43,52,30,1,4,122,14
8,238,109,80,3,79,35,35,124,1302,116810,...,204,67,8654,31,16,26,0,8,593,63
9,188,68,105,5,19,43,12,382,1170,69509,...,193,16,6407,18,13,63,0,3,952,35
10,494,544,1,13,42,134,10,1,1487,113843,...,880,23,24361,39,36,4,0,8,270,20


In [15]:
# Split PBTA data into training and testing set
X_xgb_train, X_xgb_test, y_xgb_train, y_xgb_test = train_test_split(X_xgb, y, test_size=0.25, random_state=0)

In [16]:
xgbc_xgb = xgb.XGBClassifier(random_state=0)
xgbc_xgb.fit(X_xgb_train, y_xgb_train)

# Make predictions using random forest classifier
xgbc_xgb_y_pred = xgbc_xgb.predict(X_xgb_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, xgbc_xgb_y_pred)}')

Accuracy: 0.8571428571428571


In [17]:
# Calculate a confusion matrix
cm = confusion_matrix(y_test, xgbc_xgb_y_pred)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Subtype", y="Subtype", color="Productivity"),
                x=df['Subtype'].unique().tolist(),
                y=df['Subtype'].unique().tolist()
                )
disp.show()

In [18]:
# Initialize random forest classifier
rfc_xgb = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
rfc_xgb.fit(X_xgb_train, y_xgb_train)

# Make predictions using random forest classifier
rfc_xgb_y_pred = rfc_xgb.predict(X_xgb_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_xgb_y_pred)}')

Accuracy: 0.7142857142857143


# XGBoost Stratified K-fold

In [19]:
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev

In [20]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
lst_accu_stratified = []

xgbcv = xgb.XGBClassifier(random_state=0)

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    xgbcv.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(xgbcv.score(x_test_fold, y_test_fold))
  
# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

List of possible accuracy: [0.7692307692307693, 0.6153846153846154]

Maximum Accuracy That can be obtained from this model is: 76.92307692307693 %

Minimum Accuracy: 61.53846153846154 %

Overall Accuracy: 69.23076923076923 %

Standard Deviation is: 0.10878565864408424


# Testing on FFPE Data

In [21]:
ffpe = pd.read_csv(data_dir + "mb_ffpe_data_subset.csv")
ffpe

Unnamed: 0,sample_id,label,composition_type,cohort,molecular_subtype,let-7a-2-3p,let-7a-3p,let-7a-5p,let-7b-5p,let-7c-3p,...,miR-944,miR-95-3p,miR-95-5p,miR-9-5p,miR-96-3p,miR-96-5p,miR-98-3p,miR-99a-5p,miR-99b-3p,miR-99b-5p
0,7316-302,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group3,155,105,156785,57825,333,...,49,239,22,78526,322,145322,2,121617,1270,36225
1,7316-304,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group3,35,12,60543,14823,160,...,3,3585,2,7327,381,209862,1,118834,914,29937
2,7316-109,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,92,34,153438,92245,311,...,19,147,9,170477,21,2155,5,159050,712,18912
3,7316-14,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,366,200,309458,114088,745,...,42,654,21,393903,71,307,3,385961,842,20647
4,7316-1460,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,237,38,279789,156662,268,...,21,506,8,191015,90,20733,4,270088,941,15055
5,7316-22,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,112,26,229934,144993,321,...,15,189,4,188337,24,2659,1,268412,732,20735
6,7316-2971,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,201,153,495360,341359,1948,...,17,449,16,547502,37,477,18,685620,2099,40669
7,7316-306,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,121,86,403850,276691,597,...,22,563,7,512611,29,1082,2,553698,1321,28821
8,7316-307,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,307,94,264961,186029,518,...,17,215,11,366694,27,127,6,302319,664,14126
9,7316-309,Medulloblastoma_FFPE (n=64),RNA,mb_ffpe,Group4,369,126,391419,195495,220,...,44,471,30,482696,106,28314,10,64832,2228,55457


In [22]:
ffpe_X = ffpe.iloc[:,5:]
ffpe_y = ffpe['molecular_subtype']

In [23]:
ffpe_predictions = xgbc.predict(scaler.fit_transform(ffpe_X))
ffpe_predictions

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 2,
       2, 2, 2, 0, 2, 2, 0])

In [24]:
ffpe_predictions = encoder.inverse_transform(ffpe_predictions)
ffpe_predictions

array(['Group3', 'Group3', 'Group4', 'Group3', 'Group4', 'Group4',
       'Group4', 'Group4', 'Group4', 'Group3', 'Group4', 'Group4',
       'Group4', 'Group4', 'Group4', 'Group4', 'Group4', 'Group3',
       'Group4', 'SHH', 'Group4', 'SHH', 'SHH', 'SHH', 'SHH', 'Group3',
       'SHH', 'SHH', 'Group3'], dtype=object)

In [25]:
# Accuracy of model
print(f'Accuracy: {accuracy_score(ffpe_y, ffpe_predictions)}')

Accuracy: 0.7241379310344828


In [26]:
pd.DataFrame(ffpe_predictions, ffpe_y)

Unnamed: 0_level_0,0
molecular_subtype,Unnamed: 1_level_1
Group3,Group3
Group3,Group3
Group4,Group4
Group4,Group3
Group4,Group4
Group4,Group4
Group4,Group4
Group4,Group4
Group4,Group4
Group4,Group3


In [27]:
# Calculate a confusion matrix
cm = confusion_matrix(ffpe_y, ffpe_predictions)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=ffpe['molecular_subtype'].unique().tolist(),
                y=ffpe['molecular_subtype'].unique().tolist()
                )
disp.show()

# Testing on CSF and Plasma Data

In [28]:
# Directory
lb_data_dir = "/Users/arifs2/OneDrive - Children's Hospital of Philadelphia/OpenPBTA miRNA Projects/OpenPBTA_miRNA_medulloblastoma/data/liquid_biopsy_data/"

# Load in Liquid Biopsy Datasets
plasma = pd.read_csv(lb_data_dir + "lb_plasma.csv")
csf = pd.read_csv(lb_data_dir + "lb_csf.csv")
meta = pd.read_csv(lb_data_dir + "lb_meta.csv")

# Add meta data
plasma = pd.merge(meta, plasma, on="SDG_ID")
csf = pd.merge(meta, csf, on="SDG_ID")

plasma

Unnamed: 0,Study Subject ID,SDG_ID,Short_histology,Specimen_x,Case_type,Tumor_Subtype,WHO_Grade,Specimen_y,CTRL_ANT1,CTRL_ANT2,...,miR-944,miR-95-3p,miR-95-5p,miR-9-5p,miR-96-3p,miR-96-5p,miR-98-3p,miR-99a-5p,miR-99b-3p,miR-99b-5p
0,LB00010,15635-38,ATRT,plasma,Initial CNS Tumor,Not Applicable,4,plasma,8,10,...,0,94,7,33,0,227,42,5254,84,922
1,LB00010,15635-182,ATRT,plasma,Progressive,Not Applicable,4,plasma,28,52,...,57,138,116,58,40,95,102,2704,132,1315
2,LB00072,15635-129,ATRT,plasma,Recurrence,Not Applicable,4,plasma,6,5,...,3,24,9,14,6,331,11,1942,33,311
3,LB00076,15635-135,ATRT,"plasma, CSF",Initial CNS Tumor,Not Applicable,4,plasma,0,3,...,0,106,79,15,9,692,39,2130,60,850
4,LB00004,15635-31,EP,plasma,Recurrence,Not Available,3,plasma,41,95,...,110,176,96,27,118,491,36,3686,167,879
5,LB00012,15635-42,EP,plasma,Initial CNS Tumor,ST-EPN-RELA,3,plasma,23,8,...,10,127,67,0,0,114,68,3873,55,1579
6,LB00024,15635-58,EP,"plasma, CSF",Initial CNS Tumor,PF-EPN-A,3,plasma,36,33,...,24,168,151,67,49,55,66,5039,199,1151
7,LB00024,15635-215,EP,plasma,Recurrence,PF-EPN-A,3,plasma,22,50,...,20,75,24,18,19,123,23,4375,91,962
8,LB00036,15635-75,EP,"plasma, CSF",Initial CNS Tumor,PF-EPN-A,3,plasma,2,4,...,1,52,40,13,6,176,27,2036,80,568
9,LB00039,15635-81,EP,"plasma, CSF",Recurrence,ST-EPN-RELA,3,plasma,78,106,...,78,223,166,101,151,272,86,2808,177,708


In [29]:
# Subset Medulloblastoma
plasma = plasma[plasma['Short_histology'] == 'MB']
csf = csf[csf['Short_histology'] == 'MB']

plasma_X = plasma.iloc[:,27:]
csf_X = csf.iloc[:,27:]

In [30]:
#csf['Tumor_Subtype']
csf_y = ['Group3/4', 'Group3/4', 'Group3/4', 'WNT', 'Group3/4', 'Group3/4', 'Group3/4', 'Group3/4', 'SHH']

#plasma['Tumor_Subtype']
plasma_y = ['Group3/4', 'Group3/4', 'Group3/4', 'WNT', 'Group3/4', 'Group3/4', 'Group3/4', 'SHH', 'Group3/4', 'SHH']

In [31]:
csf_predictions = xgbc.predict(scaler.fit_transform(csf_X))
csf_predictions = encoder.inverse_transform(csf_predictions)
csf_predictions

array(['Group4', 'Group4', 'Group4', 'Group4', 'Group4', 'Group4',
       'Group4', 'SHH', 'Group4'], dtype=object)

In [32]:
plasma_predictions = xgbc.predict(scaler.fit_transform(plasma_X))
plasma_predictions = encoder.inverse_transform(plasma_predictions)
plasma_predictions

array(['SHH', 'Group4', 'WNT', 'SHH', 'Group4', 'Group4', 'Group3', 'SHH',
       'SHH', 'SHH'], dtype=object)

In [33]:
# Reformat predictions to Liquid Biopsy class
csf_predictions = ['Group3/4', 'Group3/4', 'Group3/4', 'Group3/4', 'Group3/4', 'Group3/4', 'Group3/4', 'SHH', 'Group3/4']
plasma_predictions = ['SHH', 'Group3/4', 'WNT', 'SHH', 'Group3/4', 'Group3/4', 'Group3/4', 'SHH', 'SHH', 'SHH']

In [34]:
# Accuracy of model
print(f'CSF Accuracy: {accuracy_score(csf_y, csf_predictions)}')

CSF Accuracy: 0.6666666666666666


In [35]:
# Calculate a confusion matrix
cm = confusion_matrix(csf_y, csf_predictions)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=csf['Tumor_Subtype'].unique().tolist(),
                y=csf['Tumor_Subtype'].unique().tolist()
                )
disp.show()

In [36]:
# Accuracy of model
print(f'Plasma Accuracy: {accuracy_score(plasma_y, plasma_predictions)}')

Plasma Accuracy: 0.6


In [37]:
# Calculate a confusion matrix
cm = confusion_matrix(plasma_y, plasma_predictions)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=plasma['Tumor_Subtype'].unique().tolist(),
                y=plasma['Tumor_Subtype'].unique().tolist()
                )
disp.show()

---

# Training on FFPE Data

In [38]:
ffpe = pd.read_csv(data_dir + "mb_ffpe_data_subset.csv")

ffpe_X = ffpe.iloc[:,5:]
ffpe_y = ffpe['molecular_subtype']
ffpe_y = encoder.fit_transform(ffpe_y)

In [39]:
# Split PBTA data into training and testing set
ffpe_X_train, ffpe_X_test, ffpe_y_train, ffpe_y_test = train_test_split(ffpe_X, ffpe_y, test_size=0.25, random_state=0)


In [40]:
ffpe_xgbc = xgb.XGBClassifier(random_state=0)
ffpe_xgbc.fit(ffpe_X_train, ffpe_y_train)

In [41]:
# Make predictions using random forest classifier
ffpe_xgbc_y_pred = ffpe_xgbc.predict(ffpe_X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(ffpe_y_test, ffpe_xgbc_y_pred)}')

Accuracy: 0.625


---

## Using four miRNAs found by Variance Threshold method
#### ['miR-124-3p', 'miR-125b-5p', 'miR-451a', 'miR-9-5p']

In [24]:
# Split into train-test set
X = df[['miR-124-3p', 'miR-125b-5p', 'miR-451a', 'miR-9-5p']]
y = df['Subtype']

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

#Sanity check
print(X[:5])
print(y[:5])

[[-0.56723433 -0.70497078 -0.54858481 -0.5498604 ]
 [ 0.18040925 -0.11090829 -0.56381406 -0.26847852]
 [-0.55089093 -0.61401314 -0.8283757  -0.38079983]
 [-0.07881786  2.86612879 -0.25732109  1.97864704]
 [-0.21499796 -0.12478856  0.13588365 -0.68131933]]
0    Group3
1       SHH
2       SHH
4    Group4
5    Group4
Name: Subtype, dtype: object


In [25]:
# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(19, 4) (7, 4) (19,) (7,)


## Base model

In [26]:
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')

Accuracy: 0.8571428571428571


In [27]:
# Calculate a confusion matrix
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
                x=df['Subtype'].unique().tolist(),
                y=df['Subtype'].unique().tolist()
                )
disp.show()

## XGBoost CV

In [11]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y

array([0, 2, 2, 1, 1, 1, 2, 0, 0, 2, 3, 0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1,
       3, 2, 2, 1])

In [12]:
# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [15]:
import xgboost as xgb

xgbc = xgb.XGBClassifier(random_state=0)
xgbc.fit(X_train, y_train)

# Make predictions using random forest classifier
xgbc_y_pred = xgbc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, xgbc_y_pred)}')

Accuracy: 0.7142857142857143


In [16]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev


# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
lst_accu_stratified = []

xgbcv = xgb.XGBClassifier(random_state=0)

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    xgbcv.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(xgbcv.score(x_test_fold, y_test_fold))
  
# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

List of possible accuracy: [0.6923076923076923, 0.46153846153846156]

Maximum Accuracy That can be obtained from this model is: 69.23076923076923 %

Minimum Accuracy: 46.15384615384615 %

Overall Accuracy: 57.692307692307686 %

Standard Deviation is: 0.16317848796612633
