In [4]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# load dataset
dataset = pd.read_excel('longen_machinelearning_5rows.xls')
dataset_raw = pd.read_excel('longen_machinelearning_5rows.xls')                        
 
# Replace 0 values by zero
dataset = dataset.fillna('zero')

# Encode text in categorical data
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
ord_enc = OrdinalEncoder()
column_names = list(dataset.columns.values)
for column in column_names:
    dataset[column] = ord_enc.fit_transform(dataset[[column]])
      
# Drop target y variables from the dataset
X = dataset.drop(['besl1', 'besl2', 'besl3', 'besl4', 'besl5'],axis=1)

# Define target y variables
y = dataset[['besl1', 'besl2', 'besl3', 'besl4', 'besl5']]

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

# Create the RandomForestClassifier with grid search
param_grid = [
     {'bootstrap': [True, False],'n_estimators': [1, 5, 10, 20],
      'max_features': [2, 4, 6, 8, 10]}]

#'min_samples_leaf': [3, 4, 5],'min_samples_split': [8, 10, 12]
#'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
# n_estimators

rfc = RandomForestClassifier()
grid_search = GridSearchCV(rfc, param_grid, cv=10, scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train,y_train)
predictions = grid_search.predict(X_test)

# Create the RandomForestClassifier without grid search
#rfc = RandomForestClassifier(n_estimators=10)
#rfc.fit(X_train,y_train)
#predictions = rfc.predict(X_test)

# Calculate accuracy (classification report for multiclass-multioutput problems not supported)
i = 0
correct = 0
incorrect = 0

for i in range(0,len(y_test)):
    if y_test.values[i][0] == predictions[i][0] and y_test.values[i][1] == predictions[i][1] and y_test.values[i][2] == predictions[i][2] and y_test.values[i][3] == predictions[i][3] and y_test.values[i][4] == predictions[i][4]:
        correct += 1
    else:
        incorrect += 1
    i += 1
    
print("Correct classified samples: {correct}".format(correct = correct))
print("Incorrect classified samples: {incorrect}".format(incorrect = incorrect))
print("Accuracy: {accuracy}%".format(accuracy = round((correct/(incorrect + correct))*100, 2)))

Correct classified samples: 277
Incorrect classified samples: 10
Accuracy: 96.52%


In [13]:
print(dataset.head())

   MUT1GEN  MUT1eiwit  MUT2GEN  MUT2eiwit  MUT3GEN  MUT3eiwit  MUT4GEN  \
0     11.0       13.0      7.0       20.0      0.0        0.0      0.0   
1      5.0       39.0      0.0        0.0      0.0        0.0      0.0   
2      5.0       39.0     12.0       31.0      0.0        0.0      0.0   
3      2.0       34.0     12.0       30.0     11.0       12.0      5.0   
4      5.0       20.0      0.0        0.0      0.0        0.0      0.0   

   MUT4eiwit  MUT5GEN  VAR1GEN  VAR1OPM  besl1  besl2  besl3  besl4  besl5  
0        0.0      3.0      5.0      6.0    3.0   18.0    0.0    6.0    2.0  
1        0.0      0.0      0.0      6.0    6.0   24.0   15.0    6.0    2.0  
2        0.0      0.0      0.0      6.0    6.0   24.0   15.0    6.0    2.0  
3        6.0      3.0      5.0      6.0    3.0    5.0   15.0    6.0    2.0  
4        0.0      0.0      0.0      6.0    4.0   24.0   15.0    6.0    2.0  


In [3]:
print(dataset_raw)

    MUT1GEN             MUT1eiwit MUT2GEN           MUT2eiwit MUT3GEN  \
0    PIK3CA         p.(Gln546Arg)    KRAS        p.(Gly12Val)       -   
1      EGFR         p.(Leu858Arg)       -                   -       -   
2      EGFR         p.(Leu858Arg)    TP53       p.(Met237Ile)       -   
3      BRAF         p.(Gly469Val)    TP53       p.(Lys351Ter)     NaN   
4      EGFR  p.(Glu746_Ala750del)       -                   -       -   
..      ...                   ...     ...                 ...     ...   
950    TP53         p.(Arg337Leu)       -                   -       -   
951  CDKN2A     p.(Pro70Glnfs*49)    TP53       p.(Arg249Ser)       -   
952    KRAS          p.(Gln61His)   STK11  p.(Gly276Alafs*11)    TP53   
953    KRAS          p.(Gly12Cys)   STK11   p.(Pro281Argfs*6)  UGT1A8   
954   SMAD4         p.(Arg361His)    TP53       p.(Val197Leu)       -   

             MUT3eiwit MUT4GEN MUT4eiwit MUT5GEN VAR1GEN VAR1OPM  \
0                    -       -         -     NaN     Na

In [2]:
print(len(dataset['besl1']))

955


In [3]:
print(y_test)

     besl1  besl2  besl3  besl4  besl5
810    3.0   14.0   15.0    6.0    2.0
299    3.0   18.0   15.0    6.0    2.0
127    4.0   24.0   15.0    6.0    2.0
98     3.0    4.0   15.0    6.0    2.0
543    3.0   17.0   15.0    6.0    2.0
..     ...    ...    ...    ...    ...
24     3.0   18.0    7.0    6.0    2.0
850    3.0   14.0   15.0    6.0    2.0
600    3.0   18.0   14.0    1.0    2.0
907    3.0   18.0   12.0    6.0    2.0
683    3.0   14.0   15.0    6.0    2.0

[287 rows x 5 columns]


In [4]:
ord_enc.categories_

[array(['Atypischmutatieprofielvoornsclc.ErwerdenreedscasereportsbeschrevenikvselectieveinhibitievandeactiverendeBRAFG469Avariantobvcombidabrafenib/trametinibdochdeactiverendeNRASmutatiekanmogelijksoptredenalsresistentiemechanisme.',
        'MutatieprofielpassendbijNSCLC.', 'zero'], dtype=object)]

In [5]:
datasets = [dataset, dataset_raw]
merged_datasets = pd.concat(datasets)
merged_datasets.to_excel("merged_datasets.xlsx")


In [6]:
# Convert predictions to pandas dataframe
predictions = pd.DataFrame(predictions, dtype=np.int)

In [7]:
# Reset index of the test set
y_test = y_test.reset_index()

In [8]:
# Merge y test with predictions
pd.set_option('display.max_rows', None)
df = y_test.join(predictions)
print(df)

     index  besl1  besl2  besl3  besl4  besl5   0   1   2  3  4
0      810    3.0   14.0   15.0    6.0    2.0   3  14  15  6  2
1      299    3.0   18.0   15.0    6.0    2.0   3  18  15  6  2
2      127    4.0   24.0   15.0    6.0    2.0   4  24  15  6  2
3       98    3.0    4.0   15.0    6.0    2.0   3   4  15  6  2
4      543    3.0   17.0   15.0    6.0    2.0   3  17  15  6  2
5      230    3.0   18.0   15.0    6.0    2.0   3  18  15  6  2
6      780    3.0   14.0   15.0    6.0    2.0   3  14  15  6  2
7       32    4.0   24.0   15.0    6.0    2.0   4  24  15  6  2
8      363    3.0   18.0    7.0    6.0    2.0   3  18   7  6  2
9      291    3.0   18.0   15.0    6.0    2.0   3  18  15  6  2
10       7    3.0    2.0   15.0    6.0    2.0   3   2  15  6  2
11     132    6.0   24.0   15.0    6.0    2.0   6  24  15  6  2
12     783    3.0   14.0   15.0    6.0    2.0   3  14  15  6  2
13     791    3.0   14.0   15.0    6.0    2.0   3  14  15  6  2
14      55    3.0   14.0   15.0    6.0  

In [9]:
# Save dataframe to an excel file
#df.to_excel("file_name.xlsx")

In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
dataset_BESL_drop = dataset.drop(['besl1', 'besl2', 'besl3', 'besl4', 'besl5'],axis=1)
#scaler.fit(dataset_BESL_drop)
#scaled_data = scaler.transform(dataset_BESL_drop)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(dataset_BESL_drop)
x_pca = pca.transform(dataset_BESL_drop)

In [11]:
dataset_BESL_drop.head()

Unnamed: 0,MUT1GEN,MUT1eiwit,MUT2GEN,MUT2eiwit,MUT3GEN,MUT3eiwit,MUT4GEN,MUT4eiwit,MUT5GEN,VAR1GEN,VAR1OPM
0,11.0,13.0,7.0,20.0,0.0,0.0,0.0,0.0,3.0,5.0,6.0
1,5.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,5.0,39.0,12.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,2.0,34.0,12.0,30.0,11.0,12.0,5.0,6.0,3.0,5.0,6.0
4,5.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [12]:
scaled_data.shape

NameError: name 'scaled_data' is not defined

In [None]:
# Reduction of 13 dimensions in just 2
x_pca.shape

In [None]:
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=dataset['besl2'], cmap='plasma')
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')

In [None]:
pca.components_

In [None]:
df_comp = pd.DataFrame(pca.components_,columns=list(dataset_BESL_drop.columns.values))

In [None]:
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma',)

In [31]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# load dataset
dataset = pd.read_excel('ML_CRC_set2.xls')
dataset_raw = pd.read_excel('ML_CRC_set2.xls')                        

# Replace 0 values by zero
dataset = dataset.fillna('zero')

# Encode text in categorical data
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
ord_enc = OrdinalEncoder()
column_names = list(dataset.columns.values)
for column in column_names:
    dataset[column] = ord_enc.fit_transform(dataset[[column]])
    

# Drop target y variables from the dataset
X = dataset.drop(['BESL1', 'BESL2', 'BESL3', 'BESL4', 'BESL5','BESL6'],axis=1)

# Define target y variables
y = dataset[['BESL1', 'BESL2', 'BESL3', 'BESL4', 'BESL5','BESL6']]

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

# Create the RandomForestClassifier with grid search
param_grid = [
     {'bootstrap': [True, False],'n_estimators': [1, 5, 10, 20],
      'max_features': [2, 4, 6, 8, 10]}]

#'min_samples_leaf': [3, 4, 5],'min_samples_split': [8, 10, 12]
#'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
# n_estimators

rfc = RandomForestClassifier()
grid_search = GridSearchCV(rfc, param_grid, cv=10, scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train,y_train)
predictions = grid_search.predict(X_test)

# Create the RandomForestClassifier without grid search
#rfc = RandomForestClassifier(n_estimators=10)
#rfc.fit(X_train,y_train)
#predictions = rfc.predict(X_test)

# Calculate accuracy (classification report for multiclass-multioutput problems not supported)
i = 0
correct = 0
incorrect = 0

for i in range(0,len(y_test)):
    if y_test.values[i][0] == predictions[i][0] and y_test.values[i][1] == predictions[i][1] and y_test.values[i][2] == predictions[i][2] and y_test.values[i][3] == predictions[i][3] and y_test.values[i][4] == predictions[i][4] and y_test.values[i][5] == predictions[i][5] :
        correct += 1
    else:
        incorrect += 1
    i += 1
    
print("Correct classified samples: {correct}".format(correct = correct))
print("Incorrect classified samples: {incorrect}".format(incorrect = incorrect))
print("Accuracy: {accuracy}%".format(accuracy = round((correct/(incorrect + correct))*100, 2)))

Correct classified samples: 297
Incorrect classified samples: 0
Accuracy: 100.0%


In [24]:
# Convert predictions to pandas dataframe
predictions = pd.DataFrame(predictions, dtype=np.int)

In [25]:
# Reset index of the test set
y_test = y_test.reset_index()

In [26]:
# Merge y test with predictions
pd.set_option('display.max_rows', None)
df = y_test.join(predictions)
print(df)

     index  BESL1  BESL2  BESL3  BESL4  BESL5  BESL6   0   1   2   3   4  5
0       36   28.0   27.0   22.0   18.0   12.0    7.0  28  27  22  18  12  7
1      842   28.0   22.0   22.0   18.0   12.0    7.0  28  22  22  18  12  7
2      421   23.0   33.0   20.0   11.0   12.0    7.0  23  33  20  11  12  7
3      687   12.0   23.0   27.0   16.0   12.0    7.0  12  23  27  16  12  7
4      755   28.0   22.0   22.0   18.0   12.0    7.0  28  22  22  18  12  7
5      347   27.0   22.0   22.0   18.0   12.0    7.0  27  22  22  18  12  7
6      833    7.0   38.0   13.0   11.0   12.0    7.0   7  38  13  11  12  7
7      890   27.0   19.0   20.0   11.0   12.0    7.0  27  19  20  11  12  7
8      486   28.0   27.0   22.0   18.0   12.0    7.0  28  27  22  18  12  7
9      299    3.0   38.0   24.0    3.0    4.0    5.0   3  38  24   3   4  5
10     182   28.0   28.0   21.0   18.0   12.0    7.0  28  28  21  18  12  7
11     180   24.0   22.0   21.0   18.0   12.0    7.0  24  22  21  18  12  7
12     814  