In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### First, load the data, do some basic cleaning, and convert the labels into numerical values

In [3]:
# Load the processed expression matrix and discard the leftover CSV index
# column. Dropping without `inplace=True` keeps the step a single expression.
df = pd.read_csv("processed_data.csv").drop(columns="Unnamed: 0")

# Encode the class labels as integers (Normal -> 0, tumor -> 1). Only the
# `label` column is touched: a whole-frame `replace` scans every gene column
# and relies on implicit object->int downcasting, which pandas 2.2 deprecates.
# The explicit cast also fails loudly if an unexpected label value is present.
df["label"] = df["label"].map({"Normal": 0, "tumor": 1}).astype(int)
df

Unnamed: 0,A2M,A2ML1,A2MP1,A4GALT,AADAC,AADACL2,AADACP1,AARD,AATK,ABCA12,...,ZP1,ZP2,ZP3,ZP4,ZPLD1,ZSCAN18,ZWILCH,ZWINT,ZYG11A,label
0,18.025602,4.771998,3.709543,10.552056,8.400921,0.000000,3.661155,8.999742,12.168918,3.888337,...,2.154481,3.559218,6.731842,0.000000,2.007401,11.468336,8.582399,7.710252,3.132114,0
1,18.204083,4.050321,5.694897,10.288325,8.598946,2.349598,5.525601,9.683000,10.832310,4.249902,...,2.096754,4.119961,3.977148,0.000000,0.863435,11.030114,9.202867,8.417243,3.200637,0
2,16.808548,4.288578,5.284253,10.117551,8.706583,4.528257,4.221875,7.528757,9.626440,3.919147,...,2.436959,3.161402,6.089750,0.000000,0.913032,10.571301,9.359899,9.611862,3.832382,0
3,18.279758,2.585570,5.443655,10.508523,8.511491,3.286056,3.460094,7.470366,12.496896,2.248503,...,2.585570,1.807875,5.742182,0.000000,0.000000,11.039304,8.616356,7.578154,2.248503,0
4,18.494754,2.927068,4.424751,10.493543,8.782259,3.447374,5.853208,6.848966,11.674256,2.105281,...,1.214825,1.214825,7.244206,0.000000,0.731673,11.476082,8.720342,7.647387,1.576130,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,13.679174,12.591374,0.000000,9.576083,11.870922,11.527822,9.430882,4.107728,7.065755,10.361356,...,3.341312,1.599346,9.297612,3.604703,0.000000,10.540161,11.148862,11.377966,6.288031,1
94,14.533470,13.072505,1.097912,11.138606,6.922639,6.817160,5.313797,3.339646,7.708266,11.796178,...,4.178451,0.000000,9.514627,2.319491,5.250375,9.007787,10.650952,10.985133,5.693483,1
95,13.639406,10.904838,1.884657,11.986527,5.096319,4.174203,5.096319,1.482900,7.510211,8.844386,...,2.674755,0.924138,10.457959,0.000000,3.761892,10.278812,10.992614,11.242869,4.310962,1
96,14.560743,12.083626,1.373020,9.426123,6.272878,5.697214,4.437732,5.320411,6.900432,11.403955,...,2.063587,0.000000,9.594332,0.000000,1.373020,7.329188,11.770541,13.150590,8.735920,1


### Dividing the data into attributes and labels

In [4]:
# Select the features and the target by column name instead of the hard-coded
# position 7725, so the split stays correct if the number of gene columns in
# the CSV changes. `label` is the last column of the frame shown above.
X = df.drop(columns="label").values
y = df["label"].values

### Dividing the data into training and testing sets

In [5]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

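### The features span a wide range of values, so we standardize them (the scaler is fit on the training set only). Note that tree-based models such as random forests are largely insensitive to feature scaling, so this step matters mainly if other models are tried.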

In [10]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Training a random forest classifier; `n_estimators` sets the number of trees in the forest

In [7]:
# Fit a 20-tree random forest on the training split, then predict labels for
# the held-out samples. Despite its name, `regressor` holds a classifier.
regressor = RandomForestClassifier(
    n_estimators=20,
    random_state=0,
).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

### Evaluate the model using the confusion matrix, classification report, and accuracy score

In [8]:
# Report the confusion matrix, the per-class precision/recall/F1 table, and
# the overall accuracy, one after another.
print(
    "\n".join(
        str(result)
        for result in [
            confusion_matrix(y_test, y_pred),
            classification_report(y_test, y_pred),
            accuracy_score(y_test, y_pred),
        ]
    )
)

[[10  0]
 [ 0 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

1.0


### Trying a different number of estimators (5 trees)

In [12]:
# Refit with a smaller forest (5 trees) and print the same three metrics:
# confusion matrix, classification report, and accuracy.
regressor = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)

[[10  0]
 [ 0 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

1.0


### Trying a different number of estimators (10 trees)

In [13]:
# Refit with a 10-tree forest and print the confusion matrix, the
# classification report, and the accuracy score, in that order.
regressor = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
).fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(
    *(
        score(y_test, y_pred)
        for score in (confusion_matrix, classification_report, accuracy_score)
    ),
    sep="\n",
)

[[10  0]
 [ 0 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

1.0


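#### Changing the number of estimators (5, 10, or 20) did not change the results: every model reaches 100% accuracy on the 20-sample test set. Because the test set is so small, this should be confirmed with cross-validation before drawing conclusions.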