In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### First, load the data, do some light cleaning, and convert the labels into numerical values

In [2]:
# Load the processed squamous expression table (space-separated text file).
expression_raw = pd.read_table("final/processed_squamous_done.txt", sep=" ")

# Encode the class labels as integers: Normal -> 0, tumor -> 1.
# Only the `label` column is touched, so no feature column can be rewritten
# by accident.
label_codes = {"Normal": 0, "tumor": 1}
encoded_label = expression_raw["label"].map(label_codes)

# `map` yields NaN for any value missing from `label_codes`; fail loudly here
# instead of letting an unexpected label string slip into training.
assert encoded_label.notna().all(), "unexpected value in the 'label' column"

df = expression_raw.assign(label=encoded_label.astype(int))

# Preview the first rows only rather than rendering the whole frame.
df.head(10)

Unnamed: 0,ABCA4,ABCB4,ABCC2,ACSL6,ACTL8,ADAMTS18,ADAMTS6,ADAMTS8,ADAMTS9-AS1,ADCYAP1R1,...,WIF1,XKR4,XKRX,ZBTB16,ZC3H12D,ZFHX4-AS1,ZFR2,ZNF536,ZNF560,label
0,177,89,101,122,1,26,674,8241,919,757,...,24149,59,398,5548,336,0,49,100,12,0
1,187,38,69,66,0,220,46,2092,110,456,...,5418,52,52,580,222,0,25,28,3,0
2,326,688,118,292,1,16,233,1189,106,76,...,4880,136,183,821,1721,1,30,13,6,0
3,56,9,36,39,0,7,109,4978,351,323,...,8050,14,159,2638,75,0,14,24,3,0
4,125,44,58,85,1,5,206,8034,662,148,...,16317,39,148,9723,174,0,24,116,3,0
5,90,52,76,81,0,6,249,2245,220,26,...,9486,41,184,478,92,0,48,21,9,0
6,98,25,62,54,0,3,84,3356,171,158,...,18224,18,117,1906,113,0,25,45,3,0
7,57,29,79,53,0,20,109,3283,150,300,...,5456,9,71,580,137,0,20,76,6,0
8,107,16,48,33,0,1,78,2407,144,97,...,3188,65,117,829,106,0,82,9,3,0
9,316,39,29,58,0,9,53,5510,200,483,...,16181,112,62,3917,88,2,16,48,19,0


### Dividing the data into attributes and labels

In [3]:
# Features: every column except the class label.
X = df.drop(columns="label")
# Target: the integer-encoded class label.
y = df["label"]

### Dividing the data into training and test sets (60% / 40%)

In [4]:
# Hold out 40% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.4,
)

### The features span a wide range of values, so standardize them to a comparable scale before training (the scaler is fit on the training split only)

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test set does not influence them.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Training a random forest classifier on the data; `n_estimators` sets the number of trees in the forest

In [6]:
# Fit a 20-tree random forest with a fixed seed. Despite its name, `regressor`
# holds a classifier; the name is kept because later cells refer to it.
regressor = RandomForestClassifier(random_state=0, n_estimators=20).fit(X_train, y_train)

# Predict class labels for the held-out test split.
y_pred = regressor.predict(X_test)

### Evaluate the model using the confusion matrix, the classification report, and the accuracy score

In [7]:
# Summarize test-set performance: confusion matrix, per-class report, and
# overall accuracy, printed in that order.
evaluation_outputs = (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
)
for evaluation_output in evaluation_outputs:
    print(evaluation_output)

[[ 21   0]
 [  1 199]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.99      1.00       200

    accuracy                           1.00       221
   macro avg       0.98      1.00      0.99       221
weighted avg       1.00      1.00      1.00       221

0.995475113122172


### Trying a different number of estimators (5 trees)

In [8]:
# Repeat the experiment with a 5-tree forest and the same fixed seed.
regressor = RandomForestClassifier(random_state=0, n_estimators=5).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Print the confusion matrix, per-class report, and accuracy, one per line.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)

[[ 20   1]
 [  0 200]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        21
           1       1.00      1.00      1.00       200

    accuracy                           1.00       221
   macro avg       1.00      0.98      0.99       221
weighted avg       1.00      1.00      1.00       221

0.995475113122172


### Trying a different number of estimators (10 trees)

In [9]:
# Repeat the experiment with a 10-tree forest and the same fixed seed.
regressor = RandomForestClassifier(random_state=0, n_estimators=10).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Print the confusion matrix, per-class report, and accuracy, one per line.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)

[[ 21   0]
 [  1 199]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.99      1.00       200

    accuracy                           1.00       221
   macro avg       0.98      1.00      0.99       221
weighted avg       1.00      1.00      1.00       221

0.995475113122172


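#### So changing the number of estimators did not change the accuracy (0.9955 in all three runs); only the single misclassified sample differs (one normal sample with 5 trees, one tumor sample with 10 or 20 trees)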

In [10]:
# Inspect how many outputs the most recently fitted forest was trained on;
# `y` is a single label column, and the stored output below shows 1.
regressor.n_outputs_

1