In [1]:
import warnings 
warnings.filterwarnings('ignore')
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Read the two input tables from the shared data directory.
all_data = pd.read_csv(filepath_or_buffer='../data/data_clean.csv')
selected_data = pd.read_csv(filepath_or_buffer='../data/causal_selected.csv')

In [3]:
# Preview the first rows of the full table as loaded (diagnosis is still a letter label here).
all_data.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
3,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
4,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368


In [4]:
# Preview the first rows of the selected-feature table (diagnosis is its last column).
selected_data.head()

Unnamed: 0,perimeter_mean,area_mean,concavity_mean,concave points_mean,radius_worst,area_worst,diagnosis
0,132.9,1326.0,0.0869,0.07017,24.99,1956.0,1
1,130.0,1203.0,0.1974,0.1279,23.57,1709.0,1
2,135.1,1297.0,0.198,0.1043,22.54,1575.0,1
3,82.57,477.1,0.1578,0.08089,15.47,741.6,1
4,119.6,1040.0,0.1127,0.074,22.88,1606.0,1


In [5]:
# Swap the diagnosis labels for integer codes; the encoder is kept in `le`
# so the original labels can be recovered later if needed.
le = LabelEncoder()
all_data['diagnosis'] = le.fit(all_data['diagnosis']).transform(all_data['diagnosis'])

In [6]:
# Re-display the table to check that the diagnosis column now holds integer codes.
all_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
3,843786,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
4,844359,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368


## Scaling

In [7]:
# Full table: features are every column from position 2 onward; the target
# is the 'diagnosis' column, kept as a one-column DataFrame.
X_all, y_all = all_data.iloc[:, 2:], all_data[['diagnosis']]

# Selected table: features are every column except the last one; the target
# is again the 'diagnosis' column as a one-column DataFrame.
X_selected, y_selected = selected_data.iloc[:, :-1], selected_data[['diagnosis']]

In [8]:
# Min-max scale the full feature table.
# NOTE(review): the scaler is fitted on every row before the train/test
# split, so the held-out rows influence the scaling statistics.
scaler = MinMaxScaler()
Xs_all = scaler.fit(X_all).transform(X_all)

In [9]:
# Re-fit the same scaler object on the selected-feature table and scale it;
# after this line `scaler` no longer holds the statistics of the full table.
Xs_selected = scaler.fit(X_selected).transform(X_selected)

### Splitting

In [10]:
# Hold out 20% of the scaled full-feature rows; the fixed seed makes the split repeatable.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    Xs_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Hold out 20% of the scaled selected-feature rows, using the same seed as the full-feature split.
X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(
    Xs_selected,
    y_selected,
    test_size=0.2,
    random_state=42,
)

### Random Forest 

In [12]:
def random_forest(X, y, X_test, y_test,
                  criterion='gini', max_depth=20, n_estimators=60,
                  bootstrap=False, random_state=42):
    """Fit a random-forest pipeline, print its test accuracy and log the run to MLflow.

    The pipeline mean-imputes missing values, standardises the features and
    fits a ``RandomForestClassifier``. The hyperparameters, the accuracy and
    the fitted pipeline are recorded inside a new MLflow run.

    Parameters
    ----------
    X, y
        Training features and labels. ``y`` may be a one-column frame or a
        column vector; it is flattened to 1-D before fitting.
    X_test, y_test
        Held-out features and labels used for the accuracy score.
    criterion, max_depth, n_estimators, bootstrap
        Forwarded to ``RandomForestClassifier``. The defaults are the values
        that were previously hard-coded in this function.
    random_state
        Seed for the forest so repeated runs give the same accuracy.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    None
        The accuracy is printed and logged rather than returned.
    """
    # Local import: the notebook's import cell does not bring numpy into scope.
    import numpy as np

    # sklearn expects 1-D targets; a (n, 1) frame otherwise triggers a
    # DataConversionWarning on every fit.
    y_train = np.ravel(y)
    y_true = np.ravel(y_test)

    with mlflow.start_run():
        # Log the configuration first so a run whose fit fails still records it.
        mlflow.log_param("number of estimators", n_estimators)
        mlflow.log_param("maximum depth", max_depth)
        mlflow.log_param("Criterion", criterion)
        mlflow.log_param("bootstrap", bootstrap)
        mlflow.log_param("random_state", random_state)

        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('rf_classifier', RandomForestClassifier(n_estimators=n_estimators,
                                                     max_depth=max_depth,
                                                     criterion=criterion,
                                                     bootstrap=bootstrap,
                                                     random_state=random_state)),
        ])

        model = pipeline.fit(X, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_true, y_pred)
        print(f"the accuracy is {accuracy}")

        mlflow.log_metric("accuracy score", accuracy)
        mlflow.sklearn.log_model(model, "classifier model")

In [13]:
# Train and evaluate on the full feature set.
random_forest(
    X=X_train_all,
    y=y_train_all,
    X_test=X_test_all,
    y_test=y_test_all,
)

the accuracy is 0.9693877551020408


In [14]:
# Train and evaluate on the selected-feature subset for comparison with the full set.
random_forest(
    X=X_train_selected,
    y=y_train_selected,
    X_test=X_test_selected,
    y_test=y_test_selected,
)

the accuracy is 0.9183673469387755
