In [4]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv('C:/Users/odhia/OneDrive/Desktop/datascienceproject/data/clean_df.csv')
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,18.25,19.98,119.6,1040.0,0.09463,0
1,13.71,20.83,90.2,577.9,0.1189,0
2,12.46,24.04,83.97,475.9,0.1186,0
3,16.02,23.24,102.7,797.8,0.08206,0
4,15.78,17.89,103.6,781.0,0.0971,0


In [7]:
X = df.drop('diagnosis',axis=1)
y = df.diagnosis
# oversamplng 
smote = SMOTE(random_state=42)
X,y = smote.fit_resample(X, y)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
def hyperparameter_tune(model,params,x,y):
    grid = GridSearchCV(model,params,cv=2,return_train_score=False)
    grid.fit(x,y)
    result = grid.best_score_
    paras = grid.best_params_
    return f"Best parameters are; {paras} with score of {result}"

In [10]:
def evaluate_model(true,pred):
    accuracy = accuracy_score(true,pred)
    score = f1_score(true,pred)
    precision = precision_score(true,pred)
    recall = recall_score(true,pred)
    
    return (accuracy,score,precision,recall)

### Logistic regression

In [11]:
logistic = LogisticRegression()
logi_param = {'penalty':['l1','l2'],'solver':['liblinear','saga']}
hyperparameter_tune(logistic,logi_param,X_train,y_train)

"Best parameters are; {'penalty': 'l1', 'solver': 'saga'} with score of 0.9087587637009973"

In [12]:
def create_experiment(experiment_name,run_name):
    # creating experiment
    mlflow.set_tracking_uri("file:C:/Users/odhia/OneDrive/Desktop/datascienceproject/notebooks/logistic_mlruns")
    mlflow.set_experiment(experiment_name)
    #creating run within the experiment
    with mlflow.start_run(run_name=run_name):
        linear_model = LogisticRegression(penalty='l1',solver='saga')
        linear_model.fit(X_train,y_train)

        # Evaluate performance
        prediction = linear_model.predict(X_test)
        (accuracy,score,precision,recall) = evaluate_model(y_test,prediction)
        
        #print out metrics
        print(f"Accuracy: {accuracy}")
        print(f"F1_score: {score}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")

        # Infer model signature
        train_preds = linear_model.predict(X_train)
        signature = infer_signature(X_train,train_preds)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param('penalty','l1')
        mlflow.log_param('solver','saga')
        mlflow.log_metric('accuracy',accuracy)
        mlflow.log_metric('f1_score',score)
        mlflow.log_metric('precision',precision)
        mlflow.log_metric('recall',recall)

        mlflow.sklearn.log_model(linear_model, "logistic_model", signature=signature)
        
       # mlflow ui --backend-store-uri file:C:/Users/odhia/OneDrive/Desktop/datascienceproject/notebooks/logistic_mlruns


In [13]:
create_experiment('logistic_exp','logistic_run')

2024/07/14 22:22:30 INFO mlflow.tracking.fluent: Experiment with name 'logistic_exp' does not exist. Creating a new experiment.


Accuracy: 0.9333333333333333
F1_score: 0.9308176100628931
Precision: 0.9487179487179487
Recall: 0.9135802469135802


### decision tree

In [14]:
tree = DecisionTreeClassifier()
tree_paras = {'criterion':['gini','entropy','log_loss'],'splitter':['best','random'],'max_depth':[2,4],'min_samples_split':[2,4,6,8],'max_features':['sqrt','log2']}
hyperparameter_tune(tree,tree_paras,X_train,y_train)

"Best parameters are; {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_split': 4, 'splitter': 'best'} with score of 0.9087505348737698"

In [15]:
def create_experiment2(experiment_name,run_name):
    # creating experiment
    mlflow.set_tracking_uri("file:C:/Users/odhia/OneDrive/Desktop/datascienceproject/notebooks/Decision_tree_mlruns")

    mlflow.set_experiment(experiment_name)
    #creating run within the experiment
    with mlflow.start_run(run_name=run_name):
        tree_model = DecisionTreeClassifier(criterion='gini',max_depth=4,max_features='sqrt',min_samples_split=4,splitter='best')
        tree_model.fit(X_train,y_train)

        # Evaluate performance
        prediction = tree_model.predict(X_test)
        (accuracy,score,precision,recall) = evaluate_model(y_test,prediction)
        
        #print out metrics
        print(f"Accuracy: {accuracy}")
        print(f"F1_score: {score}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")

        # Infer model signature
        train_preds = tree_model.predict(X_train)
        signature = infer_signature(X_train,train_preds)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param('criterion','gini')
        mlflow.log_param('max_features','sqrt')
        mlflow.log_param('splitter','best')
        mlflow.log_param('max_depth',4)
        mlflow.log_param('min_samples_split',4)
        mlflow.log_metric('accuracy',accuracy)
        mlflow.log_metric('f1_score',score)
        mlflow.log_metric('precision',precision)
        mlflow.log_metric('recall',recall)

        mlflow.sklearn.log_model(tree_model, "decision_model", signature=signature)

In [16]:
create_experiment2('tree_exp','tree_run')

2024/07/14 22:25:22 INFO mlflow.tracking.fluent: Experiment with name 'tree_exp' does not exist. Creating a new experiment.


Accuracy: 0.896969696969697
F1_score: 0.8930817610062893
Precision: 0.9102564102564102
Recall: 0.8765432098765432


### random forest

In [17]:
forest = RandomForestClassifier()
forest_paras = {'criterion':['gini','entropy','log_loss'],'n_estimators':[100,150,200],'max_depth':[2,4],'min_samples_split':[2,4,6,8],'max_features':['sqrt','log2']}
hyperparameter_tune(forest,forest_paras,X_train,y_train)

"Best parameters are; {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_split': 8, 'n_estimators': 100} with score of 0.9229617194957375"

In [18]:
def create_experiment3(experiment_name,run_name):
    # creating experiment
    mlflow.set_tracking_uri("file:C:/Users/odhia/OneDrive/Desktop/datascienceproject/notebooks/random_forest_mlruns")

    mlflow.set_experiment(experiment_name)
    #creating run within the experiment
    with mlflow.start_run(run_name=run_name):
        forest_model = RandomForestClassifier(criterion='gini',max_depth=4,max_features='sqrt',min_samples_split=8,n_estimators=100)
        forest_model.fit(X_train,y_train)

        # Evaluate performance
        prediction = forest_model.predict(X_test)
        (accuracy,score,precision,recall) = evaluate_model(y_test,prediction)
        
        #print out metrics
        print(f"Accuracy: {accuracy}")
        print(f"F1_score: {score}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")

        # Infer model signature
        train_preds = forest_model.predict(X_train)
        signature = infer_signature(X_train,train_preds)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param('criterion','gini')
        mlflow.log_param('max_features','sqrt')
        mlflow.log_param('n_estimators',100)
        mlflow.log_param('max_depth',4)
        mlflow.log_param('min_samples_split',8)
        mlflow.log_metric('accuracy',accuracy)
        mlflow.log_metric('f1_score',score)
        mlflow.log_metric('precision',precision)
        mlflow.log_metric('recall',recall)

        mlflow.sklearn.log_model(forest_model, "forest_model", signature=signature)

In [19]:
create_experiment3('randomforest_exp','random_forest_run')

2024/07/14 22:28:55 INFO mlflow.tracking.fluent: Experiment with name 'randomforest_exp' does not exist. Creating a new experiment.


Accuracy: 0.9333333333333333
F1_score: 0.9325153374233128
Precision: 0.926829268292683
Recall: 0.9382716049382716


### xgb

In [20]:
xgb = XGBClassifier()
xgb_paras = {'booster':['gbtree','gblinear','dart'],'learning_rate':[0.01,0.02,0.2,0.25,0.3],'max_depth':[2,4,6]}
hyperparameter_tune(xgb,xgb_paras,X_train,y_train)

"Best parameters are; {'booster': 'gbtree', 'learning_rate': 0.25, 'max_depth': 4} with score of 0.9168477008656726"

In [21]:
def create_experiment4(experiment_name,run_name):
    # creating experiment
    mlflow.set_tracking_uri("file:C:/Users/odhia/OneDrive/Desktop/datascienceproject/notebooks/xgb_mlruns")

    mlflow.set_experiment(experiment_name)
    #creating run within the experiment
    with mlflow.start_run(run_name=run_name):
        xgb_model = XGBClassifier(booster='gbtree',max_depth=4,learning_rate=0.25)
        xgb_model.fit(X_train,y_train)

        # Evaluate performance
        prediction = xgb_model.predict(X_test)
        (accuracy,score,precision,recall) = evaluate_model(y_test,prediction)
        
        #print out metrics
        print(f"Accuracy: {accuracy}")
        print(f"F1_score: {score}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")

        # Infer model signature
        train_preds = xgb_model.predict(X_train)
        signature = infer_signature(X_train,train_preds)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param('booster','gbtree')
        mlflow.log_param('learning_rate',0.25)
        mlflow.log_param('max_depth',4)
        mlflow.log_metric('accuracy',accuracy)
        mlflow.log_metric('f1_score',score)
        mlflow.log_metric('precision',precision)
        mlflow.log_metric('recall',recall)

        mlflow.sklearn.log_model(xgb_model, "xgb_model", signature=signature)

In [22]:
create_experiment4('xgb_exp','xgb_run')

2024/07/14 22:29:57 INFO mlflow.tracking.fluent: Experiment with name 'xgb_exp' does not exist. Creating a new experiment.


Accuracy: 0.9393939393939394
F1_score: 0.9375
Precision: 0.9493670886075949
Recall: 0.9259259259259259


### fitting best model on training data

In [23]:
classifier = RandomForestClassifier(criterion='gini',max_depth=4,max_features='sqrt',min_samples_split=8,n_estimators=100)

In [24]:
classifier.fit(X_train,y_train)

### saving objects

In [25]:
import pickle
file_path = 'C:/Users/odhia/OneDrive/Desktop/datascienceproject/models/model.pkl'
obj = classifier
with open (file_path,'wb') as file_obj:
    pickle.dump(obj,file_obj)
            

In [26]:
file_path = 'C:/Users/odhia/OneDrive/Desktop/datascienceproject/models/scaler.pkl'
obj = scaler
with open (file_path,'wb') as file_obj:
    pickle.dump(obj,file_obj)