In [55]:
import os
import pickle

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Read the Pima Indians Diabetes CSV (expected next to the notebook) into a
# DataFrame and display it as the cell output.
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# Features (X) are every column except the last one;
# the last column (Outcome) is the target (y).
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [4]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [20]:
# Pipeline 1: scale the features, then fit a depth-limited decision tree.
pipeline1 = Pipeline(
    steps=[
        # Step 1 - normalize data
        ('normalizer', StandardScaler()),
        # Step 2 - classifier
        (
            'clf',
            DecisionTreeClassifier(
                max_depth=5,
                min_samples_split=5,
                min_samples_leaf=5,
                random_state=42,
            ),
        ),
    ]
)
pipeline1.steps

[('normalizer', StandardScaler()),
 ('clf',
  DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=5,
                         random_state=42))]

In [21]:
# Pipeline 2: scale the features, then fit a 50-tree random forest.
pipeline2 = Pipeline(
    steps=[
        # Step 1 - normalize data
        ('normalizer', StandardScaler()),
        # Step 2 - classifier
        ('clf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ]
)
pipeline2.steps

[('normalizer', StandardScaler()),
 ('clf', RandomForestClassifier(n_estimators=50, random_state=42))]

In [22]:
# Pipeline 3: scale the features, then fit a 100-stage gradient-boosting model.
pipeline3 = Pipeline(
    steps=[
        # Step 1 - normalize data
        ('normalizer', StandardScaler()),
        # Step 2 - classifier
        ('clf', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ]
)
pipeline3.steps

[('normalizer', StandardScaler()),
 ('clf', GradientBoostingClassifier(random_state=42))]

In [71]:
# Train all three pipelines on the same training split.
# Each one is fitted on plain NumPy arrays (.values) rather than on the
# DataFrame, so no column names are recorded on the fitted estimators. The
# scoring section of this notebook feeds raw np.array observations to the
# saved model; fitting every pipeline the same way means any of the three
# saved models can be scored like that without scikit-learn's
# "X does not have valid feature names" warning.
pipeline1.fit(X_train.values, y_train.values)
pipeline2.fit(X_train.values, y_train.values)
pipeline3.fit(X_train.values, y_train.values)

Pipeline(steps=[('normalizer', StandardScaler()),
                ('clf', GradientBoostingClassifier(random_state=42))])

In [72]:
# Predict the outcome for the held-out test rows with the gradient-boosting
# pipeline, passing a NumPy array to match how it was fitted.
y_pred = pipeline3.predict(X_test.to_numpy())

In [73]:
# Share of test rows whose predicted outcome matches the true outcome,
# printed as a percentage with two decimals.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(100 * acc))

Accuracy: 78.79%


In [74]:
# Persist each fitted pipeline (scaler + classifier) with joblib so it can be
# reloaded for scoring without retraining.
# dump() does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell raises FileNotFoundError on a
# fresh checkout.
os.makedirs('model_objects', exist_ok=True)
dump(pipeline1, 'model_objects/pipeline1.joblib')
dump(pipeline2, 'model_objects/pipeline2.joblib')
dump(pipeline3, 'model_objects/pipeline3.joblib')

['model_objects/pipeline3.joblib']

### Load saved model and score from here

In [75]:
# Reload the persisted gradient-boosting pipeline from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_pipelineModel = load(filename='model_objects/pipeline3.joblib')

In [76]:
# One observation to score, as a 2-D array with a single row.
# Values follow the feature column order of the training data.
new_observations = np.array(
    [
        [
            6,      # Pregnancies
            148,    # Glucose
            72,     # BloodPressure
            35,     # SkinThickness
            0,      # Insulin
            33.6,   # BMI
            0.627,  # DiabetesPedigreeFunction
            50,     # Age
        ]
    ]
)


In [79]:
# Inspect what the classifier actually receives: run only the pipeline's
# first step (the fitted 'normalizer' scaler) on the new observation.
loaded_pipelineModel.named_steps['normalizer'].transform(new_observations)

array([[ 0.63060337,  0.8213066 ,  0.11998613,  0.87217022, -0.71652335,
         0.16754413,  0.46596991,  1.36167568]])

In [77]:
# Predict the class for the new observation and derive a confidence figure:
# the largest predicted class probability, as a percentage rounded to two
# decimals. The maximum is taken over the whole probability array, so the
# figure is only meaningful when a single observation is scored.
prediction = loaded_pipelineModel.predict(new_observations)
prediction_prob = round(
    100 * loaded_pipelineModel.predict_proba(new_observations).max(),
    2,
)

In [78]:
# Report the predicted class (0 or 1 in the Outcome column the model was
# trained on) together with the confidence percentage computed above.
print(
    "Prediction:",
    prediction,
    "as outcome with:",
    prediction_prob,
    "% Probability",
)

Prediction: [1] as outcome with: 87.5 % Probability
