In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the Pima Indians Diabetes dataset (expects diabetes.csv in the working directory).
data = pd.read_csv("diabetes.csv")
# Preview only the first rows rather than rendering the whole frame.
data.head()

In [None]:
# Separate predictors from the label by position:
# every column except the last is a feature (X); the last column is the target (y).
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Standardize the features.
# The scaler learns its statistics from the training split only and is then applied
# to both splits, so no information from the test rows leaks into the fit.
# NOTE(review): this cell rebinds X_train / X_test to their scaled versions, so running
# it a second time (without re-running the split cell) would fit the scaler on
# already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Build and fit three tree-based classifiers on the standardized training data:
# a single decision tree, a random forest, and a gradient-boosting ensemble.
# Each estimator carries its own fixed random_state.
model1 = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train, y_train)
model2 = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)
model3 = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# fit() returns the estimator itself, so ending the cell with model3 displays the same
# object a trailing model3.fit(...) call would.
model3


In [None]:
# Make predictions on the testing dataset.
# Only model3 (the gradient-boosting classifier) is used here; model1 and model2
# are fitted earlier but are not scored in this cell.
y_pred = model3.predict(X_test)

In [None]:
# Score the test-set predictions and report the accuracy as a percentage
# with two decimal places.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")

In [None]:

# Compute the confusion matrix for the test-set predictions and print it as text
# (nothing is plotted here).
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n {cm}")

In [None]:
# Persist the three fitted models and the fitted scaler to disk so they can be
# reloaded later for inference.
model_1_file_Name = 'model_objects/pima_indians_diabetes_DecisionTree_model.pkl'
model_2_file_Name = 'model_objects/pima_indians_diabetes_RandomForest_model.pkl'
model_3_file_Name = 'model_objects/pima_indians_diabetes_GradientBoosting_model.pkl'
scaler_name = 'model_objects/scaler_saved.pkl'

# open(..., 'wb') raises FileNotFoundError when the target directory is missing,
# so create it up front (a no-op if it already exists).
os.makedirs('model_objects', exist_ok=True)

# Write each artifact inside a context manager so the file is flushed and closed
# even if pickling raises; a bare open() would leave the handle dangling.
for artifact, file_name in (
    (model1, model_1_file_Name),
    (model2, model_2_file_Name),
    (model3, model_3_file_Name),
    (scaler, scaler_name),
):
    with open(file_name, 'wb') as out_file:
        pickle.dump(artifact, out_file)

### Predicting New Observations

In [2]:
# Load the saved gradient-boosting model and the fitted scaler from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only load
# artifacts that come from a trusted source.
model_3_file_Name = 'model_objects/pima_indians_diabetes_GradientBoosting_model.pkl'
scaler_name = 'model_objects/scaler_saved.pkl'

# Context managers close each file handle as soon as its object is deserialized,
# instead of leaving it open for the lifetime of the kernel.
with open(model_3_file_Name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(scaler_name, 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [3]:
# A single raw (unscaled) observation with eight feature values, stored as a
# one-row 2-D array.
# NOTE(review): the values presumably follow the feature-column order of the
# training data; confirm against diabetes.csv.
new_observations = np.array(
    [
        [6, 148, 72, 35, 0, 33.6, 0.627, 50],
    ]
)

In [4]:
# Standardize the raw observation with the scaler loaded from disk.
# NOTE(review): this rebinds `new_observations` to its scaled version, so re-running
# this cell without first re-running the cell that defines the raw array would scale
# the already-scaled values a second time.
new_observations = loaded_scaler.transform(new_observations)
new_observations



array([[ 0.63060337,  0.8213066 ,  0.11998613,  0.87217022, -0.71652335,
         0.16754413,  0.46596991,  1.36167568]])

In [5]:
# Make predictions on the new observations.
# Only the first observation is reported, so both the label and its confidence
# are taken from row 0 of the model outputs.
prediction = loaded_model.predict(new_observations)[0]
# predict_proba yields one row of class probabilities per observation. Taking the max
# over row 0 only keeps the reported confidence tied to the same observation as
# `prediction`; a max over the whole array could pick a value from a different row
# when more than one observation is passed.
prediction_prob = round(np.amax(loaded_model.predict_proba(new_observations)[0]) * 100, 2)

In [6]:
# Report the predicted class together with the model's confidence in it.
# The prediction is a binary outcome (0 or 1), indicating whether the person has
# diabetes or not.
print(f"Prediction: {prediction} as outcome with: {prediction_prob} % Probability")

Prediction: 1 as outcome with: 87.5 % Probability
