In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score 
from sklearn import svm

In [2]:
# Read the diabetes data from the CSV file located in the notebook's working directory.
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Separate the target from the features:
#   Y holds the 'Outcome' column, X holds every other column.
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns=['Outcome'])

In [4]:
# Standardize the feature columns and rebind X to the standardized values.
#
# The scaler is always fitted on the raw feature columns taken straight from
# `diabetes_dataset`, never on the current value of `X`. Fitting on `X` made this cell
# non-idempotent: running it a second time refitted the scaler on already-standardized
# values, and `predict_input_data` would then transform raw inputs with the wrong statistics.
#
# NOTE(review): the scaler is fitted on every row, including the rows that are later held
# out by train_test_split, so test-set statistics influence the training features.
scaler = StandardScaler()
X = scaler.fit_transform(diabetes_dataset.drop(columns='Outcome'))

In [5]:
# Hold out 20% of the rows for testing, stratified on the target so both splits keep the
# same class balance; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [6]:
def train_and_evaluate_model(classifier, X_train, Y_train, X_test, Y_test):
    """Fit ``classifier`` on the training split and score it on both splits.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, Y_train : training features and labels.
    X_test, Y_test : testing features and labels.

    Returns
    -------
    tuple
        ``(training_accuracy, testing_accuracy, classifier)`` where ``classifier`` is the
        same object that was passed in, now fitted.
    """
    classifier.fit(X_train, Y_train)

    def _accuracy(features, labels):
        # Accuracy of the fitted classifier's predictions against the true labels.
        return accuracy_score(labels, classifier.predict(features))

    training_accuracy = _accuracy(X_train, Y_train)
    testing_accuracy = _accuracy(X_test, Y_test)

    return training_accuracy, testing_accuracy, classifier

In [7]:
def random_forest_classifier(X_train, Y_train, X_test, Y_test):
    """Tune a random forest with a 5-fold grid search, then train and evaluate the winner.

    The grid search scores each hyperparameter combination by accuracy on ``X_train`` /
    ``Y_train`` only. The best estimator it finds is then handed to
    ``train_and_evaluate_model``.

    Returns
    -------
    tuple
        ``(training_accuracy, testing_accuracy, fitted_classifier)``.
    """
    # Candidate values tried exhaustively for each hyperparameter.
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    tuner = GridSearchCV(
        estimator=RandomForestClassifier(random_state=2),
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    tuner.fit(X_train, Y_train)

    # NOTE(review): with GridSearchCV's default refit behaviour, best_estimator_ is presumably
    # already fitted on X_train, so the fit inside train_and_evaluate_model repeats that work
    # -- confirm before removing it.
    return train_and_evaluate_model(tuner.best_estimator_, X_train, Y_train, X_test, Y_test)


In [8]:
def svm_classifier(X_train, Y_train, X_test, Y_test):
    """Train and evaluate a support vector classifier with a linear kernel.

    Returns
    -------
    tuple
        ``(training_accuracy, testing_accuracy, fitted_classifier)``.
    """
    return train_and_evaluate_model(
        svm.SVC(kernel='linear'),
        X_train,
        Y_train,
        X_test,
        Y_test,
    )


In [9]:
def gradient_boosting_classifier(X_train, Y_train, X_test, Y_test):
    """Train and evaluate a gradient boosting classifier seeded with ``random_state=2``.

    Returns
    -------
    tuple
        ``(training_accuracy, testing_accuracy, fitted_classifier)``.
    """
    return train_and_evaluate_model(
        GradientBoostingClassifier(random_state=2),
        X_train,
        Y_train,
        X_test,
        Y_test,
    )

In [10]:
def evaluate_models(models, X, Y, test_size=0.2, random_state=2):
    """Train every model on one shared train/test split and pick the best by test accuracy.

    Parameters
    ----------
    models : dict
        Maps a display name to a callable ``f(X_train, Y_train, X_test, Y_test)`` that
        returns ``(training_accuracy, testing_accuracy, trained_model)``.
    X, Y : features and labels; the split is stratified on ``Y``.
    test_size : fraction of rows held out for testing (default 0.2).
    random_state : seed for the split (default 2).

    Returns
    -------
    tuple
        ``(results, best_model_name, best_model)``. ``results`` maps each model name to a
        dict with "Training Accuracy" and "Testing Accuracy". When ``models`` is empty the
        result is ``({}, None, None)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state
    )

    results = {}
    best_model_name = None
    best_model = None
    # Start below any achievable accuracy so that a model scoring exactly 0.0 is still
    # selected; a sentinel of 0 with a strict ">" would leave best_model as None.
    best_testing_accuracy = float("-inf")

    for model_name, model_function in models.items():
        training_accuracy, testing_accuracy, trained_model = model_function(
            X_train, Y_train, X_test, Y_test
        )
        results[model_name] = {
            "Training Accuracy": training_accuracy,
            "Testing Accuracy": testing_accuracy,
        }

        # Strict comparison: on a tie the model that appears first in `models` is kept.
        if testing_accuracy > best_testing_accuracy:
            best_testing_accuracy = testing_accuracy
            best_model_name = model_name
            best_model = trained_model

    return results, best_model_name, best_model

In [11]:
# Registry mapping each model's display name to the function that trains and evaluates it.
models = dict([
    ("Random Forest", random_forest_classifier),
    ("SVM", svm_classifier),
    ("Gradient Boosting", gradient_boosting_classifier),
])

In [None]:
# Train every registered model on the same split and keep the one with the highest
# testing accuracy.
results, best_model_name, best_model = evaluate_models(models=models, X=X, Y=Y)

In [13]:
# Report the training and testing accuracy of each model, then name the selected one.
for model_name in results:
    accuracies = results[model_name]
    print(f"{model_name}: Training Accuracy = {accuracies['Training Accuracy']:.4f}, Testing Accuracy = {accuracies['Testing Accuracy']:.4f}")

print(f"\nBest model based on testing accuracy: {best_model_name}")

Random Forest: Training Accuracy = 0.9609, Testing Accuracy = 0.7273
SVM: Training Accuracy = 0.7866, Testing Accuracy = 0.7727
Gradient Boosting: Training Accuracy = 0.9251, Testing Accuracy = 0.7013

Best model based on testing accuracy: SVM


In [14]:
def predict_input_data(input_data):
    """Predict the outcome for a single instance using the selected best model.

    Reads the notebook-level ``diabetes_dataset`` (for feature names), ``scaler`` (to
    standardize the input) and ``best_model`` (to predict).

    Parameters
    ----------
    input_data : sequence of numbers
        One value per feature column. The values are labelled with the dataset's feature
        column names in order, so they must be supplied in that same order.

    Returns
    -------
    The array returned by ``best_model.predict`` for the single standardized row.
    """
    # Select feature names by dropping the target by name, consistent with how X was built,
    # instead of assuming 'Outcome' is the last column of the file.
    feature_names = diabetes_dataset.columns.drop('Outcome')

    # A single instance: reshape the flat sequence into one row with one column per feature.
    input_data_reshaped = np.asarray(input_data).reshape(1, -1)

    # Wrap the row in a DataFrame carrying the original feature names before scaling.
    input_data_df = pd.DataFrame(input_data_reshaped, columns=feature_names)

    # Apply the standardization learned by the already-fitted scaler.
    input_data_standardized = scaler.transform(input_data_df)

    return best_model.predict(input_data_standardized)

In [15]:
# Feature values for one example instance -- presumably listed in the same order as the
# dataset's feature columns; verify against the CSV header.
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
prediction = predict_input_data(input_data)

print(f"\nPrediction: {prediction}")


Prediction: [1]
