In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split

In [None]:
# Load the diabetes dataset; the path is resolved relative to the kernel's
# working directory.
data = pd.read_csv("diabetes1.csv")

# Summarise the frame as (rows, columns) instead of rendering the whole table
data.shape

In [None]:
# Showing the top 5 rows of the dataset
data.head()

In [None]:
# Columns whose 0 entries are treated as placeholders for missing measurements,
# mapped to the statistic used to fill them (median for Glucose and
# BloodPressure, mean for BMI, SkinThickness and Insulin).
zero_fill_strategy = {
    'Glucose': 'median',
    'BloodPressure': 'median',
    'BMI': 'mean',
    'SkinThickness': 'mean',
    'Insulin': 'mean',
}

for column, statistic in zero_fill_strategy.items():
    # Mask the placeholder zeros BEFORE computing the statistic; otherwise the
    # zeros themselves take part in the mean/median and pull the fill value down.
    observed = data[column].replace(0, np.nan)
    data[column] = observed.fillna(observed.agg(statistic))

# Display the updated DataFrame
data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the pairwise column correlations of `data` as an annotated heatmap on
# an explicitly created 10x8 inch axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, fmt=".3f", cmap="YlGnBu", ax=ax)
ax.set_title("Correlation heatmap")


In [None]:
# Initialize the QuantileTransformer. n_quantiles defaults to 1000; cap it at
# the number of rows so scikit-learn does not have to warn and clamp it when
# the dataset is smaller than that.
quartile = QuantileTransformer(n_quantiles=min(1000, len(data)))

# Transform the data. Note that `X` is rebound to the feature frame in the
# train/test split cell further down.
X = quartile.fit_transform(data)

# Convert the transformed array back to a DataFrame, taking the labels from
# `data` itself so the column names can never drift from the actual CSV layout.
# NOTE(review): the 'Outcome' target goes through the transformer together with
# the features, and `dataset` is not referenced by the later cells visible
# here - confirm whether the model was meant to be trained on it.
dataset = pd.DataFrame(X, columns=data.columns, index=data.index)

# Show the top 5 rows of the transformed dataset
dataset.head()


In [None]:
# Splitting the dependent and independent features
X = data.drop(columns=["Outcome"])
Y = data["Outcome"]

# Splitting the dataset into the training and testing dataset.
# stratify=Y keeps the class proportions of Outcome the same in both parts,
# matching the stratified folds used later for the grid search.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10, stratify=Y
)

# Report (rows, columns); DataFrame.size would be rows * columns, which is not
# the number of samples.
print("The size of the training dataset: ", X_train.shape)
print("The size of the testing dataset: ", X_test.shape)



In [None]:
# No cell visible in this notebook adds a "predictions" column to X_test, so an
# unconditional drop raises KeyError on a fresh kernel. errors="ignore" removes
# the column when it is present and is a no-op otherwise.
X_test = X_test.drop(columns="predictions", errors="ignore")

# Check and compare the columns of X_train and X_test
assert list(X_train.columns) == list(X_test.columns), (
    f"Column mismatch: train={list(X_train.columns)} test={list(X_test.columns)}"
)

In [None]:
# Create a baseline DecisionTreeClassifier model.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same tree and the same scores.
model = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=10)

# Train the model: X_train holds the features, Y_train the target
model.fit(X_train, Y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the test features with the fitted model and score them
# against the true test labels.
Y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)

# Report the resulting accuracy
print(f'Accuracy: {accuracy}')


In [None]:
# Importing the required libraries
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the analyze_grid function to analyze GridSearchCV results and make predictions
def analyze_grid(grid, X_eval=None, Y_eval=None):
    '''
    Summarise a fitted grid search and report its predictions on evaluation data.

    Prints, in order: the best parameter combination and its score, one line
    per evaluated combination (mean score and twice the standard deviation of
    the score across folds), and a classification report for the predictions
    ``grid`` makes on the evaluation features.

    Parameters
    ----------
    grid : fitted search object
        Must expose ``best_params_``, ``best_score_``, ``cv_results_`` and
        ``predict``.
    X_eval : optional
        Features to predict on. Defaults to the notebook-level ``X_test``.
    Y_eval : optional
        True labels for ``X_eval``. Defaults to the notebook-level ``Y_test``.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    # Fall back to the notebook globals so existing analyze_grid(grid) calls
    # keep working unchanged.
    if X_eval is None:
        X_eval = X_test
    if Y_eval is None:
        Y_eval = Y_test

    # Printing the best parameter and accuracy score
    print("Tuned hyperparameters: ", grid.best_params_)
    print("Accuracy Score:", grid.best_score_)

    mean_values = grid.cv_results_["mean_test_score"]
    std_values = grid.cv_results_["std_test_score"]
    for m, s, p in zip(mean_values, std_values, grid.cv_results_["params"]):
        # The doubling has to happen inside the braces; outside them the text
        # "* 2" is printed literally next to the undoubled value.
        print(f"Mean: {m}, Std * 2: {s * 2}, Params: {p}")

    print("The classification Report:")
    Y_pred = grid.predict(X_eval)
    print(classification_report(Y_eval, Y_pred))
    print()

# Define the Decision Tree template and its hyperparameters.
# The template gets its own name: GridSearchCV fits clones of the estimator it
# is given, so the template itself stays unfitted. Binding it to `model` would
# replace the trained baseline with an untrained tree.
dt_estimator = DecisionTreeClassifier(random_state=10)
criterion_list = ['gini', 'entropy']
max_depth_values = [None, 10, 20, 30, 40]

# Define the grid search
grid_dt = dict(criterion=criterion_list, max_depth=max_depth_values)
# 10 stratified folds: with 100 folds each validation fold holds only a few
# rows, which makes the per-fold accuracy (and its std) extremely coarse and
# multiplies the number of fits by ten.
cross_val = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
# error_score='raise' surfaces a failing fit instead of silently recording it
# as an accuracy of 0.
grid_search_cv = GridSearchCV(
    estimator=dt_estimator,
    param_grid=grid_dt,
    cv=cross_val,
    scoring='accuracy',
    error_score='raise',
)
dt_result = grid_search_cv.fit(X_train, Y_train)

# Keep `model` pointing at a trained estimator: the best parameter combination
# refit on the full training set.
model = dt_result.best_estimator_

# Result of Hyper Parameters of Decision Tree
analyze_grid(dt_result)

In [None]:
import joblib

# Save the tuned model to a file.
# dt_result.best_estimator_ is the tree refit by the grid search with the best
# parameters. The bare `model` name is not used here because the grid-search
# cell rebinds it to an estimator that is never fitted directly.
joblib.dump(dt_result.best_estimator_, 'diabetesmodel.pkl')
