In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action = "ignore") 

In [None]:
df = pd.read_csv('diabetes.csv')
df.head()

In [None]:
df.describe().T

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
print(df.duplicated().sum())
df["Outcome"].value_counts()*100/len(df)

In [None]:
plt.figure(figsize=(15,5))
df['Outcome'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',shadow=True)
plt.title('Distribution of Outcome ')
plt.show()

In [None]:
print("Max Age: " + str(df["Age"].max()))
print("Min Age: " + str(df["Age"].min()))

## Normal Values for Features

- **Glucose**: 70-99 mg/dL (fasting)  
- **Blood Pressure**: <120/80 mmHg  
- **Skin Thickness**: 10-50 mm  
- **Insulin**: 2.6-24.9 μU/mL (fasting)  
- **BMI**: 18.5-24.9  


In [None]:
plt.figure(figsize =(15,9))
columns = df.columns.drop('Outcome')

for i, col in enumerate(columns):
    plt.subplot(3,3, i+1)
    sns.boxplot(x=col, data=df)

plt.show()


In [None]:
sns.pairplot( hue= 'Outcome', data= df)
plt.show()

In [None]:
fig, ax = plt.subplots(4, 2, figsize=(16, 16))

sns.histplot(df.Age, bins=20, kde=True, ax=ax[0, 0]) 
sns.histplot(df.Pregnancies, bins=20, kde=True, ax=ax[0, 1]) 
sns.histplot(df.Glucose, bins=20, kde=True, ax=ax[1, 0]) 
sns.histplot(df.BloodPressure, bins=20, kde=True, ax=ax[1, 1]) 
sns.histplot(df.SkinThickness, bins=20, kde=True, ax=ax[2, 0])
sns.histplot(df.Insulin, bins=20, kde=True, ax=ax[2, 1])
sns.histplot(df.DiabetesPedigreeFunction, bins=20, kde=True, ax=ax[3, 0]) 
sns.histplot(df.BMI, bins=20, kde=True, ax=ax[3, 1]) 

plt.tight_layout()
plt.show()

In [None]:
grouped_means = df.groupby("Outcome").mean()

ax = grouped_means.plot(kind="bar", figsize=(15, 6), colormap="viridis", edgecolor="black")
plt.title("Mean Values of Features by Outcome", fontsize=16)
plt.xlabel("Outcome", fontsize=14)
plt.ylabel("Mean Values", fontsize=14)
plt.legend(title="Features", fontsize=12)

#Add numbers on top of each bar
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", fontsize=10, label_type="edge")

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize =(8,6))
sns.heatmap(df.corr(),vmin= -1 ,center= 0,cmap='RdBu_r' ,annot=True)
plt.show()

In [None]:
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']
df[columns_to_replace] = df[columns_to_replace].replace(0, np.nan)

print(df.isnull().sum())

In [None]:
import missingno as msno
msno.bar(df)

In [150]:
def fill_missing_with_median(column):
    median_values = df.groupby('Outcome')[column].median()
    
    df.loc[(df['Outcome'] == 0) & (df[column].isnull()), column] = median_values[0]
    df.loc[(df['Outcome'] == 1) & (df[column].isnull()), column] = median_values[1]

for col in df.columns.drop('Outcome'):
    fill_missing_with_median(col)


df.to_csv('decisionTree.csv', index=False)

In [None]:
df.isnull().sum()

In [None]:
y = df['Outcome']
cols = ['Glucose', 'Pregnancies','DiabetesPedigreeFunction', 'Insulin', 'BMI', 'Age']
X = df[cols]

X.head()

In [136]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=41)
print(X_train.shape, y_train.shape)

#scaler = StandardScaler()
#std_df = scaler.fit_transform(X)

X_train.head()

In [138]:
from sklearn.preprocessing import RobustScaler

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

#X_train = pd.DataFrame(X_train, columns=cols, index=X_train.index if hasattr(X_train, 'index') else None)
#X_test = pd.DataFrame(X_test, columns=cols, index=X_test.index if hasattr(X_test, 'index') else None)

In [139]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    # Sigmoid activation function
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    # Fit the model using gradient descent
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient descent loop
        for _ in range(self.num_iterations):
            # Linear combination
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    # Predict probabilities for the test set
    def predict_proba(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    # Predict binary labels using a threshold
    def predict(self, X, threshold=0.6):
        y_pred_proba = self.predict_proba(X)
        return (y_pred_proba >= threshold).astype(int)

In [None]:
model = LogisticRegression(learning_rate=0.01, num_iterations=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix: \n", conf_matrix)

In [141]:
from collections import Counter
from sklearn import datasets
class KNN:
    def __init__(self, k=2):
        self.k = k

    def value(self,knn_values):
        most_common = Counter(knn_values).most_common()
        return most_common[0][0]
    
    def fit(self, X, Y):
        #X_train
        self.X = X
        #y_train
        self.Y = Y
        
    def euclidean_distance(self, point_a, point_b):
        return np.linalg.norm(point_a-point_b)
    

    def predict(self,X):
        #assign each distance to a label and then pick the most popular
        predictions = []
        for test_point in X:
            distances = []
            train_data = zip(self.X, self.Y)
            for train_point, train_label in train_data:
                distance = self.euclidean_distance(test_point, train_point)
                distances.append((distance, train_label))
            distances.sort()
            dists = distances[:self.k]
            y = [y_ for _, y_ in dists]
            predictions.append(self.value(y))
        return predictions

In [None]:
knearest =  KNN()
knearest.fit(X_train, y_train)

y_pred = knearest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix: \n", conf_matrix)

def accuracy(y_pred, y_test):
    return np.sum(y_pred==y_test)/len(y_test)

In [143]:
# make prediction on trained model with standardized data
# patient is not diabetic 1,103,30,38,83,43.3,0.183,33,0

#patient = np.array([1,103])
#patient = patient.reshape(1,-1)
#print(patient)
#y_pred = knearest.predict(patient)


#if y_pred[0] == 1:
 #   print("Patient is diabetic")
#else:
 #   print("Patient is not diabetic")



In [144]:
import numpy as np
import pandas as pd
from collections import Counter
import types
import collections
import importlib
from sklearn.model_selection import train_test_split
from sklearn import datasets

#helper class to keep data
class Node():
    def __init__(self, column_index=None, question_split=None, left=None, right=None):

        self.column_index = column_index
        self.question_split = question_split
        self.left = left
        self.right = right
   

class Leaf():
    def __init__(self,value):
        self.value = value
    
#decision tree class
class DecisionTree():
    def __init__(self,allowed_depth=2):

        #first node
        self.root = None

        #tree limit
        self.allowed_depth = allowed_depth
    
    def build(self, X,Y, depth=0):

        #number of rows and columns   
        n_rows, n_Y = np.shape(X)
        #check tree limit
        if depth>=self.allowed_depth:
            val = self.value(Y)
            return(Leaf(val))
        
        if depth<self.allowed_depth:
            #get split
            column_index,right,left,qst = self.accurate_split(X,Y, n_rows, n_Y)
            #if we have a split
            if column_index is not None:
                left_side = self.build(left[:, :-1],left[:, -1], depth+1)
                right_side = self.build(right[:, :-1],right[:, -1], depth+1)
                return Node(column_index, qst,left_side, right_side)
            else:
                val = self.value(Y)
                return(Leaf(val))
 
            
    
    def accurate_split(self, X,Y, n_x, n_y):
    
        column_index = None
        left = None
        right = None
        question_split = None
        total_ig = -99999999
        
        for column in range(n_y):
            x_values = X[:, column]
            
            for question in x_values:
                #calculate information gain
                dataset = np.concatenate((X, Y.reshape(1, -1).T), axis=1)
                data_left = np.array([row for row in dataset if row[column] <= question])
                data_right = np.array([row for row in dataset if row[column] > question])
                
                if(len(data_left)>0 and len(data_right>0)):
                    info_gain= self.calc_ig(dataset,data_left,data_right,column,question)
                    #replace information gain
                    if info_gain>total_ig and info_gain >0:
                        column_index = column
                        right = data_right
                        left = data_left
                        question_split = question
                              
        return column_index,right,left,question_split
    

    def calc_entropy(self,y,base = None):

        q = np.bincount(np.array(y, dtype=np.int64))
        ps = q / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])   

    def calc_ig(self,dataset,dataset_left,dataset_right,col,qst):

        data_left = dataset_left[:, -1]
        data_right = dataset_right[:, -1]
        data = dataset[:, -1]
        #calculate left probability
        left_weight = len(data_left) / len(data)
        #calculate right probability
        right_weight = len(data_right) / len(data)
        #get the parent entropy
        parent_entropy = self.calc_entropy(data)
        #calculate entropy for the left and right side
        left_child_entropy = left_weight*self.calc_entropy(data_left)
        right_child_entropy = right_weight*self.calc_entropy(data_right)
        #calculate gain
        gain = parent_entropy- (left_child_entropy + right_child_entropy)
        return gain

    def find_leaves(self, X):

        leaves = [self.find_leaf(x, self.root) for x in X]
        return np.array(leaves)
    
    def find_leaf(self, x, node):
        #return if leaf
        if isinstance(node, Leaf):
            return node.value
        feature_index = x[node.column_index]
        
        if feature_index<=node.question_split:
            return self.find_leaf(x, node.left)
        else:
            return self.find_leaf(x, node.right)

 
    #leaf node gets the most common dependent variable
    def value(self, Y):
        counter = Counter(Y)
        value = counter.most_common(1)[0][0]
        return value
    
    #starter method
    def fit(self, X,Y):
        self.root = self.build(X,Y)
    #calculate how many predictions were accurate
    def acc(self,label,pred_label):
        return np.sum(np.equal(label, pred_label)) / len(label)
    #print number of nodes

In [161]:
import decision_tree_prototype

diabetes = pd.read_csv('diabetes.csv')
diabetes = diabetes.drop(columns=['BloodPressure', 'SkinThickness'])

X = diabetes.iloc[:,:-1].values
y = diabetes.iloc[:,-1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

decisiontree =  DecisionTree()
decisiontree.fit(X_train,y_train)
y_pred = decisiontree.find_leaves(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix: \n", conf_matrix)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [146]:
import numpy as np

class DecisionTree:
    def __init__(self, allowed_depth=3):
        self.allowed_depth = allowed_depth
        self.root = None

    def fit(self, X, y):
        self.root = self.build(X, y, depth=0)

    def build(self, X, y, depth):
        if depth >= self.allowed_depth or len(np.unique(y)) == 1:
            return np.mean(y)
        
        column = np.random.randint(X.shape[1])
        threshold = np.mean(X.iloc[:, column].values)  # Ensure numpy array for threshold calculation
        
        left_idx = X.iloc[:, column] <= threshold
        right_idx = X.iloc[:, column] > threshold
        
        left = self.build(X[left_idx], y[left_idx], depth + 1)
        right = self.build(X[right_idx], y[right_idx], depth + 1)
        
        return (column, threshold, left, right)

    def predict_proba(self, X):
        def traverse(x, node):
            if isinstance(node, float):  # Leaf node
                return node
            
            column, threshold, left, right = node
            
            if x[column] <= threshold:
                return traverse(x, left)
            else:
                return traverse(x, right)
        
        return np.array([traverse(x, self.root) for x in X.values])  # Ensure X is in numpy array form


In [None]:

y = df['Outcome']
cols = ['Glucose', 'Pregnancies','DiabetesPedigreeFunction', 'Insulin', 'BMI', 'Age']
Xdecision = df[cols]

X_trainD, X_testD, y_trainD, y_testD = train_test_split(Xdecision, y, test_size=0.2, random_state=1234)
X_trainD.head()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Initialize and fit the DecisionTree
deci = DecisionTree(allowed_depth=3)
deci.fit(X_trainD, y_trainD)

# Make predictions using the predict_proba method
y_pred = deci.predict_proba(X_testD)

# Since the decision tree outputs probabilities, threshold them to get class predictions
y_pred_class = (y_pred > 0.5).astype(int)  # Assuming binary classification (threshold 0.5)

# Calculate accuracy and confusion matrix
accuracy = accuracy_score(y_testD, y_pred_class)
conf_matrix = confusion_matrix(y_testD, y_pred_class)

print("Accuracy:", accuracy)
print("Confusion Matrix: \n", conf_matrix)


In [None]:
deci =  DecisionTree()
deci.fit(X_trainD,y_trainD)

y_pred = deci.find_leaves(X_testD)

accuracy = accuracy_score(y_testD, y_pred)
conf_matrix = confusion_matrix(y_testD, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix: \n", conf_matrix)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_prob_lr = log_reg.predict_proba(X_test)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
auc_lr = roc_auc_score(y_test, y_prob_lr)



knn = KNN(k=5)
knn.fit(X_train, y_train)
y_prob_knn = knn.predict_proba(X_test)
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_prob_knn)
auc_knn = roc_auc_score(y_test, y_prob_knn)


decision_tree = DecisionTree(allowed_depth=3)
decision_tree.fit(X_train, y_train)
y_prob_dt = decision_tree.predict_proba(X_test)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)
auc_dt = roc_auc_score(y_test, y_prob_dt)

# Plot ROC Curves for All Algorithms
plt.figure(figsize=(10, 6))
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.2f})')
plt.plot(fpr_knn, tpr_knn, label=f'KNN (AUC = {auc_knn:.2f})')
plt.plot(fpr_dt, tpr_dt, label=f'Decision Tree (AUC = {auc_dt:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.show()
