In [43]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [44]:
# Load the diabetes dataset from the working directory and preview its first rows.
csv_path = "diabetes_prediction_dataset.csv"
train_data = pd.read_csv(csv_path)
train_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [45]:
train_data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


#  > Data cleaning

In [46]:
# Separate the feature columns (X) from the label column (y).
target_column = 'diabetes'
X = train_data.drop(columns=[target_column])
y = train_data[target_column]

In [47]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Female,80.0,0,1,never,25.19,6.6,140
1,Female,54.0,0,0,No Info,27.32,6.6,80
2,Male,28.0,0,0,never,27.32,5.7,158
3,Female,36.0,0,0,current,23.45,5.0,155
4,Male,76.0,1,1,current,20.14,4.8,155
...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90
99996,Female,2.0,0,0,No Info,17.37,6.5,100
99997,Male,66.0,0,0,former,27.83,5.7,155
99998,Female,24.0,0,0,never,35.42,4.0,100


# Missing values

In [48]:
X.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
dtype: int64

In [49]:
from sklearn import preprocessing

In [50]:
# LabelEncoder instance used to turn string categories into integer codes.
label_encoder = preprocessing.LabelEncoder()

In [51]:
X['gender'] 

0        Female
1        Female
2          Male
3        Female
4          Male
          ...  
99995    Female
99996    Female
99997      Male
99998    Female
99999    Female
Name: gender, Length: 100000, dtype: object

In [52]:
# Learn the gender categories, then swap the text column for its integer codes.
gender_codes = label_encoder.fit(X['gender']).transform(X['gender'])
X['gender'] = gender_codes

In [53]:
X['gender'] 

0        0
1        0
2        1
3        0
4        1
        ..
99995    0
99996    0
99997    1
99998    0
99999    0
Name: gender, Length: 100000, dtype: int64

In [54]:
# Use a dedicated encoder for this column: refitting the shared `label_encoder`
# would overwrite the gender classes it learned, and the fitted `classes_` of
# both encoders are what define the text -> code mapping needed at inference time.
smoking_encoder = preprocessing.LabelEncoder()
X['smoking_history'] = smoking_encoder.fit_transform(X['smoking_history'])

In [55]:
X['smoking_history']

0        4
1        0
2        4
3        1
4        1
        ..
99995    0
99996    0
99997    3
99998    4
99999    1
Name: smoking_history, Length: 100000, dtype: int64

In [56]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.0,0,1,4,25.19,6.6,140
1,0,54.0,0,0,0,27.32,6.6,80
2,1,28.0,0,0,4,27.32,5.7,158
3,0,36.0,0,0,1,23.45,5.0,155
4,1,76.0,1,1,1,20.14,4.8,155
...,...,...,...,...,...,...,...,...
99995,0,80.0,0,0,0,27.32,6.2,90
99996,0,2.0,0,0,0,17.37,6.5,100
99997,1,66.0,0,0,3,27.83,5.7,155
99998,0,24.0,0,0,4,35.42,4.0,100


In [57]:
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

In [58]:
def preprocess_inputs(data):
    """Encode raw patient records with the integer codes used for training.

    The training cells label-encode ``gender`` and ``smoking_history`` over the
    whole dataset. Fitting a fresh ``LabelEncoder`` on the record(s) passed in
    here would assign codes from those few values only (a single row always
    becomes 0), so the category -> code tables are spelled out explicitly.

    Args:
        data: DataFrame containing the columns ``gender``, ``smoking_history``,
            ``hypertension`` and ``heart_disease``. Categorical values may be
            strings (matched case-insensitively) or already-encoded integers.
            The two flag columns accept "yes"/"no"-style strings as well as
            0/1 or booleans.

    Returns:
        A new DataFrame with the four columns replaced by integer codes; the
        input frame is left untouched.

    Raises:
        ValueError: if a ``gender`` value matches no known category.
    """
    # LabelEncoder numbers categories in sorted order. The codes below agree
    # with the encoded training frame shown in this notebook
    # (Female=0, Male=1; No Info=0, current=1, former=3, never=4).
    # NOTE(review): 'Other', 'ever' and 'not current' are assumed to be the
    # remaining categories in the CSV — confirm against the fitted classes_.
    gender_codes = {'female': 0, 'male': 1, 'other': 2}
    smoking_codes = {
        'no info': 0,
        'current': 1,
        'ever': 2,
        'former': 3,
        'never': 4,
        'not current': 5,
    }

    def encode_category(value, codes, column, fallback=None):
        """Map one raw value to its training code, or use ``fallback`` if given."""
        if isinstance(value, str):
            key = value.strip().lower()
            if key in codes:
                return codes[key]
        elif value in codes.values():
            # Already encoded (e.g. the function is applied twice): keep as is.
            return int(value)
        if fallback is not None:
            return fallback
        raise ValueError(
            "Unknown value {!r} for column '{}'".format(value, column)
        )

    def encode_flag(value):
        """Map a yes/no style value (string, bool or 0/1) to 1 or 0."""
        if isinstance(value, str):
            return 1 if value.strip().lower() in ('yes', 'y', '1', 'true') else 0
        return 1 if value == 1 else 0

    # Work on a copy so the caller's frame is not modified behind its back.
    data = data.copy()

    data['gender'] = data['gender'].apply(
        lambda v: encode_category(v, gender_codes, 'gender')
    )
    # An unrecognised smoking history is treated as "No Info" rather than an error.
    data['smoking_history'] = data['smoking_history'].apply(
        lambda v: encode_category(
            v, smoking_codes, 'smoking_history', fallback=smoking_codes['no info']
        )
    )

    # Convert "yes" and "no" (or 1 and 0) to 1 and 0 for hypertension and heart_disease
    data['hypertension'] = data['hypertension'].apply(encode_flag)
    data['heart_disease'] = data['heart_disease'].apply(encode_flag)

    return data

# > classifier

In [59]:
from sklearn.model_selection import train_test_split

In [60]:
# Fix the seed so the split (and every score computed from it) is reproducible,
# and stratify on the target so train and test keep the same diabetes rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [61]:
# Report the feature and label shapes of each split, one value per line.
print('train:', X_train.shape, y_train.shape, sep='\n')
print('\ntest:', X_test.shape, y_test.shape, sep='\n')

train:
(70000, 8)
(70000,)

test:
(30000, 8)
(30000,)


# > KNN

In [62]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [63]:
# k-nearest-neighbours classifier configured to vote over 5 neighbours.
# NOTE(review): the features are passed in unscaled, so columns with large
# numeric ranges (e.g. blood_glucose_level) will likely dominate the distance —
# consider standardising the features first.
knn = KNN(n_neighbors = 5)

In [64]:
# Fit the classifier on the training split.
knn.fit(X_train,y_train)

In [65]:
# Predict labels for the held-out test rows with the fitted k=5 model.
y_pred = knn.predict(X_test)

In [66]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

In [67]:
# Share of test rows predicted correctly, expressed as a percentage.
knn_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy of  knn : ", knn_accuracy_pct)

Accuracy of  knn :  95.26


In [68]:
# Empty accumulators for the k sweep: one list of labels plus one list per metric.
name, acc, pre, recall, r2, f1 = ([] for _ in range(6))

In [69]:
# Start from empty accumulators so re-running this cell does not append duplicates.
for scores in (name, acc, pre, recall, f1):
    scores.clear()

# Train one KNN model per k and record its test-set metrics (as percentages);
# without recording them the five fits would be wasted work.
for t in range(2, 20, 4):
    name.append('k='+str(t))
    model = KNN(n_neighbors = t).fit(X_train,y_train)
    # Keep the sweep's predictions in their own variable so `y_pred` still
    # holds the predictions of the k=5 model evaluated earlier.
    y_pred_k = model.predict(X_test)
    acc.append(accuracy_score(y_test,y_pred_k)*100)
    pre.append(precision_score(y_test,y_pred_k)*100)
    recall.append(recall_score(y_test,y_pred_k)*100)
    f1.append(f1_score(y_test,y_pred_k)*100)
    # r2_score is a regression metric, so it is not recorded for this classifier.

In [70]:
# plt.scatter(name,acc)

In [71]:
import pickle

In [72]:
# with open('model.pkl','wb') as file:
#     s = pickle.dump(model,file)

In [73]:
# Persist the fitted k=5 classifier to disk so it can be reloaded for predictions.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(knn, model_file)

In [74]:
def predict_diabetes(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level, model_path='model.pkl'):
    """Predict whether a single patient record is classified as diabetic.

    Builds a one-row DataFrame from the arguments (columns in the same order
    as the training features), encodes it with ``preprocess_inputs``, loads the
    pickled classifier and runs ``predict`` on the encoded row.

    Args:
        gender, smoking_history: raw categorical values, encoded by
            ``preprocess_inputs``.
        hypertension, heart_disease: yes/no style flags, encoded by
            ``preprocess_inputs``.
        age, bmi, HbA1c_level, blood_glucose_level: numeric features passed
            through unchanged.
        model_path: location of the pickled model; defaults to the file
            written by this notebook.

    Returns:
        "Yes" if the predicted label equals 1, otherwise "No".
    """
    data = pd.DataFrame({
        'gender': [gender],
        'age': [age],
        'hypertension': [hypertension],
        'heart_disease': [heart_disease],
        'smoking_history': [smoking_history],
        'bmi': [bmi],
        'HbA1c_level': [HbA1c_level],
        'blood_glucose_level': [blood_glucose_level]
    })

    data = preprocess_inputs(data)

    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file — only point model_path at a pickle you created yourself.
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    prediction = model.predict(data)

    return "Yes" if prediction[0] == 1 else "No"

# Test the predict function.
# smoking_history has to be one of the dataset's categories; "No Info" is the
# value the data uses when the smoking history is unknown ("No" is not a category).
result = predict_diabetes("Female", 43.0, "No", "No", "No Info", 27.32, 6.6, 400)
print("Diabetes Prediction:", result)


Diabetes Prediction: Yes


In [75]:
# model.predict([[1,2.0,0,0,0,16.81,6.2,140]])
# model.predict([[0,80.0,0,1,0,25.19,6.6,140]])
# model.predict([[1,43.0,0,0,0,27.32,6.6,400]])