In [102]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import pickle
import streamlit as st

In [107]:
import numpy as np
import sklearn
import streamlit
import imblearn

print("NumPy:", np.__version__)
print("Scikit-learn:", sklearn.__version__)
print("Streamlit:", streamlit.__version__)
print("imblearn:", imblearn.__version__)


NumPy: 2.0.2
Scikit-learn: 1.6.1
Streamlit: 1.43.2
imblearn: 0.13.0


In [104]:
# Load dataset
df = pd.read_csv('/content/diabetes.csv')

# Handle categorical features
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'smoking_history']
df[categorical_cols] = df[categorical_cols].astype(str)
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [123]:
diabetes_means = df.groupby('diabetes').mean()

In [112]:
# Features and target
x = df.drop(columns=['diabetes'], axis=1)
y = df['diabetes']

In [119]:
print(x.shape,y.shape)

(100000, 13) (100000,)


In [114]:
#finding missing values
df.isnull().sum()

Unnamed: 0,0
age,0
bmi,0
HbA1c_level,0
blood_glucose_level,0
diabetes,0
gender_Male,0
gender_Other,0
hypertension_1,0
heart_disease_1,0
smoking_history_current,0


In [115]:
df.value_counts(['diabetes'])

Unnamed: 0_level_0,count
diabetes,Unnamed: 1_level_1
0,91500
1,8500


#our dataset is highly imbalanced so we correct it first

In [116]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

In [118]:
print(x_resampled.shape,y_resampled.shape)

(183000, 13) (183000,)


#standardizing data

In [120]:
# Standardization
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_resampled)

#training and testing

In [121]:
# Splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=43)

# Model training
classifier = svm.SVC(kernel='linear')
classifier.fit(x_train, y_train)

In [122]:
# Model evaluation
y_train_pred = classifier.predict(x_train)
y_test_pred = classifier.predict(x_test)


In [126]:
#training and testing accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {100*train_acc:.5f} %")
print(f"Testing Accuracy: {100*test_acc:.5f} %")


Training Accuracy: 91.84016 %
Testing Accuracy: 92.04372 %


In [125]:
# Save the trained model and scaler
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [127]:
#save the features and diabetes_means
with open('features.pkl','wb') as file:
  pickle.dump(x.columns,file)
with open('diabetes_means.pkl','wb') as file:
  pickle.dump(diabetes_means,file)