In [1]:
import pandas as pd

# Absolute location of the diabetes CSV; change this if the file lives elsewhere.
DATA_PATH = '/content/diabetes.csv'

# Read the CSV into a DataFrame that the later cells use as `df`.
df = pd.read_csv(DATA_PATH)

# Preview the first 5 rows as plain text.
preview = df.head()
print(preview)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation between every pair of columns in df.
corr_matrix = df.corr()

# Render the matrix as an annotated heatmap on a 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix with Outcome")
plt.show()

# Rank every column by the absolute value of its correlation with Outcome,
# strongest first. Outcome itself is part of the ranking (correlation 1.0).
outcome_column = corr_matrix["Outcome"]
correlation_with_outcome = outcome_column.abs().sort_values(ascending=False)
print("Top correlated features:\n", correlation_with_outcome)


Top correlated features:
 Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# The three columns with the highest |correlation| to Outcome, picked by hand
# from the ranking printed by the correlation cell.
features = ['Glucose', 'BMI', 'Age']
X, y = df[features], df['Outcome']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a support vector classifier with default settings (fit returns the estimator).
# NOTE(review): the features are passed in unscaled; confirm whether
# standardizing them first was considered.
svm_model = SVC().fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Fraction of held-out rows classified correctly.
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


Logistic Regression Accuracy: 0.7467532467532467
SVM Accuracy: 0.7727272727272727


In [8]:
# Fit a logistic regression with default settings on the same training split
# used for the SVM (fit returns the estimator, so it can be chained).
log_model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred_log = log_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))


Logistic Regression Accuracy: 0.7467532467532467


In [7]:
def read_positive_number(prompt):
    """Ask the user for a number until a usable one is entered.

    Args:
        prompt: Text shown to the user by ``input``.

    Returns:
        The entered value as a float; always finite and greater than zero.

    Invalid entries (non-numeric text, NaN, infinities, zero, negatives) are
    reported and the question is asked again instead of raising ``ValueError``
    or handing a meaningless value to the models.
    """
    while True:
        raw = input(prompt)
        try:
            value = float(raw)
        except ValueError:
            print(f"{raw.strip()!r} is not a number; please try again.")
            continue
        # The chained comparison is False for NaN, +/-inf, zero and negatives.
        if 0 < value < float("inf"):
            return value
        print("Please enter a finite value greater than zero.")


def predict_diabetes(log_clf=None, svm_clf=None, feature_names=None):
    """Interactively classify one person with both trained models.

    Prompts for glucose level, BMI and age, puts them in a one-row DataFrame
    and prints each model's verdict.

    Args:
        log_clf: Fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``log_model``.
        svm_clf: Fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``svm_model``.
        feature_names: Column labels for the glucose, BMI and age values, in
            that order. Defaults to the notebook-level ``features``.

    Returns:
        None. The results are printed.
    """
    # Fall back to the objects created by the training cells so that a bare
    # predict_diabetes() call keeps working exactly as before.
    if log_clf is None:
        log_clf = log_model
    if svm_clf is None:
        svm_clf = svm_model
    if feature_names is None:
        feature_names = features

    print("\nEnter the following details:")
    glucose = read_positive_number("Glucose level: ")
    bmi = read_positive_number("BMI: ")
    age = read_positive_number("Age: ")

    # Use the training column names so the models receive matching labels.
    user_data = pd.DataFrame([[glucose, bmi, age]], columns=feature_names)

    pred_log = log_clf.predict(user_data)
    pred_svm = svm_clf.predict(user_data)

    # A predicted label of 1 is reported as diabetic, anything else as not.
    print("\n🔍 Prediction Results:")
    print("Logistic Regression:", "Diabetic" if pred_log[0] == 1 else "Not Diabetic")
    print("SVM:", "Diabetic" if pred_svm[0] == 1 else "Not Diabetic")

# Run the interactive prompt once; this waits for the glucose, BMI and age
# entries and needs log_model and svm_model from the training cells.
predict_diabetes()



Enter the following details:
Glucose level: 70 
BMI: 20.5
Age: 30

🔍 Prediction Results:
Logistic Regression: Not Diabetic
SVM: Not Diabetic


DIABETES PREDICTION MODEL:
STEPS:
>Importing the libraries.
>Reading the dataset.
>Identifying the features most correlated with Outcome.
>Using those features to train the models with SVM and Logistic Regression.
>Finding the accuracy of each classification model (Logistic Regression ≈ 0.75, SVM ≈ 0.77).
>Predicting the Outcome (whether the person is diabetic or not) from user input: Glucose level, BMI, and Age.
