<a href="https://colab.research.google.com/github/rikabhusssen/Rikab-Hussen/blob/main/Machine_Learning_Project_Diabetes_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

In [3]:
print(diabetes_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
print(diabetes_dataset.shape)

(768, 9)


In [5]:
print(diabetes_dataset.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [6]:
print(diabetes_dataset['Outcome'].value_counts())


Outcome
0    500
1    268
Name: count, dtype: int64


In [7]:
print(diabetes_dataset.groupby('Outcome').mean())


         Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
Outcome                                                                      
0           3.298000  109.980000      68.184000      19.664000   68.792000   
1           4.865672  141.257463      70.824627      22.164179  100.335821   

               BMI  DiabetesPedigreeFunction        Age  
Outcome                                                  
0        30.304200                  0.429734  31.190000  
1        35.142537                  0.550500  37.067164  


In [8]:
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [9]:
scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(standardized_data, Y, test_size=0.2, stratify=Y, random_state=2)

In [11]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


In [12]:
svm_parameters = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}
svm_classifier = GridSearchCV(svm.SVC(), svm_parameters, cv=5)
svm_classifier.fit(X_train, Y_train)

In [13]:
svm_best_classifier = svm_classifier.best_estimator_
X_train_prediction = svm_best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Best SVM parameters:', svm_classifier.best_params_)
print('Accuracy score of the SVM training data: ', training_data_accuracy)

Best SVM parameters: {'C': 1, 'kernel': 'linear'}
Accuracy score of the SVM training data:  0.7866449511400652


In [14]:
X_test_prediction = svm_best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the SVM test data: ', test_data_accuracy)


Accuracy score of the SVM test data:  0.7727272727272727


In [15]:
rf_parameters = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
rf_classifier = GridSearchCV(RandomForestClassifier(), rf_parameters, cv=5)
rf_classifier.fit(X_train, Y_train)


In [16]:
rf_best_classifier = rf_classifier.best_estimator_
X_train_prediction = rf_best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Best Random Forest parameters:', rf_classifier.best_params_)
print('Accuracy score of the Random Forest training data: ', training_data_accuracy)

Best Random Forest parameters: {'max_depth': 10, 'n_estimators': 100}
Accuracy score of the Random Forest training data:  0.996742671009772


In [17]:
X_test_prediction = rf_best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the Random Forest test data: ', test_data_accuracy)


Accuracy score of the Random Forest test data:  0.7662337662337663


In [18]:
dt_parameters = {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
dt_classifier = GridSearchCV(DecisionTreeClassifier(), dt_parameters, cv=5)
dt_classifier.fit(X_train, Y_train)

In [19]:
dt_best_classifier = dt_classifier.best_estimator_
X_train_prediction = dt_best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Best Decision Tree parameters:', dt_classifier.best_params_)
print('Accuracy score of the Decision Tree training data: ', training_data_accuracy)


Best Decision Tree parameters: {'max_depth': 20, 'min_samples_split': 5}
Accuracy score of the Decision Tree training data:  0.9723127035830619


In [20]:
X_test_prediction = dt_best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the Decision Tree test data: ', test_data_accuracy)

Accuracy score of the Decision Tree test data:  0.7012987012987013


In [21]:
lr_parameters = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
lr_classifier = GridSearchCV(LogisticRegression(), lr_parameters, cv=5)
lr_classifier.fit(X_train, Y_train)


In [22]:
lr_best_classifier = lr_classifier.best_estimator_
X_train_prediction = lr_best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Best Logistic Regression parameters:', lr_classifier.best_params_)
print('Accuracy score of the Logistic Regression training data: ', training_data_accuracy)

Best Logistic Regression parameters: {'C': 1, 'solver': 'liblinear'}
Accuracy score of the Logistic Regression training data:  0.7833876221498371


In [23]:
X_test_prediction = lr_best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the Logistic Regression test data: ', test_data_accuracy)


Accuracy score of the Logistic Regression test data:  0.7597402597402597


In [31]:
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
std_data = scaler.transform(input_data_reshaped)
print(std_data)

# SVM prediction
svm_prediction = svm_best_classifier.predict(std_data)
print('SVM prediction:', svm_prediction)
if svm_prediction[0] == 0:
    print('The person is not diabetic (SVM)')
else:
    print('The person is diabetic (SVM)')

# Random Forest prediction
rf_prediction = rf_best_classifier.predict(std_data)
print('Random Forest prediction:', rf_prediction)
if rf_prediction[0] == 0:
    print('The person is not diabetic (Random Forest)')
else:
    print('The person is diabetic (Random Forest)')

# Decision Tree prediction
dt_prediction = dt_best_classifier.predict(std_data)
print('Decision Tree prediction:', dt_prediction)
if dt_prediction[0] == 0:
    print('The person is not diabetic (Decision Tree)')
else:
    print('The person is diabetic (Decision Tree)')

# Logistic Regression prediction
lr_prediction = lr_best_classifier.predict(std_data)
print('Logistic Regression prediction:', lr_prediction)
if lr_prediction[0] == 0:
    print('The person is not diabetic (Logistic Regression)')
else:
    print('The person is diabetic (Logistic Regression)')

[[ 0.3429808   1.41167241  0.14964075 -0.09637905  0.82661621 -0.78595734
   0.34768723  1.51108316]]
SVM prediction: [1]
The person is diabetic (SVM)
Random Forest prediction: [1]
The person is diabetic (Random Forest)
Decision Tree prediction: [1]
The person is diabetic (Decision Tree)
Logistic Regression prediction: [1]
The person is diabetic (Logistic Regression)


