In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Data Analysis

In [3]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
data.shape

(768, 9)

In [6]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# replacing 0 to Non-Diabetic and 1 to Diabetic
data['Outcome'].replace(to_replace=[0,1],value=['Non-Diabetic','Diabetic'],inplace=True)

In [8]:
data['Outcome'].value_counts()

Non-Diabetic    500
Diabetic        268
Name: Outcome, dtype: int64

In [9]:
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Diabetic,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164
Non-Diabetic,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19


In [10]:
# splitting Data to independent and dependent set
X = data.drop(columns='Outcome',axis=1)
y = data['Outcome']

In [11]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [12]:
y

0          Diabetic
1      Non-Diabetic
2          Diabetic
3      Non-Diabetic
4          Diabetic
           ...     
763    Non-Diabetic
764    Non-Diabetic
765    Non-Diabetic
766        Diabetic
767    Non-Diabetic
Name: Outcome, Length: 768, dtype: object

## Standardization

In [13]:
SC = StandardScaler()
X = SC.fit_transform(X)
X

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [14]:
y

0          Diabetic
1      Non-Diabetic
2          Diabetic
3      Non-Diabetic
4          Diabetic
           ...     
763    Non-Diabetic
764    Non-Diabetic
765    Non-Diabetic
766        Diabetic
767    Non-Diabetic
Name: Outcome, Length: 768, dtype: object

In [15]:
# Train and Test Split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.20,stratify=y,random_state=2)

## Machine Learning Model using Support Vector Machine

In [16]:
SVM_Model = SVC(kernel='linear')
SVM_Model.fit(X_train,y_train)

SVC(kernel='linear')

## Model Evaluation

In [17]:
#  Model Accuracy on Train Data
X_train_pred = SVM_Model.predict(X_train)
train_prediction_score = accuracy_score(X_train_pred,y_train)
print(f'Train_data_prediction_score {(train_prediction_score*100).round(2)}%')

Train_data_prediction_score 76.22%


In [18]:
confusion_matrix(X_train_pred,y_train)

array([[111,  43],
       [103, 357]], dtype=int64)

In [19]:
#  Model Accuracy on Test Data
X_test_pred = SVM_Model.predict(X_test)
test_prediction_score = accuracy_score(X_test_pred,y_test)
print(f'Test_data_prediction_score {(test_prediction_score*100).round(2)}%')

Test_data_prediction_score 82.47%


In [20]:
confusion_matrix(X_test_pred,y_test)

array([[36,  9],
       [18, 91]], dtype=int64)

## Prediction System

In [21]:
new_data_1 = np.array([7,107,74,0,0,29.6,0.254,31])
new_data_1 = new_data_1.reshape(1,-1)
new_data_1 = SC.transform(new_data_1)
predict1 = SVM_Model.predict(new_data_1)
print(new_data_1)
print(f"The Patient is {predict1}")

[[ 0.93691372 -0.43485916  0.25303625 -1.28821221 -0.69289057 -0.30366421
  -0.65801229 -0.19067191]]
The Patient is ['Non-Diabetic']


In [22]:
new_data_2 = np.array([7,184,84,33,0,35.5,0.355,41])
new_data_2 = new_data_2.reshape(1,-1)
new_data_2 = SC.transform(new_data_2)
predict2 = SVM_Model.predict(new_data_2)
print(new_data_2)
print(f"The Patient is {predict2}")

[[ 0.93691372  1.97502103  0.77001375  0.7818138  -0.69289057  0.44515934
  -0.3529803   0.66020563]]
The Patient is ['Diabetic']


In [23]:
new_data_3 = np.array([0,113,80,16,0,31,0.874,21])
new_data_3 = new_data_3.reshape(1,-1)
new_data_3 = SC.transform(new_data_3)
predict3 = SVM_Model.predict(new_data_3)
print(new_data_3)
print(f"The Patient is {predict3}")

[[-1.14185152 -0.24707629  0.56322275 -0.28456324 -0.69289057 -0.12597727
   1.21446129 -1.04154944]]
The Patient is ['Non-Diabetic']


In [24]:
new_data_4 = np.array([0,134,58,20,291,26.4,0.352,21])
new_data_4 = new_data_4.reshape(1,-1)
new_data_4 = SC.transform(new_data_4)
predict4 = SVM_Model.predict(new_data_4)
print(new_data_4)
print(f"The Patient is {predict4}")

[[-1.14185152  0.41016376 -0.57412775 -0.03365099  1.83383214 -0.7098058
  -0.36204066 -1.04154944]]
The Patient is ['Non-Diabetic']


In [25]:
new_data_5 = np.array([0,123,72,0,0,36.3,0.258,52])
new_data_5 = new_data_5.reshape(1,-1)
new_data_5 = SC.transform(new_data_5)
predict5 = SVM_Model.predict(new_data_5)
print(new_data_5)
print(f"The Patient is {predict5}")

[[-1.14185152  0.06589516  0.14964075 -1.28821221 -0.69289057  0.54669473
  -0.64593181  1.59617091]]
The Patient is ['Non-Diabetic']


In [26]:
new_data_6 = np.array([11,143,94,33,146,36.6,0.254,51])
new_data_6 = new_data_6.reshape(1,-1)
new_data_6 = SC.transform(new_data_6)
predict6 = SVM_Model.predict(new_data_6)
print(new_data_6)
print(f"The Patient is {predict6}")

[[ 2.12477957  0.69183807  1.28699125  0.7818138   0.57481223  0.58477051
  -0.65801229  1.51108316]]
The Patient is ['Diabetic']


In [27]:
new_data_7 = np.array([1,189,60,23,846,30.1,0.398,59])
new_data_7 = new_data_7.reshape(1,-1)
new_data_7 = SC.transform(new_data_7)
predict7 = SVM_Model.predict(new_data_7)
print(new_data_7)
print(f"The Patient is {predict7}")

[[-0.84488505  2.13150675 -0.47073225  0.15453319  6.65283938 -0.24020459
  -0.2231152   2.19178518]]
The Patient is ['Diabetic']
