### import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

### data selection and analysis

In [2]:
df=pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.shape

(768, 9)

In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

###### 0 -> Non-diabetic  1-> Diabetic

In [7]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [9]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]
print(x.shape)
print(y.shape)
print(type(x))
print(type(y))

(768, 8)
(768,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


### Data Standardization

In [10]:
scaler= StandardScaler()

In [11]:
scaler.fit(x)

StandardScaler()

In [12]:
stanardized_data=scaler.transform(x)

In [13]:
print(stanardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [14]:
x=stanardized_data
y=df.iloc[:,-1]
print(x.shape)
print(y.shape)
print(type(x))
print(type(y))

(768, 8)
(768,)
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


### Split into tainset and testset

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(614, 8)
(154, 8)
(614,)
(154,)


In [16]:
def gen_metrics(ytest,ypred):
    cm=confusion_matrix(ytest,ypred)
    print('confusion matrix\n',cm)
    print('\nclassification report\n',classification_report(ytest,ypred))
    print('accuracy : ',accuracy_score(ytest,ypred))

### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
m1=LogisticRegression(max_iter=1000)
m1.fit(x_train,y_train)

LogisticRegression(max_iter=1000)

In [19]:
print('Training score: ',m1.score(x_train,y_train))
print('Testing score: ',m1.score(x_test,y_test))

Training score:  0.7768729641693811
Testing score:  0.7792207792207793


In [20]:
ypred_m1=m1.predict(x_test)
ypred_m1

array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
      dtype=int64)

In [21]:
gen_metrics(y_test,ypred_m1)

confusion matrix
 [[92 11]
 [23 28]]

classification report
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       103
           1       0.72      0.55      0.62        51

    accuracy                           0.78       154
   macro avg       0.76      0.72      0.73       154
weighted avg       0.77      0.78      0.77       154

accuracy :  0.7792207792207793


### making a predictive System

In [22]:
input_data=(4,110,92,0,0,37.6,0.191,30)

# changing the input data to numpy array
input_data_as_numpy_array=np.asarray(input_data)

#reshape the array as we are predicting for one instance(only one instance)
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)

#standardize input data
std_data=scaler.transform(input_data_reshaped)

print(std_data)

prediction=m1.predict(std_data)
print(prediction)


[[ 0.04601433 -0.34096773  1.18359575 -1.28821221 -0.69289057  0.71168975
  -0.84827977 -0.27575966]]
[0]


In [23]:
if(prediction[0]==0):
    print('The person is not diabetic')
else:
    print('The person is diabetic')

The person is not diabetic
