In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
# Data collection and analysis
#PIMA Diabetes dataset

In [3]:
# Read the diabetes CSV from the current working directory into a DataFrame
# and preview its first rows.
diabetest_dataset = pd.read_csv('diabetes.csv')
diabetest_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
# (rows, columns) of the loaded frame.
diabetest_dataset.shape

(2000, 9)

In [5]:
# Summary statistics for every column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI -- presumably placeholders for
# missing measurements; confirm before modelling on them.
diabetest_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.7035,121.1825,69.1455,20.935,80.254,32.193,0.47093,33.0905,0.342
std,3.306063,32.068636,19.188315,16.103243,111.180534,8.149901,0.323553,11.786423,0.474498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.5,0.0,0.0,27.375,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,40.0,32.3,0.376,29.0,0.0
75%,6.0,141.0,80.0,32.0,130.0,36.8,0.624,40.0,1.0
max,17.0,199.0,122.0,110.0,744.0,80.6,2.42,81.0,1.0


In [6]:
# Number of rows per label value, to check class balance.
diabetest_dataset['Outcome'].value_counts()

Outcome
0    1316
1     684
Name: count, dtype: int64

In [7]:
# 0 --> Non-diabetic
# 1 --> Diabetic

In [8]:
# Mean of each feature column within each Outcome class.
diabetest_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.168693,110.586626,68.094985,20.052432,70.56383,30.567477,0.434676,31.081307
1,4.732456,141.568713,71.166667,22.633041,98.897661,35.320468,0.540681,36.95614


In [9]:
# separating the data and labels

In [10]:
# Separate the feature columns (x) from the label column (y).
# `columns=` already identifies the axis, so the extra `axis=1` was redundant
# and has been dropped; the resulting frame is the same.
x = diabetest_dataset.drop(columns='Outcome')
y = diabetest_dataset['Outcome']

In [11]:
# Display the feature frame (the notebook truncates the middle rows).
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2,138,62,35,0,33.6,0.127,47
1,0,84,82,31,125,38.2,0.233,23
2,0,145,0,0,0,44.2,0.630,31
3,0,135,68,42,250,42.3,0.365,24
4,1,139,62,41,480,40.7,0.536,21
...,...,...,...,...,...,...,...,...
1995,2,75,64,24,55,29.7,0.370,33
1996,8,179,72,42,130,32.7,0.719,36
1997,6,85,78,0,0,31.2,0.382,42
1998,0,129,110,46,130,67.1,0.319,26


In [12]:
# Display the label series.
y

0       1
1       0
2       1
3       1
4       0
       ..
1995    0
1996    1
1997    0
1998    1
1999    0
Name: Outcome, Length: 2000, dtype: int64

In [13]:
# data standardization

In [14]:
# Fit a StandardScaler on the feature frame and standardize it in one step.
# NOTE(review): the scaler is fit on ALL rows here, before the train/test
# split performed further down, so the held-out rows contribute to the fitted
# statistics. Consider fitting on the training split only and applying
# `scaler.transform` to the test split.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(x)
print(standardized_data)

[[-0.5153943   0.52455322 -0.37248123 ...  0.17268332 -1.06324616
   1.18042417]
 [-1.12049474 -1.1597562   0.67008046 ...  0.73724853 -0.7355513
  -0.85632626]
 [-1.12049474  0.74288962 -3.60442246 ...  1.47363794  0.49175869
  -0.17740945]
 ...
 [ 0.69480658 -1.12856529  0.46156812 ... -0.12187245 -0.27492362
   0.75610116]
 [-1.12049474  0.24383498  2.12966682 ...  4.28419085 -0.46968566
  -0.60173245]
 [-0.5153943  -1.25332895  0.14879962 ... -0.25687717  0.23516743
  -0.68659705]]


In [15]:
# Rebind x to the standardized values (it previously held the raw feature
# frame). y is assigned the same label column it already held from the
# earlier feature/label split cell.
x = standardized_data
y = diabetest_dataset['Outcome']

In [16]:
# Show the standardized features, then the labels, each on its own line.
print(x, y, sep='\n')

[[-0.5153943   0.52455322 -0.37248123 ...  0.17268332 -1.06324616
   1.18042417]
 [-1.12049474 -1.1597562   0.67008046 ...  0.73724853 -0.7355513
  -0.85632626]
 [-1.12049474  0.74288962 -3.60442246 ...  1.47363794  0.49175869
  -0.17740945]
 ...
 [ 0.69480658 -1.12856529  0.46156812 ... -0.12187245 -0.27492362
   0.75610116]
 [-1.12049474  0.24383498  2.12966682 ...  4.28419085 -0.46968566
  -0.60173245]
 [-0.5153943  -1.25332895  0.14879962 ... -0.25687717  0.23516743
  -0.68659705]]
0       1
1       0
2       1
3       1
4       0
       ..
1995    0
1996    1
1997    0
1998    1
1999    0
Name: Outcome, Length: 2000, dtype: int64


In [17]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable across runs.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [18]:
# Shapes of the full, training and test feature matrices.
print(x.shape,x_train.shape,x_test.shape)

(2000, 8) (1600, 8) (400, 8)


In [19]:
# training the model

In [20]:
# Support-vector classifier with a linear kernel.
classifier = svm.SVC(kernel='linear')

In [21]:
# training the svm classifier

In [22]:
# Fit the classifier on the training split.
classifier.fit(x_train,y_train)

In [23]:
# accuracy score on the training data
# accuracy_score is declared as accuracy_score(y_true, y_pred), so the true
# labels go first and the predictions second.

x_train_prediction = classifier.predict(x_train)
traing_data_accuracy = accuracy_score(y_train, x_train_prediction)

print("Accuracy score of the  traing data:",traing_data_accuracy)

Accuracy score of the  traing data: 0.775


In [24]:
# accuracy score on the test data

In [25]:
# Predict on the held-out split and score it.
# accuracy_score is declared as accuracy_score(y_true, y_pred), so the true
# labels go first and the predictions second.
x_test_prediction = classifier.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print('Accracy score of test data :',test_data_accuracy)

Accracy score of test data : 0.805


In [26]:
# making predictive system

In [27]:
# Raw feature values for a single patient; they must follow the same column
# order as the features the scaler and classifier were fitted on.
input_data = (7,195,70,33,145,25.1,0.163,55)

# convert the input_data tuple to a numpy array
input_data_as_np_array = np.array(input_data)
print('np_array:',input_data_as_np_array)

# reshape to a single row, because we are predicting for one instance
input_data_reshaped = input_data_as_np_array.reshape(1,-1)
print('Reshaped:',input_data_reshaped)

# standardize the input with the scaler fitted earlier in the notebook
std_data = scaler.transform(input_data_reshaped)
print('standardize:',std_data)

prediction = classifier.predict(std_data)
print('prediction:',prediction)

# Label 0 means non-diabetic, 1 means diabetic.
# The messages previously read 'The Preson is not diabetic' / 'The preson in
# diabetic'; they now match the wording used by the alternate prediction cell.
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

np_array: [7.00e+00 1.95e+02 7.00e+01 3.30e+01 1.45e+02 2.51e+01 1.63e-01 5.50e+01]
Reshaped: [[7.00e+00 1.95e+02 7.00e+01 3.30e+01 1.45e+02 2.51e+01 1.63e-01 5.50e+01]]
standardize: [[ 0.9973568   2.30243538  0.04454345  0.74941535  0.58249579 -0.87053501
  -0.95195356  1.85934098]]
prediction: [1]
The preson in diabetic




In [28]:
# or 

In [29]:

# One patient's values for the 8 feature columns
input_data = [7,195,70,33,145,25.1,0.163,55]

# Shape the sample as a single row and scale it with the `scaler` object
# that was fitted earlier in the notebook
input_data_reshaped = np.array([input_data])
std_data = scaler.transform(input_data_reshaped)

# Classify the scaled sample
prediction = classifier.predict(std_data)

# Label 0 means non-diabetic; anything else is reported as diabetic
print('The person is not diabetic' if prediction[0] == 0 else 'The person is diabetic')


The person is diabetic




# save the trained model

In [30]:
import pickle

In [31]:
# Serialize the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
# NOTE(review): only the classifier is written here; the fitted `scaler` is
# not saved, so any consumer of this file must obtain it some other way.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [32]:
# Load the saved model back from disk; the context manager closes the file
# handle once unpickling is done.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [35]:

# One patient's values for all 8 features
input_data = [7,195,70,33,145,25.1,0.163,55]

# Standardize the sample with the `scaler` fitted earlier in the notebook.
# The transform call used to be commented out, so this cell silently reused
# whatever `std_data` an earlier cell had left in the kernel and ignored the
# freshly built `input_data_reshaped`.
input_data_reshaped = np.array(input_data).reshape(1, -1)
std_data = scaler.transform(input_data_reshaped)

# Predict using the model loaded from disk
prediction = loaded_model.predict(std_data)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')


The person is diabetic
