# **Importing Dependencies**

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # import the StandardScaler class from the sklearn.preprocessing module
from sklearn import svm
from sklearn.metrics import accuracy_score


# **Data Collection and Analysis**

In [None]:
# Location of the diabetes CSV.
# NOTE(review): this looks like a Colab-specific absolute path - adjust it when
# running the notebook elsewhere.
DIABETES_CSV_PATH = '/content/diabetes (1).csv'

# Read the CSV into the DataFrame that the following cells work with.
diabetes_dataset = pd.read_csv(DIABETES_CSV_PATH)

In [None]:
# Preview the first five rows of diabetes_dataset to sanity-check columns and values.
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# .shape is (rows, columns): the dataset has 768 rows and 9 columns.
diabetes_dataset.shape

(768, 9)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max), shown with one row
# per column so the wide table is easier to read.
diabetes_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [None]:
# Class balance of the label column 'Outcome': 0 = non-diabetic, 1 = diabetic.
# The dataset contains 500 non-diabetic rows and 268 diabetic rows.
outcome_counts = diabetes_dataset['Outcome'].value_counts()
outcome_counts

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


0 ---> Non-diabetic

1 ---> Diabetic

In [None]:
# Average of every feature per class, to compare non-diabetic (0) rows with
# diabetic (1) rows and see which features differ between the two groups.
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


The table above shows that, on average, people with diabetes have higher values for every measurement (glucose, blood pressure, skin thickness, insulin, BMI, etc.) than people without diabetes. Based on these differences, the ML model can learn to predict whether a person is diabetic or non-diabetic.

# **Separating the Data and Labels**

In [None]:
# Features: every column except the label.
X = diabetes_dataset.drop(columns=['Outcome'])
# Label: the 'Outcome' column (0 = non-diabetic, 1 = diabetic).
Y = diabetes_dataset['Outcome']

In [None]:
# Show the feature table (the 'Outcome' label column has been removed).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


# **Data Standardization**

* Data standardization is the process of scaling the features (input variables) of your dataset so that they have a mean of 0 and a standard deviation of 1

* When features are on very different scales (e.g., age ranging from 0-100 and income ranging from thousands to millions), models can become biased towards features with larger scales. Standardization ensures that all features contribute equally to the model by putting them on the same scale.

In [None]:
# Create an instance of the StandardScaler class imported at the top of the notebook.
# It is fitted and applied to the data in the following cells.
scaler = StandardScaler()

In [None]:
# Fit the scaler to X: this calculates the mean and standard deviation of each feature.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# further down, so test-set statistics influence the scaling (data leakage).
# Consider fitting on the training split only.
scaler.fit(X)

In [None]:
# Apply the fitted scaler to the original features X to get their standardized form.
standardized_data = scaler.transform(X)

In [None]:
# Inspect the standardized feature values.
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


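As mentioned in the definition above, standardization rescales each feature so that it has a mean of 0 and a standard deviation of 1 (it does not squeeze the values into the 0–1 range, which is why the output contains negative values and values above 1). Putting all features on a comparable scale makes the model easier to train and usually gives better performance.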

In [None]:
# From here on X holds the standardized features (replacing the raw feature
# DataFrame assigned earlier) and Y holds the 'Outcome' labels.
X, Y = standardized_data, diabetes_dataset['Outcome']

In the previous step we took the standardized data and assigned it to X, and assigned the Outcome column to Y (the Y assignment was already done earlier). The difference is that X now stores the standardized data instead of the raw values.

After standardization, the features in standardized_data are now in a form that is more suitable for many machine learning algorithms. These features, stored in X, will be used by the model to learn patterns in the data.

In [None]:
# Print the standardized features, then the labels, each starting on its own line.
print(X, Y, sep='\n')

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


# **Train Test Split**

In [None]:
# Hold out 20% of the rows as a test set.
# stratify=Y keeps the proportion of diabetic / non-diabetic rows the same in both
# splits; random_state=2 is an arbitrary seed - reuse the same number to reproduce
# exactly this split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Compare the shapes of the full feature matrix, the training split and the test split.
feature_sets = (X, X_train, X_test)
print(*(features.shape for features in feature_sets))

(768, 8) (614, 8) (154, 8)


# **Training the model**

In [None]:
# Create a support vector classifier with a linear kernel.
# No data is involved yet; the model is trained in the next cell.
classifier = svm.SVC(kernel='linear')

In [None]:
# Train the support vector machine classifier on the training features (X_train)
# and their labels (Y_train).
classifier.fit(X_train, Y_train)

# **Evaluating the Model**

# Accuracy Score

In [None]:
# Accuracy on the training data: predict labels for the rows the model was
# trained on, then compare them with the true labels.
X_train_prediction = classifier.predict(X_train)
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value is the
# same as before, but the conventional order keeps this correct if the call is
# copied for metrics where the order matters.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy computed on the training data.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7866449511400652


In [None]:
# Accuracy on the held-out test data: predict labels for rows the model did not
# see during training, then compare them with the true labels.
X_test_prediction = classifier.predict(X_test)
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value is the
# same as before, but the conventional order keeps this correct if the call is
# copied for metrics where the order matters.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy computed on the test data.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7727272727272727


# Making a predictive system


In [None]:
# Predict for a single hand-entered example. Values follow the order of the
# feature columns: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin,
# BMI, DiabetesPedigreeFunction, Age.
input_data = (4,110,92,0,0,37.6,0.191,30)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape to a single row (1, n_features) as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Standardize the input with the scaler fitted earlier. That scaler was fitted on
# a DataFrame, so pass the row with the same column names when they are available;
# a bare array would trigger scikit-learn's
# "X does not have valid feature names" warning.
feature_names = getattr(scaler, 'feature_names_in_', None)
if feature_names is not None:
    input_features = pd.DataFrame(input_data_reshaped, columns=feature_names)
else:
    # Scaler was fitted without column names: the plain array is what it expects.
    input_features = input_data_reshaped
std_data = scaler.transform(input_features)
print(std_data)

# Predict the class for the standardized row (0 = non-diabetic, 1 = diabetic).
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[[ 0.04601433 -0.34096773  1.18359575 -1.28821221 -0.69289057  0.71168975
  -0.84827977 -0.27575966]]
[0]
The person is not diabetic


