In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
#https://youtu.be/xUE7SjVx9bQ

# Data Collection and Analysis Section

 Pregnancies = # of times pregnant <br>
 Glucose = plasma glucose concentration at 2 hours in an oral glucose tolerance test <br>
 BloodPressure = Diastolic Blood Pressure(mm Hg) <br>
 SkinThickness = Triceps skin fold thickness(mm) <br>
 Insulin = 2-hr serum insulin (mu U/mL)<br>
 BMI = body mass index(weight in kg/(height in m)^2)<br>
 DiabetesPedigreeFunction = information about diabetes history in relatives, higher value = higher chance of having diabetes<br>
 Age = Age of person in years <br>
 Outcome = 0 for non-diabetic, 1 for diabetic <br>

In [4]:
# Load the diabetes dataset from the CSV file into a DataFrame.
diabetes = pd.read_csv("diabetes.csv")
# Preview the available variables by showing only the first 5 rows.
# (diabetes.shape gives the size of the data set: 768 rows (observations)
# by 9 columns (parameters).)
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# General statistical measures for every column of the data.
# Only the final expression of a cell is rendered automatically, so the
# summary table must be displayed explicitly; otherwise its result is
# computed and silently thrown away. `display` is provided by the
# IPython/Jupyter kernel, so no import is needed inside a notebook.
display(diabetes.describe())
# Class balance: how many people have diabetes vs. don't.
# 500 people don't have diabetes (0) and 268 have diabetes (1).
# Typically we would want thousands of observations to train a model, but
# this is sufficient since we are practicing.
diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
# Compare the two groups: average value of every parameter for people
# without diabetes (Outcome 0) and with diabetes (Outcome 1).
diabetes.groupby(by="Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [11]:
# Labels: the Outcome column on its own, i.e. what the model has to guess.
Y = diabetes["Outcome"]
# Features: everything except Outcome, so the model cannot see the answer.
# `columns=` already targets the column axis, so no axis argument is needed.
X = diabetes.drop(columns="Outcome")

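Now we have to standardize the data so that every feature is on the same scale (mean 0, standard deviation 1). The features are measured in different units and ranges; putting them on a common scale makes them easier to compare, makes training easier for the machine learning model, and allows it to make better predictions.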

In [15]:
# Standardize the features so that all columns end up in a similar range.
#
# The scaler is fit on the raw feature columns taken straight from `diabetes`
# instead of on `X`: a later cell rebinds `X` to the standardized array, so
# re-running this cell against `X` would fit the scaler on already-scaled
# values and silently break the scaling of any new raw input. On a fresh
# top-to-bottom run both give exactly the same result.
raw_features = diabetes.drop(columns='Outcome')
scaler = StandardScaler()
scaler.fit(raw_features)
# NOTE(review): the scaler is fit on every row, including the rows that are
# later held out as test data by train_test_split, so the test set leaks into
# the scaling statistics. Consider splitting first and fitting on the
# training rows only.
# transform the data now
standardized_data = scaler.transform(raw_features)
print(standardized_data)  # all of the values are in a similar range

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [18]:
#use the standardized data as X from here on
#NOTE: this rebinds X from the raw feature table to the standardized array, so cells that expect the raw X should not be re-run after this one
X = standardized_data
#Now we will use X and Y to train the model
#X = represents the data
#Y = represents the label

Now we will split the data into training and test data

# Split the data into four pieces:
#   X_train / Y_train - features and labels the model is trained on
#   X_test  / Y_test  - features and labels held back to evaluate the model
#                       once it has been trained
# Arguments:
#   X, Y            - the data and its labels
#   test_size=0.2   - 20% of the data goes to the test set, 80% to training
#   stratify=Y      - keep the same proportion of each outcome in both the
#                     train and the test set as in the original data
#   random_state=2  - seed for the shuffling, so the exact same split is
#                     reproduced every time the code is run
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
X_train, X_test, Y_train, Y_test = split_parts
# 614 observations are used for the training and 154 for the test.
print(X.shape, X_train.shape, X_test.shape)

Training the Model

In [25]:
# Support-vector classifier using a linear kernel.
classifier = svm.SVC(kernel="linear")
# Train the classifier on the training features and their labels.
classifier.fit(X_train, Y_train)

SVC(kernel='linear')

Evaluate the model

In [30]:
# Accuracy score on the training data.
# Predict the labels for all the X_train rows and keep them in their own
# variable so they can be compared with the true labels.
X_train_prediction = classifier.predict(X_train)
# accuracy_score expects (y_true, y_pred): true labels first, predictions
# second. Accuracy happens to be symmetric, but keeping the documented order
# avoids wrong results if another metric is swapped in later.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy Score of the training data:', training_data_accuracy)

# Accuracy score on the test data: how well the model performs on data it
# has not been trained on.
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy Score of the testing data:', testing_data_accuracy)
# Anything above 70% is pretty good for this exercise.
# The two scores are close, which is evidence that the model has not
# overfit: overfitting would show up as a very high accuracy on the training
# data combined with a low accuracy on the test data.




Accuracy Score of the training data: 0.7866449511400652
Accuracy Score of the testing data: 0.7727272727272727


Making a predictive system 

In [38]:
# New data for one person; the model will predict whether this person has
# diabetes or not. Values are given in the same order as the feature columns.
input_data = (5,166,72,19,175,25.8,0.587,51)

# Change the input to a numpy array.
input_data_as_na = np.asarray(input_data)

# The scaler and the model expect a 2-D input of shape (n_samples,
# n_features); a flat 1-D array is rejected. reshape(1, -1) turns the values
# into a single row, i.e. exactly one instance to predict.
# The row is wrapped in a DataFrame carrying the same column names the scaler
# was fit with, so the values are matched to the right features (and
# scikit-learn does not warn about missing feature names). A wrong number of
# input values fails here with a clear error.
feature_names = diabetes.columns.drop('Outcome')
input_data_reshaped = pd.DataFrame(
    input_data_as_na.reshape(1, -1), columns=feature_names
)

# Standardize the new data with the already-fitted scaler, because the model
# was trained on standardized data.
std_data = scaler.transform(input_data_reshaped)

# Feed the standardized row into the model and make the prediction.
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic
