### Import the dependencies

In [186]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

### Loading the dataset

#### PIMA diabetes dataset (kaggle/UCI) (females dataset)

In [187]:
# Read the diabetes CSV into a DataFrame; the path is relative to the notebook's working directory
diab_dst = pd.read_csv('diabetesCopy.csv')

# uncomment to preview the first rows: diab_dst.head()

In [188]:
# number of rows and columns in the dataset, as a (rows, columns) tuple

diab_dst.shape

(799, 9)

In [189]:
# summary statistics (count, mean, std, min, quartiles, max) for every numeric column

diab_dst.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,799.0,799.0,799.0,799.0,799.0,799.0,799.0,799.0,799.0
mean,3.994994,120.986233,69.261577,21.554443,90.598248,32.103905,0.501014,33.95995,0.355444
std,3.519722,32.525118,20.014667,17.30963,134.005013,8.552623,0.367231,12.630508,0.478948
min,0.0,0.0,0.0,0.0,0.0,0.0,0.08,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.15,0.25,24.0,0.0
50%,3.0,117.0,72.0,23.0,44.0,32.0,0.39,29.0,0.0
75%,6.0,141.0,80.0,33.0,135.0,36.8,0.66,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [190]:
# value counts
# tells us that 515 patients are non-diabetic (Outcome 0) and 284 patients are diabetic (Outcome 1) in the dataset

diab_dst['Outcome'].value_counts()

Outcome
0    515
1    284
Name: count, dtype: int64

In [191]:
# building some insights
# comparing the mean of every feature between the two Outcome groups shows
# which measurements tend to be higher for the diabetic group

diab_dst.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.460194,110.768932,68.347573,20.341748,77.452427,30.500388,0.454214,31.640777
1,4.964789,139.514085,70.919014,23.753521,114.43662,35.01169,0.58588,38.165493


In [192]:
# split the frame into the target labels (the 'Outcome' column) and the
# feature matrix (every remaining column)

Y = diab_dst['Outcome']
X = diab_dst.drop(columns='Outcome')   # columns= already selects the column axis, so no axis argument is needed

### Data standardization

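#### Standardisation makes the data easier for machine learning models to learn from. The features in our dataset have very different ranges: for example, Glucose ranges from 0 to 199 while BMI ranges from 0 to 67.1. To bring the features onto a common scale we fit scikit-learn's StandardScaler on the data and transform it. Note that the scaling cells below are currently commented out, so the model in this notebook is trained on the unscaled features.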

In [193]:
# # taking one instance of the standard scaler function

# scaler = StandardScaler()

In [194]:
# # standardising the X dataset we have created
# # it rescales every feature to zero mean and unit variance (not to a fixed 0 to 1 range)
# # here we call scaler.fit(X) and then scaler.transform(X); the single call scaler.fit_transform(X) would do both steps at once

# scaler.fit(X)

# standardised_data = scaler.transform(X)

In [195]:
# standardised_data

In [196]:
# X_std = standardised_data

In [197]:
# X
# Y
# X_std

### Splitting the data into training data and testing data

In [198]:
# hold out 30% of the rows as a test set; the fixed random_state makes the split
# repeatable and stratify=Y is passed so the split is stratified on the labels
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.3, random_state=4
)

In [199]:
# sanity check of the split: feature shapes (full, train, test) followed by label shapes (train, test, full)
print(X.shape, x_train.shape, x_test.shape, y_train.shape, y_test.shape, Y.shape)

(799, 8) (559, 8) (240, 8) (559,) (240,) (799,)


## Training the model

In [200]:
# creating a classifier
# SVC means Support Vector Classifier; it is configured here with a degree-3 polynomial kernel
# NOTE(review): the standardisation cells above are commented out, so this model is fitted on unscaled features; confirm this is intended

classifier = svm.SVC(kernel='poly',degree=3)

In [201]:
# Training the SVM classifier on the training split only (x_train / y_train)

classifier.fit(x_train, y_train)

### Evaluating the model

#### Accuracy Score of the model



In [202]:
# accuracy score on the training data

# predict a label for every row of x_train
x_train_prediction = classifier.predict(x_train)

# fraction of training rows whose predicted label matches y_train;
# accuracy_score is documented as (y_true, y_pred), so the true labels go first
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [203]:
# rule of thumb used in this notebook: an accuracy above 0.75 is considered good; it may be improved further by other means
print(training_data_accuracy)

0.7692307692307693


In [204]:
# accuracy score on the held-out test data

# predict a label for every row of x_test
x_test_prediction = classifier.predict(x_test)

# accuracy_score is documented as (y_true, y_pred), so the true labels go first
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [205]:
# accuracy on the test split, to compare against the training accuracy
print(test_data_accuracy)

0.7458333333333333


In [206]:
# the training accuracy (about 0.77) and the test accuracy (about 0.75) are close, so the model does not appear to be overfitting:
# it has not simply memorised the training data and performs similarly on unseen test data
# a large gap between training_data_accuracy and test_data_accuracy would indicate overfitting

### Making a predictive system

In [207]:
# Build one sample and predict its label.
# The classifier was fitted on rows of 8 feature columns, so a single sample
# must be reshaped to (1, 8): one row, eight features. Without the reshape the
# 1-D array would not be accepted as a single row.
# NOTE: the standardisation cells are commented out, so the model was fitted on
# raw feature values and the input must not be scaled here either.

input_data = (1,103,30,38,83,43.3,0.183,33)
arrayed_data = np.asarray(input_data)

reshaped_data = arrayed_data.reshape(1,-1)

# the model was fitted on a DataFrame, so attach the training column names to
# the sample; predicting on a bare array makes scikit-learn warn that the
# input has no valid feature names
sample_frame = pd.DataFrame(reshaped_data, columns=x_train.columns)

# predicting the y value
prediction = classifier.predict(sample_frame)
print(prediction)

# Outcome 0 is the non-diabetic class, anything else is treated as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[0]
The person is not diabetic




## Saving the trained model

In [208]:
import pickle

In [209]:
filename = 'trained_model.sav'

# serialise the trained classifier to disk in binary form; the with-block
# closes the file handle even if pickle.dump raises, so the data is flushed
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [210]:
# loading the saved model using pickle.load
# NOTE: unpickling can execute arbitrary code, so only load model files you created yourself

# the with-block closes the file handle once the model has been read
with open('trained_model.sav','rb') as model_file:
    load_model = pickle.load(model_file)