## Description:                                                                                                                   
- The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset.

## Attributes:
 1. Glucose Level
 2. BMI
 3. Blood pressure
 4. Pregnancies
 5. Skin thickness
 6. Insulin
 7. Diabetes pedigree function
 8. Age
 9. Outcome

# # Step 0: Import libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import pickle

In [2]:
dataset = pd.read_csv('diabetes.csv')

# # Step 2: Data Preprocessing

In [3]:
dataset_X = dataset.iloc[:,[1, 4, 5, 7]].values
dataset_Y = dataset.iloc[:,8].values

In [4]:
dataset_X

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

In [5]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0,1))
dataset_scaled = sc.fit_transform(dataset_X)

In [6]:
dataset_scaled = pd.DataFrame(dataset_scaled)

In [7]:
X = dataset_scaled
Y = dataset_Y

In [8]:
X

Unnamed: 0,0,1,2,3
0,0.743719,0.000000,0.500745,0.483333
1,0.427136,0.000000,0.396423,0.166667
2,0.919598,0.000000,0.347243,0.183333
3,0.447236,0.111111,0.418778,0.000000
4,0.688442,0.198582,0.642325,0.200000
...,...,...,...,...
763,0.507538,0.212766,0.490313,0.700000
764,0.613065,0.000000,0.548435,0.100000
765,0.608040,0.132388,0.390462,0.150000
766,0.633166,0.000000,0.448584,0.433333


In [9]:
Y

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42, stratify = dataset['Outcome'] )

# # Step 4: Data Modelling

In [11]:
from sklearn.svm import SVC
svc = SVC(kernel = 'linear', random_state = 42)
svc.fit(X_train, Y_train)

In [12]:
svc.score(X_test, Y_test)

0.7337662337662337

In [13]:
Y_pred = svc.predict(X_test)

## Step 5: Dump the pickle file

In [14]:
pickle.dump(svc, open('model.pkl','wb'))
model = pickle.load(open('model.pkl','rb'))

In [15]:
print(model.predict(sc.transform(np.array([[86, 66, 26.6, 31]]))))

[0]
