Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

PIMA Diabetes Dataset

In [7]:
# loading the diabetes dataset to a pandas DataFrame
pregnancy_dataset = pd.read_csv('./pregnancy.csv') 

In [53]:
pregnancy_dataset=pregnancy_dataset.drop(columns = 'PatientID', axis=1)

In [55]:
# printing the first 5 rows of the dataset
pregnancy_dataset.head()

Unnamed: 0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
0,37,5,119,73,35,202,23.7,2.11,0.0
1,38,4,148,88,44,234,21.7,2.47,1.0
2,33,4,150,86,10,274,24.6,2.25,1.0
3,25,8,154,91,20,267,25.3,0.87,1.0
4,28,7,95,84,35,154,20.1,0.26,0.0


In [57]:
# number of rows and Columns in this dataset
pregnancy_dataset.shape

(1000, 9)

In [59]:
# getting the statistical measures of the data
pregnancy_dataset.describe()

Unnamed: 0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,35.265,4.47,121.658,80.957,29.382,146.935,26.7091,1.29857,0.3
std,8.71118,2.902331,18.022204,6.241483,11.435333,74.947354,4.745795,0.704578,0.458487
min,21.0,0.0,90.0,70.0,10.0,15.0,18.5,0.11,0.0
25%,27.0,2.0,107.0,76.0,20.0,84.0,22.7,0.6775,0.0
50%,36.0,5.0,122.0,81.0,29.0,149.0,26.5,1.3,0.0
75%,43.0,7.0,136.0,86.0,39.0,211.0,30.9,1.92,1.0
max,49.0,9.0,159.0,94.0,49.0,275.0,35.0,2.5,1.0


In [61]:
pregnancy_dataset['Outcome'].value_counts()

Outcome
0.0    700
1.0    300
Name: count, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [64]:
pregnancy_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,35.204286,4.491429,118.004286,79.435714,29.405714,148.594286,26.725429,1.285957
1.0,35.406667,4.42,130.183333,84.506667,29.326667,143.063333,26.671,1.328


In [66]:
# separating the data and labels
X = pregnancy_dataset.drop(columns = 'Outcome', axis=1)
Y = pregnancy_dataset['Outcome']

In [68]:
print(X)

     Age  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0     37            5      119             73             35      202  23.7   
1     38            4      148             88             44      234  21.7   
2     33            4      150             86             10      274  24.6   
3     25            8      154             91             20      267  25.3   
4     28            7       95             84             35      154  20.1   
..   ...          ...      ...            ...            ...      ...   ...   
995   37            5      145             77             42      260  24.9   
996   48            8      116             88             36       57  29.8   
997   39            8      103             84             20      125  34.6   
998   28            0      145             75             33      215  20.0   
999   23            6      103             71             49      231  31.8   

     DiabetesPedigreeFunction  
0                  

In [70]:
print(Y)

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
995    0.0
996    1.0
997    0.0
998    0.0
999    0.0
Name: Outcome, Length: 1000, dtype: float64


Train Test Split

In [73]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [75]:
print(X.shape, X_train.shape, X_test.shape)

(1000, 8) (800, 8) (200, 8)


Training the Model

In [78]:
classifier = svm.SVC(kernel='linear')

In [80]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [84]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [86]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.76375


In [88]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [90]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.74


Making a Predictive System

In [95]:
input_data = (28,7,95,84,35,154,20.1,0.26)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = classifier.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person does not have pregnancy complication')
else:
  print('The person has pregnancy complication')

[0.]
The person does not have pregnancy complication




Saving the trained model

In [98]:
import pickle

In [100]:
filename = 'pregnancy_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [102]:
# loading the saved model
loaded_model = pickle.load(open('pregnancy_model.sav', 'rb'))

In [104]:
for column in X.columns:
  print(column)

Age
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
