In [1]:
import pandas as pd
from pandas import set_option

In [2]:
# Load diabetes.csv (expected in the working directory) into a DataFrame.
pima_df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Keep a handle on the column index and display it as the cell output.
pima_df_cols = pima_df.columns
pima_df_cols

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
# Preview the first five rows.
pima_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Display settings: wider console output and three decimal places.
# The option key must be fully qualified; the bare 'precision' pattern is
# ambiguous on current pandas and raises OptionError.
set_option('display.width', 100)
set_option('display.precision', 3)

# Pairwise Pearson correlation between all columns (returns a DataFrame).
correlations = pima_df.corr(method='pearson')
print(type(correlations))

# Rank every column by its correlation with the target, strongest first.
print(correlations['Outcome'].sort_values(ascending=False))

<class 'pandas.core.frame.DataFrame'>
Outcome                     1.000
Glucose                     0.467
BMI                         0.293
Age                         0.238
Pregnancies                 0.222
DiabetesPedigreeFunction    0.174
Insulin                     0.131
SkinThickness               0.075
BloodPressure               0.065
Name: Outcome, dtype: float64


In [6]:
# Check whether the classes are balanced.
# There are roughly twice as many non-diabetic patients (0) as diabetic ones (1),
# so the dataset is imbalanced.
pima_df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [9]:
# Count missing values per column. isnull().sum() yields a pandas Series
# indexed by column name; it is computed once and reused below.
# No column reports any null entries.
# NOTE(review): several columns contain literal zeros in the rows previewed
# earlier (e.g. Insulin, SkinThickness); these may be placeholders for missing
# measurements - confirm before treating the data as complete.
temp = pima_df.isnull().sum()
print(type(temp))
print(temp)

<class 'pandas.core.series.Series'>
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [21]:
# Report the missing-value count for every column.
# Iterate (label, value) pairs rather than indexing with temp[i]: an integer key
# on a string-labelled Series relies on a deprecated positional fallback.
for column_name, missing_count in temp.items():
    print("Column {} has {} missing values".format(column_name, missing_count))

Column Pregnancies has 0 missing values
Column Glucose has 0 missing values
Column BloodPressure has 0 missing values
Column SkinThickness has 0 missing values
Column Insulin has 0 missing values
Column BMI has 0 missing values
Column DiabetesPedigreeFunction has 0 missing values
Column Age has 0 missing values
Column Outcome has 0 missing values


In [19]:
# Label of the first entry in the missing-value Series.
temp.index[0]

'Pregnancies'

In [23]:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings from the libraries used here -
# confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [25]:
# Separate the target column from the first eight columns (the features).
outcome = pima_df['Outcome']
data = pima_df.loc[:, pima_df.columns[:8]]

# Hold out 25% of the rows for testing. Stratifying on Outcome keeps the class
# ratio the same in both splits; random_state fixes the shuffle.
train, test = train_test_split(
    pima_df,
    test_size=0.25,
    random_state=0,
    stratify=outcome,
)

# Feature matrices and target vectors for each split.
train_X, train_Y = train.loc[:, train.columns[:8]], train['Outcome']
test_X, test_Y = test.loc[:, test.columns[:8]], test['Outcome']

In [26]:
# Peek at the first two training feature rows.
train_X.head(n=2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
432,1,80,74,11,60,30.0,0.527,22
453,2,119,0,0,0,19.6,0.832,72


In [27]:
# Peek at the matching first two training labels.
train_Y.head(n=2)

432    0
453    0
Name: Outcome, dtype: int64

In [41]:
# Fit one SVM per kernel, store each fitted model under "svm_<kernel>" and
# report its accuracy on the held-out test split.
types = ['rbf', 'linear']
trained_models = {}
for kernel in types:
    model = svm.SVC(kernel=kernel)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    trained_models["svm_" + kernel] = model
    # accuracy_score takes (y_true, y_pred), in that order.
    print('Accuracy for SVM kernel=', kernel, 'is', metrics.accuracy_score(test_Y, prediction))

Accuracy for SVM kernel= rbf is 0.6510416666666666
Accuracy for SVM kernel= linear is 0.7708333333333334


In [44]:
# Show the fitted linear-kernel SVM and its hyperparameters.
print(trained_models['svm_linear'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)


In [35]:
import pickle

In [36]:
# Persist the linear-kernel SVM to disk.
# The model is read from trained_models; the previous reference (svm_models)
# is not defined anywhere in this notebook and fails on a fresh kernel.
with open('svmlinear_trained_model.pkl', 'wb') as f:
    pickle.dump(trained_models['svm_linear'], f)

In [39]:
# Reload the pickled model so its accuracy can be compared with the original.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('svmlinear_trained_model.pkl', 'rb') as model_file:
    check_model = pickle.load(model_file)

In [40]:
# Score the reloaded model's own predictions. Scoring the earlier `prediction`
# variable would not exercise check_model at all.
check_prediction = check_model.predict(test_X)
print('Accuracy for Model Checking = ', metrics.accuracy_score(test_Y, check_prediction))
# A value equal to the linear-kernel SVM accuracy reported during training
# confirms the pickle round-trip preserved the model.

Accuracy for Model Checking =  0.7708333333333334


In [45]:
# Train further models and register them in trained_models so they can be
# pickled later under their own names.
# Logistic regression:
model = LogisticRegression()
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
trained_models['logisticregression'] = model
# accuracy_score takes (y_true, y_pred), in that order.
print('The accuracy of the Logistic Regression is', metrics.accuracy_score(test_Y, prediction))

The accuracy of the Logistic Regression is 0.7760416666666666


In [46]:
# Decision tree.
# random_state is fixed (matching the train/test split) so that re-running the
# notebook gives the same tree; the reported accuracy may therefore differ
# slightly from an earlier unseeded run.
model = DecisionTreeClassifier(random_state=0)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
trained_models['decisiontree'] = model
# accuracy_score takes (y_true, y_pred), in that order.
print('The accuracy of the Decision Tree is', metrics.accuracy_score(test_Y, prediction))

The accuracy of the Decision Tree is 0.7604166666666666


In [47]:
# List the names of all models trained so far.
print(trained_models.keys())

dict_keys(['svm_rbf', 'svm_linear', 'logisticregression', 'decisiontree'])


In [52]:
# Write every trained model to "<model name>.pkl" in the working directory.
for model_name, fitted_model in trained_models.items():
    with open(model_name + '.pkl', 'wb') as pickle_file:
        pickle.dump(fitted_model, pickle_file)