In [5]:
# Pima Indian Diabetes dataset

# read the data into a Pandas DataFrame
import pandas as pd
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
pima = pd.read_csv(url, header=None, names=col_names)

In [8]:
# print the first 5 rows of data
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# Question: Can we predict diabetes status of a patient given their health mesurements?

In [11]:
# define X and y
feature_cols = ['pregnant', 'insulin', 'bmi', 'age']
X = pima[feature_cols]
y = pima.label

In [16]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [14]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
y_pred_class = logreg.predict(X_test)

In [25]:
# calculate accuracy
from sklearn import metrics
print metrics.accuracy_score(y_test, y_pred_class)

0.692708333333


In [26]:
# Null accuracy: accuracy that could be achieved by always predicting the most frequent class

In [28]:
# examine the class distribution of the testing set (using a Pandas Series method)
y_test.value_counts()

0    130
1     62
Name: label, dtype: int64

In [31]:
# calculate the percentage of ones
y_test.mean()

0.3229166666666667

In [33]:
# calculate the percentage of zeros
1 - y_test.mean()

0.6770833333333333

In [35]:
# calculate null accuracy(for binary classfication problems coded as 0/1)
max(y_test.mean(), 1-y_test.mean())
# not so useful model

0.6770833333333333

In [37]:
# calculate null accuracy (for binary classfication problems)
y_test.value_counts().head(1) / len(y_test)

0    0.677083
Name: label, dtype: float64

In [39]:
# Comparing the true and predicted response values
# print the first 24 true and predicted responses
print 'True', y_test.values[0:25]
print 'Pred', y_pred_class[0:25]

 True [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Pred [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [48]:
print y_test.value_counts().head(1)

0    130
Name: label, dtype: int64


In [49]:
# Confusion matrix
# Table that describes the performance of a classfication model
print metrics.confusion_matrix(y_test, y_pred_class)

[[118  12]
 [ 47  15]]


In [53]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]
print confusion

[[118  12]
 [ 47  15]]


In [56]:
# Metrics computed from a confusion matrix
# Classification Accuracy: Overall, how often is the classfier correct?
print (TP+TN) / float(TP + TN + FP + FN)
print metrics.accuracy_score(y_test, y_pred_class)

0.692708333333
0.692708333333


In [58]:
# Classification Error: Overall, how often is the classifier incorrect?
print (FP+FN) 

59
