In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

  return f(*args, **kwds)


## remove outliers using the inter-quartile range

In [2]:
df = pd.read_csv('diabetes.csv')

In [3]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## insulin looks like it might have some outliers 
- mean, quartlies a lot smaller than the max value
 - let's try to remove these outliers to check if they help with model performance

## calculate the inter-quartile range
- In descriptive statistics, the interquartile range (IQR), also called the midspread or middle 50%, or technically H-spread, is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles, IQR = Q3 −  Q1. In other words, the IQR is the first quartile subtracted from the third quartile; these quartiles can be clearly seen on a box plot on the data. (https://en.wikipedia.org/wiki/Interquartile_range)

In [7]:
# calculate interquartile range
q25, q75 = np.percentile(df['Insulin'], 25), np.percentile(df['Insulin'], 75)
iqr = q75 - q25


In [8]:
iqr

127.25

## let's remove all observations that have insulin higher than 127.25

In [9]:
df2 = df[df['Insulin'] <= 127.25]

In [11]:
print('original df size:', df.shape)
print('new df size:', df2.shape)

original df size: (768, 9)
new df size: (576, 9)


## let's compare models

## baseline logistic regression model

In [22]:
X, y = df.loc[:, df.columns != 'Outcome'], df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Accuracy of logistic regression classifier on test set: 0.75


Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,97,11
1,28,18


## logistic regression on data without insulin outliers

In [24]:
X, y = df2.loc[:, df2.columns != 'Outcome'], df2['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Accuracy of logistic regression classifier on test set: 0.80


Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,5
1,18,12
