In [21]:
import numpy as np

# Load the diabetes dataset. Every cell is read as a string first because
# the first row of the CSV holds the column names rather than numbers.
dataRaw = np.loadtxt("diabetes.csv", delimiter=',', dtype='object')

# Column names (first row of the file)
header = dataRaw[0]

# Feature matrix: every row after the header, first 8 columns, as float32
data = dataRaw[1:, :8].astype(np.float32)
print(data)

# Class labels: column index 8 of every row after the header, as an
# int32 column vector of shape (n_samples, 1)
labels = dataRaw[1:, 8].astype(np.int32).reshape(-1, 1)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


In [22]:
# Show the raw table exactly as loaded: all values are still strings and the
# first row is the CSV header (column names).
print(dataRaw)

[['Pregnancies' 'Glucose' 'BloodPressure' ... 'DiabetesPedigreeFunction'
  'Age' 'Outcome']
 ['6' '148' '72' ... '0.627' '50' '1']
 ['1' '85' '66' ... '0.351' '31' '0']
 ...
 ['5' '121' '72' ... '0.245' '30' '0']
 ['1' '126' '60' ... '0.349' '47' '1']
 ['1' '93' '70' ... '0.315' '23' '0']]


In [23]:
# Find the distinct class labels and how often each occurs.
# np.unique returns the labels sorted ascending; labelsCounts[k] is the number
# of samples whose label equals labelsUn[k].
labelsUn,labelsCounts = np.unique(labels,return_counts=True)
print(labelsUn,labelsCounts)

[0 1] [500 268]


In [25]:
# Dataset dimensions: one row per sample, one column per feature
nrows, ncols = data.shape
print(nrows)
print(ncols)

# Number of distinct class labels
nclasses = labelsUn.size
print(nclasses)

# Per-class summary tables, each of shape (nclasses, ncols)
average = np.zeros((nclasses, ncols))
maxi = np.zeros((nclasses, ncols))
mini = np.zeros((nclasses, ncols))
sd = np.zeros((nclasses, ncols))

# Fill the tables with the mean, max, min and standard deviation per class.
# NOTE: the statistics for label i are written to row i-1. With the 0/1 labels
# found above, label 0 therefore goes to row -1 (i.e. the LAST row) and
# label 1 goes to row 0.
for i in labelsUn:
    indexes = (labels == i).reshape(nrows)  # boolean mask selecting class i
    classData = data[indexes]
    average[i-1] = classData.mean(axis=0)
    maxi[i-1] = classData.max(axis=0)
    mini[i-1] = classData.min(axis=0)
    sd[i-1] = classData.std(axis=0)


for table in (average, maxi, mini, sd):
    print(table)

768
8
2
[[  4.86567163 141.25746155  70.82462311  22.16417885 100.33582306
   35.14253235   0.55049992  37.06716537]
 [  3.2980001  109.98000336  68.18399811  19.66399956  68.79199982
   30.30418587   0.42973423  31.19000053]]
[[ 17.         199.         114.          99.         846.
   67.09999847   2.42000008  70.        ]
 [ 13.         197.         122.          60.         744.
   57.29999924   2.329       81.        ]]
[[ 0.     0.     0.     0.     0.     0.     0.088 21.   ]
 [ 0.     0.     0.     0.     0.     0.     0.078 21.   ]]
[[  3.7342515   31.87997818  21.45167542  17.64668655 138.43019104
    7.24940491   0.3716591   10.94777298]
 [  3.01416397  26.11504173  18.0450058   14.87504959  98.7663269
    7.68216181   0.29878604  11.6559782 ]]


In [26]:
# Percentage of outliers per class and feature, where a value is an outlier
# when it lies outside the open interval (mean - 2*sd, mean + 2*sd) of its
# own class.
# More optimal (vectorised) ways are available, but nested for loops are used
# here on purpose for revision.
#
# Rows of outliers2sd use the same i-1 indexing as average/sd, so the three
# tables stay aligned row by row.

outliers2sd = np.zeros((nclasses,ncols))
for i in labelsUn:
    indexes = np.reshape(labels==i,nrows)
    classData = data[indexes,:]
    # Take the class size from the selected rows themselves. Looking it up as
    # labelsCounts[i-1] picks the count of a different class whenever i-1
    # wraps around (label 0 -> index -1).
    classSize = classData.shape[0]
    for j in range(ncols):
        thresholdLow = average[i-1,j]-2*sd[i-1,j]
        thresholdHigh = average[i-1,j]+2*sd[i-1,j]
        # Keep only values inside BOTH thresholds; everything else is an outlier.
        remain = [x for x in classData[:,j] if thresholdLow < x < thresholdHigh]
        outliers2sd[i-1,j] = 100 * (classSize - len(remain)) / classSize

In [29]:
# Write one CSV summary file per row of the statistics tables
decimals = 2
fmt = "%.2f"
formatf = ".csv"
# File name stem used for row i of the statistics tables
outcome = np.array(['diabetes','no_diabetes'])

# First column of every file: the feature names (header without the label
# column), shaped as a column vector of strings
rows = header[:-1].astype("U")[:, np.newaxis]

for i in range(len(labelsUn)):
    # One line per feature; columns are mean, min, max, std, outlier percentage
    stats = np.column_stack((average[i], mini[i], maxi[i], sd[i], outliers2sd[i]))
    stats_str = np.char.mod(fmt, np.round(stats, decimals))
    rowsf = np.hstack((rows, stats_str))
    headerf = [outcome[i], 'mean','min','max','std','outliers2sd%']
    np.savetxt(outcome[i] + formatf, np.vstack((headerf, rowsf)), delimiter=',', fmt='%s')