In [19]:
import sys
import os
import pandas as pd
import numpy as np
import pickle
from collections import Counter,defaultdict

from sklearn.model_selection import train_test_split
from scipy.stats import kurtosis,iqr
from scipy import fftpack as fft
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score,f1_score

In [20]:
class Patient():
    """Feature extractor for a table of CGM series.

    ``cgm`` is a DataFrame with one series per row and one sample per column
    (all windowed features below run along the columns of each row).

    Earlier experiments that used to live here as commented-out code
    (velocity, whole-row kurtosis, IQR, time-normalised difference, and
    dropping rows with >= 30% missing values) are not part of the feature
    matrix.
    """

    def __init__(self,cgm):
        self.cgm = cgm

    def preprocess(self):
        """Clean ``self.cgm`` and rebind it to the cleaned float64 frame.

        Steps: keep the first 30 columns, reset the row index, fill gaps by
        5th-order polynomial interpolation, then back-fill / forward-fill
        whatever is still missing. The frame originally passed to the
        constructor is not modified.
        """
        # Drop everything after column 30: the last column has many missing
        # values for all patients.
        cgm = self.cgm.iloc[:, :30].reset_index(drop=True)

        # NOTE(review): interpolate() uses its default axis here, i.e. it fits
        # down each column (across rows), not along each row's series --
        # confirm this is the intended direction.
        cgm = cgm.interpolate(method='polynomial', order=5)

        # Interpolation cannot fill leading/trailing gaps; pad those.
        cgm = cgm.bfill().ffill()
        self.cgm = cgm.astype('float64')

    def _rolling_feature(self, func, win, olap, prefix):
        """Apply ``func`` over sliding windows of ``win`` samples along each row.

        Only complete windows are kept, and of those every ``olap``-th one.
        Returned columns are named ``<prefix>1``, ``<prefix>2``, ...
        """
        # rolling(axis=1) is deprecated in pandas >= 2.1, so transpose, roll
        # down the rows, and transpose back instead.
        rolled = self.cgm.T.rolling(window=win).apply(func).T
        df = rolled.dropna(axis=1).iloc[:, ::olap]
        df.columns = [prefix + str(i) for i in range(1, df.shape[1] + 1)]
        return df

    def fft(self,):
        """Return the real FFT of the first 5 samples of every row.

        ``n=5`` truncates each row to 5 samples; the 5 output columns
        (``fft1`` .. ``fft5``) are in scipy.fftpack's packed real layout.
        """
        # `fft` below is the module-level scipy.fftpack alias, not this method.
        ndarr = fft.rfft(self.cgm.to_numpy(), n=5, axis=1)
        df = pd.DataFrame(data=ndarr)
        df.columns = ['fft' + str(i) for i in range(1, df.shape[1] + 1)]
        return df

    def rolling_mean(self,win,olap):
        """Windowed mean along each row; columns ``rm1``, ``rm2``, ..."""
        return self._rolling_feature(np.mean, win, olap, 'rm')

    def kurtosis(self,win,olap):
        """Windowed kurtosis (scipy.stats.kurtosis) along each row; columns ``kt1``, ..."""
        # `kurtosis` below is the module-level scipy.stats function.
        return self._rolling_feature(kurtosis, win, olap, 'kt')

    def stdev(self,win,olap):
        """Windowed standard deviation (np.std) along each row; columns ``st1``, ..."""
        return self._rolling_feature(np.std, win, olap, 'st')

    def diff(self,win,olap):
        """Return, per row, the row maximum minus the sample at position 24.

        ``win`` and ``olap`` are accepted for signature parity with the other
        feature methods but are not used. The result is a Series named
        ``"diff"``.
        """
        # Explicit positional access: a bare x[24] on a string-labelled row
        # depends on a deprecated positional fallback in pandas.
        df = self.cgm.max(axis=1) - self.cgm.iloc[:, 24]
        df.name = "diff"
        return df

    def featureMatrix(self):
        """Preprocess the data and return all features side by side.

        Column order: fft, rolling mean, rolling std, rolling kurtosis
        (each with window 10, step 5), then the single ``diff`` column.
        """
        self.preprocess()
        parts = [
            self.fft(),
            self.rolling_mean(10, 5),
            self.stdev(10, 5),
            self.kurtosis(10, 5),
            self.diff(10, 5),
        ]
        return pd.concat(parts, axis=1)

In [21]:
# Column labels for the 31 CGM samples per row: "cgmSeries_ 1" ... "cgmSeries_31"
# (single-digit indices are space-padded to width 2).
cnames = [f"cgmSeries_{sample:2d}" for sample in range(1, 32)]

In [22]:
# Input CSV to classify. The path is fixed here; reading it from the
# command line (sys.argv[1]) is currently disabled.
fname = "proj3_test.csv"

In [23]:
# Read the raw CGM table; explicit column names are supplied, so the first
# line of the CSV is read as data rather than as a header.
raw_cgm = pd.read_csv(fname, names=cnames)

# Build the per-row feature matrix that the remaining cells refer to as `obj`.
obj = Patient(raw_cgm).featureMatrix()



In [24]:
# Load the previously fitted PCA transformer.
# The context manager closes the handle even if unpickling raises.
# SECURITY: pickle.load executes arbitrary code from the file -- only load
# a pca.pkl that comes from a trusted source.
with open('pca.pkl','rb') as file:
    p = pickle.load(file)

In [25]:
# Standardise every feature column.
# NOTE(review): the scaler is fit on this data set itself rather than loaded
# from training -- confirm this matches how the pickled PCA was fit.
stdscaler = StandardScaler()
scaled_features = pd.DataFrame(stdscaler.fit_transform(obj))

# Columns 0-19 go through the loaded PCA; column 20 is appended unprojected.
pca_block = pd.DataFrame(p.transform(scaled_features.iloc[:, :20]))
passthrough = scaled_features.iloc[:, 20]
obj = pd.concat([pca_block, passthrough], axis=1)

In [26]:
# Sanity check of the projected feature matrix; the recorded run shows (51, 6),
# i.e. the same 6 feature columns that are read from clusters.csv below.
obj.shape

(51, 6)

In [27]:
# clusters.csv as consumed here: columns 0-5 are the training features,
# column 6 and column 7 are per-row labels (presumably the KMeans and DBSCAN
# cluster ids, going by the variable names).
clusters = np.loadtxt("clusters.csv", delimiter=",")
train_features, train_kmeans, train_dbscan = (
    clusters[:, :6],
    clusters[:, 6],
    clusters[:, 7],
)

In [28]:
# KNN (k=5): every test row takes the majority label among its k nearest
# training rows (Euclidean distance), separately for the two label columns.
k=5


def _k_nearest_indices(queries, references, k):
    """Return an (n_queries, k) array of indices of the closest reference rows.

    Distances are Euclidean. The sort is stable, so equidistant references
    are ordered by their position in ``references``.
    """
    queries = np.asarray(queries, dtype=float)
    references = np.asarray(references, dtype=float)
    # Broadcast to an (n_queries, n_references) distance matrix in one shot
    # instead of growing an array with np.append inside nested loops.
    dist = np.linalg.norm(queries[:, None, :] - references[None, :, :], axis=2)
    return np.argsort(dist, axis=1, kind="stable")[:, :k]


def _majority_label(labels):
    """Most frequent label; on a tie the label seen first (nearest neighbour) wins."""
    return Counter(labels.tolist()).most_common(1)[0][0]


_neighbor_idx = _k_nearest_indices(obj, train_features, k)
test_kmeans = np.array(
    [_majority_label(train_kmeans[row]) for row in _neighbor_idx], dtype=float
)
test_dbscan = np.array(
    [_majority_label(train_dbscan[row]) for row in _neighbor_idx], dtype=float
)

In [29]:
# Two-column prediction table: DBSCAN label first, KMeans label second,
# written to res.csv as comma-separated floats.
label_columns = (test_dbscan, test_kmeans)
result = np.column_stack(label_columns)
np.savetxt("res.csv", result, fmt="%f", delimiter=",")

In [30]:
# Header line followed by the full prediction table (same column order as res.csv).
print("DBscan Kmeans", result, sep="\n")

DBscan Kmeans
[[4. 5.]
 [4. 4.]
 [4. 1.]
 [4. 4.]
 [4. 4.]
 [4. 4.]
 [4. 1.]
 [6. 1.]
 [4. 4.]
 [4. 1.]
 [4. 6.]
 [5. 1.]
 [4. 4.]
 [4. 6.]
 [4. 1.]
 [6. 4.]
 [4. 6.]
 [4. 1.]
 [4. 1.]
 [4. 4.]
 [5. 5.]
 [6. 6.]
 [4. 6.]
 [2. 4.]
 [4. 1.]
 [4. 1.]
 [4. 5.]
 [5. 5.]
 [5. 1.]
 [6. 4.]
 [4. 4.]
 [4. 4.]
 [4. 4.]
 [4. 6.]
 [1. 5.]
 [2. 4.]
 [6. 4.]
 [4. 4.]
 [6. 1.]
 [4. 6.]
 [6. 1.]
 [4. 4.]
 [4. 6.]
 [5. 5.]
 [4. 6.]
 [4. 4.]
 [4. 1.]
 [4. 1.]
 [4. 6.]
 [4. 1.]
 [4. 1.]]
