In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from datetime import date, time, timedelta
from scipy.fftpack import rfft
import warnings
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score, train_test_split
import pickle
warnings.filterwarnings('ignore')

In [2]:
# Reading both the insulin set of data
insulin_data_1_df = pd.read_csv("InsulinData.csv", low_memory=False)
insulin_data_2_df = pd.read_csv("Insulin_patient2.csv" , low_memory=False)

# Reading both the CGM set of data

In [3]:
cgm_data_1_df = pd.read_csv("CGMData.csv", low_memory=False)
cgm_data_2_df = pd.read_csv("CGM_patient2.csv", low_memory=False)

In [4]:
def usingOnlyRequiredColumns(insulin_data_1_df,insulin_data_2_df):
    
    insulin_data_1_df = insulin_data_1_df[["Date", "Time", "BWZ Carb Input (grams)"]]
    insulin_data_2_df = insulin_data_2_df[["Date", "Time", "BWZ Carb Input (grams)"]]
    
    # Creating a column called Time stamp
    
    insulin_data_1_df["Timestamp"] = pd.to_datetime(insulin_data_1_df["Date"] + ' ' + insulin_data_1_df["Time"])
    insulin_data_2_df["Timestamp"] = pd.to_datetime(insulin_data_2_df["Date"] + ' ' + insulin_data_2_df["Time"])
    
    return insulin_data_1_df, insulin_data_2_df

In [5]:
insulin_data_1_df, insulin_data_2_df = usingOnlyRequiredColumns(insulin_data_1_df, insulin_data_2_df)

In [6]:
def usingOnlyRequiredColumns_cgm(cgm_data_1_df,cgm_data_2_df):
    
    cgm_data_1_df   = cgm_data_1_df[["Date", "Time", "Sensor Glucose (mg/dL)"]]
    cgm_data_2_df  = cgm_data_2_df[["Date", "Time", "Sensor Glucose (mg/dL)"]]
    
    # Creating a column called Time stamp
    
    cgm_data_1_df["Timestamp"] = pd.to_datetime(cgm_data_1_df["Date"] + ' ' + cgm_data_1_df["Time"])
    cgm_data_2_df["Timestamp"] = pd.to_datetime(cgm_data_2_df["Date"] + ' ' + cgm_data_2_df["Time"])
    
    return cgm_data_1_df, cgm_data_2_df

In [7]:
cgm_data_1_df, cgm_data_2_df = usingOnlyRequiredColumns_cgm(cgm_data_1_df, cgm_data_2_df)

# Checking our new dataframes:

In [8]:
def cleaningUpInsulinData(insulin_data_1_df, insulin_data_2_df):
    
#     Working on insulin data set - I
#     Setting up the index to Timestamp
#     Sorting values based on Timestamp
    

    insulin_data_1_df_copy = insulin_data_1_df.copy()  
    insulin_data_1_df_copy = insulin_data_1_df_copy.set_index('Timestamp')
    insulin_data_1_df_copy = insulin_data_1_df_copy.sort_values(by="Timestamp" , ascending=True)
    insulin_data_1_df_copy = insulin_data_1_df_copy.dropna().reset_index()
    insulin_data_1_df_copy['BWZ Carb Input (grams)'].replace(0.0, np.nan, inplace=True)
    insulin_data_1_df_copy = insulin_data_1_df_copy.dropna()
    insulin_data_1_df_copy = insulin_data_1_df_copy.reset_index().drop(columns="index")

    
    
    insulin_data_2_df_copy = insulin_data_2_df.copy()
    insulin_data_2_df_copy = insulin_data_2_df_copy.set_index('Timestamp')
    insulin_data_2_df_copy = insulin_data_2_df_copy.sort_values(by="Timestamp", ascending=True)
    insulin_data_2_df_copy = insulin_data_2_df_copy.dropna().reset_index()
    insulin_data_2_df_copy['BWZ Carb Input (grams)'].replace(0.0, np.nan, inplace=True)
    insulin_data_2_df_copy = insulin_data_2_df_copy.dropna()
    insulin_data_2_df_copy = insulin_data_2_df_copy.reset_index().drop(columns="index")
    
    return insulin_data_1_df_copy, insulin_data_2_df_copy


In [9]:
insulin_data_1_df_new, insulin_data_2_df_new = cleaningUpInsulinData(insulin_data_1_df, insulin_data_2_df)

In [10]:
def creatingList(insulin_data_1_df_new):
    valid_timestamp_list = []
    two_hour_thirty_mins = 150
    for index, ts in enumerate(insulin_data_1_df_new['Timestamp']):
        try:
            valid_tm = (insulin_data_1_df_new['Timestamp'][index+1] -ts).seconds / 60.0
            if valid_tm >=150:
                valid_timestamp_list.append(ts)
                
        except KeyError:
            break
    
    return valid_timestamp_list      


In [11]:
# Fetching the valid timestamp list from the insulin data:

valid_timestamp_list_1 = creatingList(insulin_data_1_df_new)
valid_timestamp_list_2 = creatingList(insulin_data_2_df_new)

In [12]:
def creatingMealData(valid_timestamp_list, cgm_data_1_df):
    
    cgmDataList = []
    
    for index, ts in enumerate(valid_timestamp_list):
        start_offset_thirty_minutes = pd.to_datetime(ts - pd.Timedelta(value=30, unit="T"))
        end_offset_oneTwenty_minutes = pd.to_datetime(ts - pd.Timedelta(value=2, unit="H"))
        fetch_date = ts.date().strftime('%-m/%-d/%Y')
        cgmDataList.append(cgm_data_1_df.loc[cgm_data_1_df['Date']==fetch_date].set_index('Timestamp').between_time
                           (start_time=start_offset_thirty_minutes.strftime('%-H:%-M:%-S'),
                            end_time=end_offset_oneTwenty_minutes.strftime('%-H:%-M:%-S'))['Sensor Glucose (mg/dL)'].values.tolist())
    return pd.DataFrame(cgmDataList)

In [13]:
def creatingMealData_2(valid_timestamp_list_2, cgm_data_2_df):
     
    cgmDataList_2 = []
    
    for index, ts in enumerate(valid_timestamp_list_2):
        start_offset_thirty_minutes = pd.to_datetime(ts - pd.Timedelta(value=30, unit="T"))
        end_offset_oneTwenty_minutes = pd.to_datetime(ts - pd.Timedelta(value=2, unit="H"))
        fetch_date = ts.date().strftime('%Y-%m-%d')
        cgmDataList_2.append(cgm_data_2_df.loc[cgm_data_2_df['Date']==fetch_date].set_index('Timestamp').between_time
                           (start_time=start_offset_thirty_minutes.strftime('%H:%M:%S'),
                            end_time=end_offset_oneTwenty_minutes.strftime('%H:%M:%S'))['Sensor Glucose (mg/dL)'].values.tolist())
    return pd.DataFrame(cgmDataList_2)

In [14]:
meal_df_1 = creatingMealData(valid_timestamp_list_1, cgm_data_1_df)
meal_df_2 = creatingMealData_2(valid_timestamp_list_2, cgm_data_2_df)

In [15]:
meal_df_1 = meal_df_1.iloc[:,0:30]

In [16]:
meal_df_2 = meal_df_2.iloc[:,0:30]


In [17]:
def noMealData(insulin_data_1_df_new,cgm_data_1_df):
    valid_timestamp_list_no_meal = []
    for index, ts in enumerate(insulin_data_1_df_new["Timestamp"]):
        try:
            value = (insulin_data_1_df_new["Timestamp"][index+1] - ts).seconds/3600
            if value >=4 :
                valid_timestamp_list_no_meal.append(ts)
        except KeyError:
            break

    no_meal_data_list_1 = []

    for index, ts in enumerate(valid_timestamp_list_no_meal):
        start_offset_two_hours = pd.to_datetime(ts + pd.Timedelta(value=2, unit="H"))
        end_offset_two_hours = pd.to_datetime(start_offset_two_hours + pd.Timedelta(value=2, unit="H"))
        get_date = ts.date().strftime('%-m/%-d/%Y')
        no_meal_data_list_1.append(cgm_data_1_df.loc[cgm_data_1_df['Date']==get_date].set_index('Timestamp').between_time
                      (start_time=start_offset_two_hours.strftime('%-H:%-M:%-S'),
                       end_time=end_offset_two_hours.strftime('%-H:%-M:%-S'))['Sensor Glucose (mg/dL)'].values.tolist())
    return pd.DataFrame(no_meal_data_list_1)
   

In [18]:
no_meal_df_1 = noMealData(insulin_data_1_df_new, cgm_data_1_df)
no_meal_df_1 = no_meal_df_1.iloc[:,0:24]


In [19]:
def noMealData_2(insulin_data_2_df_new,cgm_data_2_df):
    valid_timestamp_list_no_meal = []
    for index, ts in enumerate(insulin_data_2_df_new["Timestamp"]):
        try:
            value = (insulin_data_2_df_new["Timestamp"][index+1] - ts).seconds/3600
            if value >=4 :
                valid_timestamp_list_no_meal.append(ts)
        except KeyError:
            break

    no_meal_data_list_2 = []

    for index, ts in enumerate(valid_timestamp_list_no_meal):
        start_offset_two_hours = pd.to_datetime(ts + pd.Timedelta(value=2, unit="H"))
        end_offset_two_hours = pd.to_datetime(start_offset_two_hours + pd.Timedelta(value=2, unit="H"))
        get_date = ts.date().strftime('%Y-%m-%d')
        no_meal_data_list_2.append(cgm_data_2_df.loc[cgm_data_2_df['Date']==get_date].set_index('Timestamp').between_time
                      (start_time=start_offset_two_hours.strftime('%-H:%-M:%-S'),
                       end_time=end_offset_two_hours.strftime('%-H:%-M:%-S'))['Sensor Glucose (mg/dL)'].values.tolist())
    return pd.DataFrame(no_meal_data_list_2)

In [20]:
no_meal_df_2 = noMealData_2(insulin_data_2_df_new, cgm_data_2_df)
no_meal_df_2 = no_meal_df_2.iloc[:,0:24]

#### Feature Extraction workflow

In [21]:
def cleaning_meal_data(meal_df):
    
    indexes_to_clean = meal_df.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x: x > 6).dropna().index
    
    meal_cleaned_df = meal_df.drop(meal_df.index[indexes_to_clean]).reset_index().drop(columns="index")
    
    meal_cleaned_df = meal_cleaned_df.interpolate(method='linear', axis=1)
    
    return meal_cleaned_df


In [22]:
meal_cleaned_df_1 = cleaning_meal_data(meal_df_1)
meal_cleaned_df_2 = cleaning_meal_data(meal_df_2)

In [23]:
def createMealFeatureDataMatrix(meal_cleaned_df):
    power_second_max = []
    power_third_max = []
    List1=[]
    differential_data = []
    standard_deviation = []
    for i in range(len(meal_cleaned_df)):
        array = abs(rfft(meal_cleaned_df.iloc[:,0:30].iloc[i].values.tolist())).tolist()
        sorted_array = abs(rfft(meal_cleaned_df.iloc[:,0:30].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
    
        power_second_max.append(sorted_array[-3])
        power_third_max.append(sorted_array[-4])
    
    TimeMeal = meal_cleaned_df.iloc[:,22:25].idxmin(axis=1)
    MaximumGlucoseLevels = meal_cleaned_df.iloc[:,5:19].idxmax(axis=1)


    for i in range(len(meal_cleaned_df)):
        List1.append(np.diff(meal_cleaned_df.iloc[:,MaximumGlucoseLevels[i]:TimeMeal[i]].iloc[i].tolist()).max())
        differential_data.append(np.diff(meal_cleaned_df.iloc[:,MaximumGlucoseLevels[i]:TimeMeal[i]].iloc[i].tolist()).max())
        standard_deviation.append(np.std(meal_cleaned_df.iloc[i]))
    
    meal_data_feature_matrix = pd.DataFrame()
    meal_data_feature_matrix['Power II Max'] = power_second_max
    meal_data_feature_matrix['Power III Max'] = power_third_max
    meal_data_feature_matrix['II Differential'] = differential_data
    meal_data_feature_matrix['Standard Deviation'] = standard_deviation

    return meal_data_feature_matrix




In [24]:
meal_feature_matrix_1 = createMealFeatureDataMatrix(meal_cleaned_df_1)
meal_feature_matrix_2 = createMealFeatureDataMatrix(meal_cleaned_df_2)
meal_feature_matrix=pd.concat([meal_feature_matrix_1,meal_feature_matrix_2]).reset_index().drop(columns='index')

In [25]:
def cleaning_no_meal_data(no_meal_df):
    
    indexes_to_clean = no_meal_df.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x: x > 6).dropna().index
    
    no_meal_cleaned_df = no_meal_df.drop(no_meal_df.index[indexes_to_clean]).reset_index().drop(columns="index")
    
    no_meal_cleaned_df = no_meal_cleaned_df.interpolate(method='linear', axis=1)
    
    return no_meal_cleaned_df

In [26]:
no_meal_cleaned_df_1 = cleaning_meal_data(no_meal_df_1)
no_meal_cleaned_df_2 = cleaning_meal_data(no_meal_df_2)

In [27]:
def createNoMealFeatureDataMatrix(no_meal_cleaned_df):
    power_second_max = []
    power_third_max = []
    differential_data = []
    standard_deviation = []
    for i in range(len(no_meal_cleaned_df)):
        array = abs(rfft(no_meal_cleaned_df.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array = abs(rfft(no_meal_cleaned_df.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
    
        power_second_max.append(sorted_array[-3])
        power_third_max.append(sorted_array[-4])

    for i in range(len(no_meal_cleaned_df)):
        differential_data.append(np.diff(np.diff(no_meal_cleaned_df.iloc[:,0:24].iloc[i].tolist())).max())
        standard_deviation.append(np.std(no_meal_cleaned_df.iloc[i]))
    
    no_meal_data_feature_matrix = pd.DataFrame()
    no_meal_data_feature_matrix['Power II Max'] = power_second_max
    no_meal_data_feature_matrix['Power III Max'] = power_third_max
    no_meal_data_feature_matrix['II Differential'] = differential_data
    no_meal_data_feature_matrix['Standard Deviation'] = standard_deviation

    return no_meal_data_feature_matrix

In [28]:
no_meal_feature_matrix_1 = createNoMealFeatureDataMatrix(no_meal_cleaned_df_1)
no_meal_feature_matrix_2 = createNoMealFeatureDataMatrix(no_meal_cleaned_df_2)
no_meal_feature_matrix = pd.concat([no_meal_feature_matrix_1, no_meal_feature_matrix_2]).reset_index().drop(columns="index")

### Both Data Feature extraction Matrix for meal and no meal data are ready

1. Meal Data Matrix name is : meal_feature_matrix
2. No Meal Data Matrix name is : no_meal_feature_matrix

In [29]:
meal_feature_matrix['ClassLabel'] = 1
no_meal_feature_matrix['ClassLabel'] = 0

total_dataset = pd.concat([meal_feature_matrix, no_meal_feature_matrix]).reset_index().drop(columns="index")

In [30]:
total_dataset = total_dataset.dropna().reset_index().drop(columns="index")

In [31]:
np.random.seed(42)
dataset=shuffle(total_dataset,random_state=1).reset_index().drop(columns='index')
X = dataset.drop("ClassLabel", axis = 1)
y = dataset["ClassLabel"]

# Splitting the test and training model using the 80-20 % proportion 80% training, 20% testing or validation
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train);


In [32]:
np.random.seed(42)

clf_single_score = clf.score(X_test, y_test)

## Cross validation k-folds = 5 score : Accuracy
clf_cross_val_score_accuracy = np.mean(cross_val_score(clf, 
                                                       X, 
                                                       y, cv=5,
                                                       scoring="accuracy"))

## Cross validation k-folds = 5, score : Precision

clf_cross_val_score_precision = np.mean(cross_val_score(clf,
                                                       X,
                                                       y,
                                                       cv=5,
                                                       scoring="precision"))
## Cross validation k-folds = 5, score : Recall

clf_cross_val_score_recall = np.mean(cross_val_score(clf,
                                                    X,
                                                    y,
                                                    cv=5,
                                                    scoring="recall"))

## Cross validation k-folds = 5, score : F1 score

clf_cross_val_score_f1 = np.mean(cross_val_score(clf,
                                                X,
                                                y,
                                                cv=5,
                                                scoring="f1"))


print(f"Accuracy score with cross validation five : {(clf_cross_val_score_accuracy) * 100:.2f}%")
print(f"Precision score with cross validation five : {(clf_cross_val_score_precision) * 100:.2f}%")
print(f"Recall score with cross validation five : {(clf_cross_val_score_recall) * 100:.2f}%")
print(f"F-1 score with cross validation five : {(clf_cross_val_score_f1) * 100:.2f}%")
print(f"Single validation score without cross validation :{(clf_single_score) * 100:.2f}%")

Accuracy score with cross validation five : 91.55%
Precision score with cross validation five : 88.80%
Recall score with cross validation five : 96.79%
F-1 score with cross validation five : 92.53%
Single validation score without cross validation :93.31%


### Saving and loading trained machine learning models

1. Python `pickle()` module

In [33]:
# Save an existing model to file
pickle.dump(clf, open('clf_random_forest_classifier.pkl', "wb"))