In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


#Read Data

In [2]:
import pandas as pd

base_path = '/content/drive/MyDrive/Edu/CM1001 Applied Machine Learning and Data Mining/Diabetes-Data/data-'

data_frame = pd.DataFrame()
for i in range(0, 70):
    file_name = base_path + (i + 1).__str__().zfill(2)
    data_frame = data_frame.append(pd.read_csv(file_name, names=['Date', 'Time', 'Code', 'Value'], delim_whitespace=True), ignore_index=True)

print(data_frame)

             Date   Time Code Value
0      04-21-1991   9:09   58   100
1      04-21-1991   9:09   33     9
2      04-21-1991   9:09   34    13
3      04-21-1991  17:08   62   119
4      04-21-1991  17:08   33     7
...           ...    ...  ...   ...
29325  05-09-1989  08:00   33   1.0
29326  05-09-1989  08:00   34   7.0
29327  05-10-1989  08:00   34   7.0
29328  05-11-1989  08:00   34   7.0
29329  05-12-1989  08:00   34   7.0

[29330 rows x 4 columns]


#Clean data

In [3]:
rows_before = len(data_frame)

# Clean dates
# We remove the whole column, because the model
# should predict [Code, Time] to Value
try:
  data_frame.drop(['Date'], inplace=True, axis=1 )
except:
  pass

# Clean codes
# We keep only code related to concrete glucose measurements
defined_codes = (48,57,58,59,60,61,62,63,64)
data_frame = data_frame[data_frame.Code.isin(defined_codes)]

# Clean time
# We remove every time that could not be parsed (hours or minutes)
# We also remove it if hours or minutes are out of range
for i in range(len(data_frame)):
  try:
    hour = int(data_frame.iloc[i]['Time'].split(':')[0])
    minute = int(data_frame.iloc[i]['Time'].split(':')[1])
    if hour > 23 or hour < 0:
      hour = 0
      minute = -1
    elif minute > 59 or minute < 0:
      hour = 0
      minute = -1
  except:
    hour = 0
    minute = -1
  finally:
    data_frame.iloc[i]['Time'] = hour * 60 + minute
data_frame = data_frame[data_frame['Time'].apply(lambda x: x != -1)]

# Clean values
# We revome every value that could not be parsed to a number
# We also remove it if the value is zero, as this would
# indicate an invalid glucose measurement
for i in range(len(data_frame)):
  try:
    value = int(data_frame.iloc[i]['Value'])
    if value == 0:
        value = -1
  except:
    value = -1
  finally:
    data_frame.iloc[i]['Value'] = value
data_frame = data_frame[data_frame['Value'].apply(lambda x: x != -1)]

rows_after = len(data_frame)

print(data_frame)
print('\nRemoved %d rows' % (rows_before - rows_after))

       Time Code Value
0       549   58   100
3      1028   62   119
5      1371   48   123
6       455   58   216
10     1016   62   211
...     ...  ...   ...
29314   720   60   151
29317  1320   48   265
29318   480   58   248
29323  1320   48   145
29324   480   58   259

[12447 rows x 3 columns]

Removed 16883 rows


#Predict

##Import

In [None]:
import matplotlib
import numpy

from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, precision_score
from sklearn import svm

##Pandas to axis

In [None]:
# X-axis: [time, code]
# Y-axis: value
tp_codes = numpy.column_stack((data_frame['Time'].tolist(),  data_frame['Code'].tolist()))
values = data_frame['Value'].tolist()

##Train and test split

In [None]:
poly = PolynomialFeatures(degree=4)
poly_features = poly.fit_transform(tp_codes)
tp_codes_train, tp_codes_test, values_train, values_test = train_test_split(poly_features, values)

##Regressions

###Linear

In [None]:
regr = linear_model.LinearRegression()
regr.fit(tp_codes_train, values_train)
value_pred = regr.predict(tp_codes_test)

print('\nLinear regression: ')
print("Mean squared error: %.2f" % mean_squared_error(values_test, value_pred))
print("Coefficient of determination: %.2f" % r2_score(values_test, value_pred))

###SVR

In [None]:
sv = svm.SVR(kernel='poly')
sv.fit(tp_codes_train, values_train)
value_pred_svr = sv.predict(tp_codes_test)

print('\nScalar vector regression: ')
print("Mean squared error: %.2f" % mean_squared_error(values_test, value_pred_svr))
print("Coefficient of determination: %.2f" % r2_score(values_test, value_pred_svr))

###SVC

In [7]:
svc = svm.SVC(kernel='poly')
svc.fit(tp_codes_train, values_train)
value_pred_svc = svc.predict(tp_codes_test)

print('\nScalar vector classification: ')
print("Mean squared error: %.2f" % mean_squared_error(values_test, value_pred_svc))
print("Coefficient of determination: %.2f" % r2_score(values_test, value_pred_svr))


Scalar vector classification: 
Mean squared error: 10848.63
Coefficient of determination: -0.01


###Logistical

In [6]:
log = linear_model.LogisticRegression(max_iter=100)
log.fit(tp_codes_train, values_train)
value_pred_log = log.predict(tp_codes_test)

print('\nLogistical regression: ')
print("Mean squared error: %.2f" % mean_squared_error(values_test, value_pred_log))
print("Coefficient of determination: %.2f" % r2_score(values_test, value_pred_log))


Linear regression: 
Mean squared error: 6663.96
Coefficient of determination: 0.02

Scalar vector regression: 
Mean squared error: 6888.05
Coefficient of determination: -0.01

Scalar vector classification: 
Mean squared error: 10848.63
Coefficient of determination: -0.01

Logistical regression: 
Mean squared error: 15095.84
Coefficient of determination: -1.22





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Process
#### Read
The process started with reading all the data and putting in a unified data set using Pandas. Columns where Date, Time, Code and Value. 

#### Clean
While reading the table there were a lot of unusable data. First of all, the contained quite a lot of garbage values, that without context was inadaqute for use in a model. For example, hours and minutes where sometimes represented as numbers largers than 24 and 60 respectively. 
The solution for clearing out such values was to try to parse the data, and if it isn't possible, throw away that row in the table. 

Furthermore, many codes where also not relevant for what we are trying to model. So a set of 'valid codes' where created and every row in the table that did not belong to that set was removed.

This process reduced the number of rows in the table with rougly 17000. 

#### Prediction
When predicting using the cleaned data set, we use the built in `test_train_split`. We then fed the output train and test set into fit and predict methods respectively. 




### Real world
Ethical implications of using a model a described would be mostly related due to its inaccuracy. Deploying such a model could come with numerous problems with its predictions, since it would lead to unreliable conclusions for the patient.


### Conclusion
It has been shown above that we get relatively inaccurate results through the small coefficients of determination. Different types of models where used (through a Model Selection process) to find suitable one with higher accuracy, but without any success in find a model with higher coeffcient that 0.02 (each models accuracy is printed when running the code).