# 5170 Final Project
Used the 400 sensor data files to estimate the prediction

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn # import scikit-learn
from sklearn import preprocessing # import preprocessing utilites
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
# this part only need to run once when the project is initiated
# to obtain all the sensor name that appeared in 400 data files
# =========================================

# sensornames = {"test_sample"}

# for i in range(1, 400):
#   pfname = str(i).zfill(3)
#   fpath = f'/content/drive/MyDrive/Colab Notebooks/5170/SmartHome/data/{pfname}.txt'

#   file_size = os.path.getsize(fpath)
#   if file_size > 11:
#     rdata = pd.read_csv(fpath, sep=' ', header=None, names=["time", "sensor", "reads", "col04"])
#     new_list = list(rdata['sensor'].value_counts().to_dict().keys())
#     # print(new_list)
#     sensornames.update(new_list)

# print(len(sensornames))
# print(sensornames)

# switch_sensor = []
# for name in sensornames:
#   if len(name) == 4 or len(name) == 5:
#     switch_sensor.append(name)

# print(len(switch_sensor))

# switch_sensor.sort()
# print(switch_sensor)

# num_sensor = []
# for name in sensornames:
#   if len(name) == 5:
#     if name[0]=="M" or name[0]=="m":
#       switch_sensor.append(name)
#     if name[0]=="L" or name[0]=="l":
#       num_sensor.append(name)

# print(len(num_sensor))
# print(num_sensor)
# =========================================

In [3]:
# status sensor only record ON/OFF status

status_sensors = [
    'D001', 'D002', 'D003', 'D004', 'D005', 'D006', 'D007', 'D008', 'D009', 'D010', 'D011', 'D012', 'D013', 'D014', 'D015', 'D016', 'D017', 'D018', 
    'E002', 'E003', 'F001', 'F002', 
    'I001', 'I002', 'I006', 'I010', 'I011', 'I012', 
    'L001', 'L002', 'L003', 'L004', 'L005', 'L006', 'L007', 'L008', 'L009', 'L010', 'L011', 
    'M001', 'M002', 'M003', 'M004', 'M005', 'M006', 'M007', 'M008', 'M009', 'M010', 'M011', 'M012', 'M013', 'M014', 'M015', 'M016', 'M017', 'M018', 'M019', 'M020', 'M021', 'M022', 'M023', 'M024', 'M025', 'M026', 'M027', 'M028', 'M029', 'M030', 'M031', 'M032', 'M033', 'M034', 'M035', 'M036', 'M037', 'M038', 'M039', 'M040', 'M041', 'M042', 'M043', 'M044', 'M045', 'M046', 'M047', 'M048', 'M049', 'M050', 'M051', 'M052', 
    'MA201', 'MA202', 'MA203', 'MA204', 'MA205', 'MA207', 
    'SS001', 'SS002', 'SS003', 'SS004', 'SS005', 'SS006', 'SS007', 'SS008', 'SS009', 'SS010', 'SS011', 'SS012', 'SS015', 'SS016', 'SS017', 'SS018', 'SS019', 'SS020', 'SS021', 
]

In [4]:
# digit sensor record the sensor value of that moment

digit_sensors = [
    'LL002', 'LL003', 'LL004', 'LL005', 'LL006', 'LL007', 'LL008', 'LL009', 'LL010', 'LL011', 
    'LS001', 'LS002', 'LS003', 'LS004', 'LS005', 'LS006', 'LS007', 'LS008', 'LS009', 'LS010', 'LS011', 'LS012', 'LS013', 'LS014', 'LS015', 'LS016', 'LS017', 'LS018', 'LS019', 'LS020', 'LS021', 'LS022', 'LS023', 'LS024', 'LS025', 'LS026', 'LS027', 'LS028', 'LS029', 'LS030', 'LS031', 'LS032', 'LS033', 'LS034', 'LS035', 'LS036', 'LS037', 'LS038', 'LS039', 'LS042', 'LS043', 'LS044', 'LS045', 'LS046', 'LS047', 'LS048', 'LS049', 'LS050', 'LS051', 'LS201', 'LS202', 'LS203', 'LS204', 'LS205', 'LS206', 'LS207', 
    'P001',  
    'T001', 'T002', 'T003', 'T004', 'T005', 'T101', 'T102', 'T103', 'T104', 'T105', 'T106', 'T107', 'T108', 'T109', 'T110', 'T111'
]

In [5]:
all_column = status_sensors + digit_sensors
(all_column).sort()

In [6]:
# build the dataframe to record the data from 400 files

sensor_data = pd.DataFrame(columns = ["id"] + all_column)

In [7]:
sensor_data.columns

Index(['id', 'D001', 'D002', 'D003', 'D004', 'D005', 'D006', 'D007', 'D008',
       'D009',
       ...
       'T102', 'T103', 'T104', 'T105', 'T106', 'T107', 'T108', 'T109', 'T110',
       'T111'],
      dtype='object', length=200)

In [8]:
# retrieved data from 400 files

for i in range(1, 401):
  pfname = str(i).zfill(3)
  # print("File name: {}".format(pfname))

  fpath = f'/content/drive/MyDrive/Colab Notebooks/5170/SmartHome/data/{pfname}.txt'
  file_size = os.path.getsize(fpath)
  
  if file_size > 11:    # if the file contained valid data
    # get the row sonsor data from txt file
    rdata = pd.read_csv(fpath, sep=' ', header=None, names=["time", "sensor", "reads", "col04"])
    # get how many sensor appeared in the file
    sensor_dict = rdata['sensor'].value_counts().to_dict()
    # print(sensor_dict)
    # sensornames.update(new_list)
    for k,v in sensor_dict.items():
      if k in digit_sensors:
        # count the sum of the data value
        total_value = rdata.loc[rdata["sensor"] == k, "reads"].astype(float).sum()
        # calculate average value of sensor
        sensor_dict[k] = round(total_value/v, 4)

    new_row = {}
    new_row["id"] = i
    for key in all_column:
    # if the sensor appeared in the file, get the number of it appears or the avg
    # otherwise set it to zero
      if key not in sensor_dict.keys():
        new_row[key] = 0.0
      else:
        new_row[key] = float(sensor_dict[key])
    
    # add the data to the dataframe
    sensor_data = sensor_data.append(new_row, ignore_index=True)

In [9]:
print(sensor_data.shape)

(361, 200)


In [10]:
sensor_data.head()

Unnamed: 0,id,D001,D002,D003,D004,D005,D006,D007,D008,D009,...,T102,T103,T104,T105,T106,T107,T108,T109,T110,T111
0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,12.0,0.0,0.0,8.0,4.0,2.0,26.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6.0,6.0,0.0,0.0,0.0,8.0,8.0,58.0,0.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7.0,4.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.0,2.0,0.0,0.0,0.0,4.0,0.0,26.0,0.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df_diag=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/5170/SmartHome/data/00_diagnosis.txt', sep=' ', names=['id','diagnosis'])

In [12]:
# Merge two df into one
df = sensor_data.merge(df_diag, on = 'id', how = 'left')

In [13]:
df = df.drop(['id'], axis=1)

In [14]:
df.shape

(361, 200)

In [15]:
df.head()

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,T103,T104,T105,T106,T107,T108,T109,T110,T111,diagnosis
0,2.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,9.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
1,12.0,0.0,0.0,8.0,4.0,2.0,26.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2,6.0,0.0,0.0,0.0,8.0,8.0,58.0,0.0,16.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,4.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
4,2.0,0.0,0.0,0.0,4.0,0.0,26.0,0.0,18.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [16]:
y = df['diagnosis']
X = df.drop(['diagnosis'], axis = 1)

In [17]:
X.head()

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,T102,T103,T104,T105,T106,T107,T108,T109,T110,T111
0,2.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,9.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.0,0.0,0.0,8.0,4.0,2.0,26.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6.0,0.0,0.0,0.0,8.0,8.0,58.0,0.0,16.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,0.0,0.0,0.0,4.0,0.0,26.0,0.0,18.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
s_model = scaler.fit(X)
X_scaled = s_model.transform(X)

In [19]:
# With the use of mapping function, we replace label in the form of string to an integer. 
# Mapping result 1,2,6 -> 1 (means might have some medical issue)
# Mapping result 3,4,5,7,8,9,10 -> 0 (means should be without medical issue)

output_map = {3: 0, 4: 0, 5: 0, 7: 0, 8: 0, 9: 0, 1: 1, 2: 1, 6: 1, 10: 0}
y_relabel = y.map(output_map)

In [20]:
# display the result after mapping
y_relabel.value_counts()

0    268
1     93
Name: diagnosis, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_relabel, test_size=0.2)

In [22]:
# from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
y_resampled.value_counts()

0    214
1    214
Name: diagnosis, dtype: int64

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# helper method to print basic model metrics
def metrics(y_true, y_pred):
    # print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    print('\nReport:\n', classification_report(y_true, y_pred))

In [24]:
# train the model
clf = GradientBoostingClassifier(n_estimators = 200, max_depth = 200)
clf.fit(X_resampled, y_resampled)

GradientBoostingClassifier(max_depth=200, n_estimators=200)

In [25]:
#resample the validation dataset

X_test_scaled = s_model.transform(X_test)
X_test_resampled, y_test_resampled = SMOTE().fit_resample(X_test_scaled, y_test)



In [26]:
# performance with resample validation set

y_re_pred = clf.predict(X_test)
metrics(y_test, y_re_pred)


Report:
               precision    recall  f1-score   support

           0       0.75      0.70      0.72        54
           1       0.27      0.32      0.29        19

    accuracy                           0.60        73
   macro avg       0.51      0.51      0.51        73
weighted avg       0.62      0.60      0.61        73

