# 5170 Final Project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn # import scikit-learn
from sklearn import preprocessing # import preprocessing utilites
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv(r"C:\[Lab]\ml\smarthome\data\00_activityscores.txt", 
                 sep=' ', header=None,
                 names=["id", "act01", "act02", "act03", "act04", "act05", "act06", "act07", "act08", "sum", "acc", "seq", "space"])

In [3]:
data.head(5)

Unnamed: 0,id,act01,act02,act03,act04,act05,act06,act07,act08,sum,acc,seq,space
0,1,2,1,2,1,1,1,1,1,10,?,?,
1,2,?,?,?,?,?,?,?,?,?,?,?,
2,3,?,?,?,?,?,?,?,?,?,?,?,
3,5,1,1,1,1,1,1,2,2,10,?,?,
4,6,1,1,1,1,1,1,2,4,12,11,4,


In [4]:
df = data.drop(["acc", "seq","space"], axis = 1)

In [5]:
print(df['sum'].value_counts()['?'])
print(df.shape)

42
(388, 10)


In [6]:
# df.drop(df[df['Sum']== '?'].index, inplace = True)
df = df[df['sum'] != '?']
print(df.shape)

(346, 10)


In [7]:
print(df['act08'].value_counts()['?'])
print(df.shape)

8
(346, 10)


In [8]:
# remove all the ? from original dataset
for col in ["act01", "act02", "act03", "act04", "act05", "act06", "act07", "act08"]:
  df = df[df[col] != '?']

In [9]:
print(df.shape)

(335, 10)


In [10]:
# replace ? with NaN
df.replace('?', None)

Unnamed: 0,id,act01,act02,act03,act04,act05,act06,act07,act08,sum
0,1,2,1,2,1,1,1,1,1,10
3,5,1,1,1,1,1,1,2,2,10
4,6,1,1,1,1,1,1,2,4,12
5,7,2,1,1,1,1,1,2,2,11
6,8,1,4,1,1,1,1,2,2,13
...,...,...,...,...,...,...,...,...,...,...
383,394,1,1,4,2,1,1,2,1,13
384,395,1,1,1,1,1,1,1,1,8
385,397,1,1,1,2,1,1,1,1,9
386,399,1,1,1,1,1,2,2,1,10


In [11]:
# check NaN in DF
df.isna().sum()

id       0
act01    0
act02    0
act03    0
act04    0
act05    0
act06    0
act07    0
act08    0
sum      0
dtype: int64

In [12]:
# read the diagnosis result
df_diag=pd.read_csv(r"C:\[Lab]\ml\smarthome\data\00_diagnosis.txt", sep=' ', names=['id','diagnosis'])

In [13]:
df_diag.head()

Unnamed: 0,id,diagnosis
0,1,3
1,2,3
2,3,2
3,4,8
4,5,5


In [14]:
print(df_diag.shape)
df_diag.id.nunique()

(400, 2)


400

In [15]:
# Merge two df into one
df = df.merge(df_diag, on = 'id', how = 'left')

In [16]:
# display the original diagnosis value distribution
df['diagnosis'].value_counts()

4     83
2     56
8     55
5     40
3     33
7     28
1     16
6     15
9      8
10     1
Name: diagnosis, dtype: int64

In [17]:
# remove diag 10 from dataframe
df = df[df['diagnosis'] != 10]

In [18]:
# display diagnosis value distribution
df['diagnosis'].value_counts()

4    83
2    56
8    55
5    40
3    33
7    28
1    16
6    15
9     8
Name: diagnosis, dtype: int64

In [19]:
# Diagnosis Result List
# *1 = dementia
# *2 = MCI (Mild Cognitive Impairment)
# 3 = middle age 45-59
# 4 = young-old 60-74
# 5 = old-old 75+
# *6 = other medical
# 7 = watch/at risk - follow longitudinally
# 8 = younger adult
# 9 = younger adult, English second language
# 10 = diagnosis not available

In [20]:
y=df['diagnosis']
X=df.drop(['id','diagnosis'], axis = 1)

In [21]:
# With the use of mapping function, we replace label in the form of string to an integer. 
# Mapping result 1,2,6 -> 1 (means might have some medical issue)
# Mapping result 3,4,5,7,8,9 -> 0 (means should be without medical issue)

output_map = {3: 0, 4: 0, 5: 0, 7: 0, 8: 0, 9: 0, 1: 1, 2: 1, 6: 1}
y_relabel = y.map(output_map)

In [22]:
# display the result after mapping
y_relabel.value_counts()

0    247
1     87
Name: diagnosis, dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y_relabel, test_size=0.2)

In [24]:
y_train.value_counts()

0    196
1     71
Name: diagnosis, dtype: int64

In [25]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

y_resampled.value_counts()

1    196
0    196
Name: diagnosis, dtype: int64

In [26]:
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)

In [27]:
# from sklearn.externals import joblib
import joblib
# import sklearn.external.joblib as extjoblib

scaler_file = "scaler.save"
joblib.dump(scaler, scaler_file) 

# And to load...
scaler = joblib.load(scaler_file) 

In [28]:
X_test_scaled = scaler.transform(X_test)

In [29]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [30]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    # print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    print('\nReport:\n', classification_report(y_true, y_pred))

In [31]:
clf = GradientBoostingClassifier(n_estimators = 100, max_depth = 100)
clf.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_depth=100)

In [None]:
modelname = "xgb_clf.save"
joblib.dump(clf, modelname)

In [32]:
y_test.value_counts()

0    51
1    16
Name: diagnosis, dtype: int64

In [33]:
#resample the validation dataset
X_test_resampled, y_test_resampled = SMOTE().fit_resample(X_test_scaled, y_test)

In [34]:
y_test_resampled.value_counts()

1    51
0    51
Name: diagnosis, dtype: int64

In [35]:
# performance WITH resample validation set
y_re_pred = clf.predict(X_test_resampled)
metrics(y_test_resampled, y_re_pred)


Report:
               precision    recall  f1-score   support

           0       0.61      0.80      0.69        51
           1       0.71      0.49      0.58        51

    accuracy                           0.65       102
   macro avg       0.66      0.65      0.64       102
weighted avg       0.66      0.65      0.64       102

