In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import warnings

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

In [22]:
clinical = pd.read_csv('clinicalvol2.csv')
clinical

Unnamed: 0,fried,part_id,activity_regular,gait_speed_4m,gait_get_up,cognitive_total_score,mmse_total_score,depression_total_score,health_rate,leisure_out,age,pain_perception
0,0,1001,2.0,7.00,18.00,25.0,30,1.0,3.0,7.0,75,4.2
1,1,1002,2.0,11.00,60.00,27.0,28,8.0,3.0,7.0,73,3.3
2,1,1003,1.0,11.00,21.00,26.0,27,1.0,4.0,7.0,72,3.4
3,2,1004,1.0,14.30,24.70,23.0,24,4.0,3.0,3.0,88,7.3
4,1,1005,3.0,8.00,42.00,24.0,27,3.0,3.0,7.0,83,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
535,1,3593,3.0,2.30,6.20,27.0,27,2.0,4.0,0.0,75,1.0
536,0,3594,3.0,2.70,8.40,27.0,29,2.0,4.0,5.0,71,2.0
537,0,3600,3.0,2.28,6.06,21.0,27,3.0,4.0,7.0,77,3.9
538,1,3601,2.0,2.85,8.22,26.0,28,0.0,4.0,14.0,84,4.9


In [23]:
X = clinical.values[:, 1:]
y = clinical.values[:, 0].reshape(-1,1)
X = np.array(X)
y = np.array(y)

In [24]:
# split train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [25]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((486, 11), (54, 11), (486, 1), (54, 1))

In [26]:
dtc = DecisionTreeClassifier()
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()

In [27]:
# cross validate dtc
scores = cross_val_score(dtc, X_train, y_train, cv=10, scoring='accuracy')
print(f'Accuracy for each fold: {scores}')
print(f'Accuracy score: {scores.mean()}')

Accuracy for each fold: [0.51020408 0.65306122 0.48979592 0.57142857 0.6122449  0.55102041
 0.64583333 0.54166667 0.4375     0.5       ]
Accuracy score: 0.5512755102040816


In [28]:
# cross validate k-NN

scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
print(f'Accuracy for each fold: {scores}')
print(f'Accuracy score: {scores.mean()}')

Accuracy for each fold: [0.53061224 0.59183673 0.63265306 0.59183673 0.40816327 0.55102041
 0.54166667 0.60416667 0.47916667 0.54166667]
Accuracy score: 0.5472789115646259


In [29]:
# cross validate rfc

scores = cross_val_score(rfc, X_train, y_train, cv=10, scoring='accuracy')
print(f'Accuracy for each fold: {scores}')
print(f'Accuracy score: {scores.mean()}')

Accuracy for each fold: [0.65306122 0.6122449  0.55102041 0.67346939 0.59183673 0.51020408
 0.625      0.5625     0.54166667 0.5625    ]
Accuracy score: 0.5883503401360545


In [30]:
dtc.fit(X_train, y_train)
pred_test = dtc.predict(X_test)
pred_train = dtc.predict(X_train)
print(f'accuracy (test_set): {accuracy_score(y_test, pred_test)}')
print(f'accuracy (train_test): {accuracy_score(y_train, pred_train)}')

accuracy (test_set): 0.5925925925925926
accuracy (train_test): 1.0


In [31]:
knn.fit(X_train, y_train)
pred_test = knn.predict(X_test)
pred_train = knn.predict(X_train)
print(f'accuracy (test_set): {accuracy_score(y_test, pred_test)}')
print(f'accuracy (train_test): {accuracy_score(y_train, pred_train)}')

accuracy (test_set): 0.6111111111111112
accuracy (train_test): 0.6790123456790124


In [32]:
rfc.fit(X_train, y_train)
pred_test = rfc.predict(X_test)
pred_train = rfc.predict(X_train)
print(f'accuracy (test_set): {accuracy_score(y_test, pred_test)}')
print(f'accuracy (train_test): {accuracy_score(y_train, pred_train)}')

accuracy (test_set): 0.7037037037037037
accuracy (train_test): 1.0


In [33]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [34]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [35]:
#rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(X_train, y_train)

In [36]:
#rf_random.best_params_

In [37]:
rfc = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1,
                             max_features='sqrt', max_depth=50, bootstrap=True)
rfc.fit(X_train, y_train)
pred_test_rfr = rfc.predict(X_test)
pred_train_rfr = rfc.predict(X_train)
print(f'accuracy (test_set): {accuracy_score(y_test, pred_test_rfr)}')
print(f'accuracy (train_test): {accuracy_score(y_train, pred_train_rfr)}')

accuracy (test_set): 0.7222222222222222
accuracy (train_test): 1.0


In [38]:
y_pred = knn.predict(X_test)
#Checking performance our model with classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.74      0.67        23
         1.0       0.53      0.41      0.46        22
         2.0       0.78      0.78      0.78         9

    accuracy                           0.61        54
   macro avg       0.64      0.64      0.64        54
weighted avg       0.60      0.61      0.60        54



In [39]:
#List Hyperparameters that we want to tune.
#leaf_size = list(range(1,50))
#n_neighbors = list(range(1,30))
#p=[1,2]
##Convert to dictionary
#hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
##Create new KNN object
#knn_2 = KNeighborsClassifier()
##Use GridSearch
#clf = GridSearchCV(knn_2, hyperparameters, cv=10)
##Fit the model
#best_model = clf.fit(X_train,y_train)
##Print The value of best Hyperparameters
#print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
#print('Best p:', best_model.best_estimator_.get_params()['p'])
#print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

In [40]:
knn = KNeighborsClassifier(leaf_size=1, p=2, n_neighbors=26)
knn.fit(X_train, y_train)
pred_test = knn.predict(X_test)
pred_train = knn.predict(X_train)
print(f'accuracy (test_set): {accuracy_score(y_test, pred_test)}')
print(f'accuracy (train_test): {accuracy_score(y_train, pred_train)}')

accuracy (test_set): 0.7037037037037037
accuracy (train_test): 0.6358024691358025
