In [None]:
import pandas as pd
import numpy as np
from imblearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2
from scipy.stats import randint
import sklearn.metrics as skmet
from sklearn.model_selection import cross_validate

import os
import sys

# KNN 分类

KNN Classification

In [None]:
from sklearn.model_selection import RandomizedSearchCV,\
  RepeatedStratifiedKFold
  
sys.path.append(os.getcwd() + "/helperfunctions")
import healthinfo as hi

# Notebook-wide pandas display settings: narrow output, capped column and
# row counts, and three-decimal floats with thousands separators.
for option_name, option_value in (
  ('display.width', 78),
  ('display.max_columns', 10),
  ('display.max_rows', 200),
):
  pd.set_option(option_name, option_value)
pd.options.display.float_format = '{:,.3f}'.format

# Pull the health information data out of the healthinfo helper module.
# NOTE(review): presumably these are an already-made train/test split and
# the post-transformation feature names -- confirm in helperfunctions.
X_train, X_test = hi.X_train, hi.X_test
y_train, y_test = hi.y_train, hi.y_test
new_cols = hi.new_cols

new_cols

# Baseline: a KNN classifier with 5 neighbors, evaluated with repeated
# stratified 10-fold cross-validation (10 repeats, seeded splitter).
knn_example = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)

# The column transformer and SMOTENC sampler come from the healthinfo
# helper module; the classifier is the final pipeline step.
pipe0 = make_pipeline(hi.coltrans, hi.smotenc, knn_example)

scores = cross_validate(
  pipe0,
  X_train,
  y_train.values.ravel(),
  scoring=['accuracy', 'precision', 'recall', 'f1'],
  cv=kf,
  n_jobs=-1,
)

# Average each metric over all folds; recall is reported as sensitivity.
mean_scores = tuple(
  np.mean(scores[score_key])
  for score_key in
  ('test_accuracy', 'test_recall', 'test_precision', 'test_f1')
)
print("accuracy: %.2f, sensitivity: %.2f, precision: %.2f, f1: %.2f"  %
  mean_scores)

# do some hyperparameter tuning
# Pipeline: helper-module preprocessing and SMOTENC sampler, then
# chi-squared feature selection, then KNN.
knn = KNeighborsClassifier(n_jobs=-1)

pipe1 = make_pipeline(hi.coltrans, hi.smotenc,
   SelectKBest(score_func=chi2), knn)

# Search space. scipy's randint(low, high) draws from the half-open range
# [low, high), so k covers 1..len(new_cols)-1 and n_neighbors covers 5..299.
# NOTE(review): with KNeighborsClassifier's default p, 'minkowski' looks
# equivalent to 'euclidean' -- confirm whether both entries are wanted.
knn_params = {
 'selectkbest__k':
    randint(1, len(new_cols)),
 'kneighborsclassifier__n_neighbors':
    randint(5, 300),
 'kneighborsclassifier__metric':
    ['euclidean','manhattan','minkowski']
}

# Seed the search so the sampled candidates (and therefore best_params_ and
# best_score_) are the same on every Restart & Run All, matching the seeded
# CV splitter used for the baseline model.
rs = RandomizedSearchCV(pipe1, knn_params, cv=5, scoring="roc_auc",
  random_state=0)
rs.fit(X_train, y_train.values.ravel())

# Boolean mask of the features kept by SelectKBest in the best pipeline.
selected = rs.best_estimator_['selectkbest'].\
  get_support()
selected.sum()
new_cols[selected]
rs.best_params_
rs.best_score_


# Score the tuned search on the held-out test data.
pred = rs.predict(X_test)
y_test_flat = y_test.values.ravel()

# Specificity is computed as recall with class 0 as the positive label.
test_metrics = (
  skmet.accuracy_score(y_test_flat, pred),
  skmet.recall_score(y_test_flat, pred),
  skmet.recall_score(y_test_flat, pred, pos_label=0),
  skmet.precision_score(y_test_flat, pred),
)
print("accuracy: %.2f, sensitivity: %.2f, specificity: %.2f, precision: %.2f"  %
  test_metrics)


# Confusion matrix of actual versus predicted classes.
cm = skmet.confusion_matrix(y_test, pred)
cmplot = skmet.ConfusionMatrixDisplay(
  confusion_matrix=cm,
  display_labels=['Negative', 'Positive'],
)
cmplot.plot()
cmplot.ax_.set(
  title='Heart Disease Prediction Confusion Matrix',
  xlabel='Predicted Value',
  ylabel='Actual Value',
)

# KNN 多类别

KNN Multiclass

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTENC

sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans

# Display settings for this section; floats are shown without decimals.
for option_name, option_value in (
  ('display.width', 78),
  ('display.max_columns', 10),
  ('display.max_rows', 200),
):
  pd.set_option(option_name, option_value)
pd.options.display.float_format = '{:,.0f}'.format

# Read the machine failure data and inspect its structure, its first rows,
# and the frequency of each failure type and machine type (missing values
# included in the counts).
machinefailuretype = pd.read_csv("data/machinefailuretype.csv")
machinefailuretype.info()
machinefailuretype.head()
machinefailuretype['failtype'].value_counts(dropna=False).sort_index()
machinefailuretype['machinetype'].value_counts(dropna=False).sort_index()

def setcode(typetext):
  """Return the integer class code for a failure-type label.

  "No Failure" maps to 1, "Heat Dissipation Failure" to 2,
  "Power Failure" to 3 and "Overstrain Failure" to 4. Any other value
  is assigned code 5.
  """
  known_codes = (
    ("No Failure", 1),
    ("Heat Dissipation Failure", 2),
    ("Power Failure", 3),
    ("Overstrain Failure", 4),
  )
  for label, code in known_codes:
    if typetext == label:
      return code
  # Everything not listed above shares the catch-all code.
  return 5

# Derive the integer class label from the failure-type text. Mapping the
# single column directly gives the same codes as a row-wise
# DataFrame.apply(axis=1) without constructing a Series for every row, and
# still yields a Series when the frame is empty.
machinefailuretype["failtypecode"] = \
  machinefailuretype["failtype"].map(setcode)

# Show which failure-type labels landed in each code, with row counts.
machinefailuretype.groupby(['failtypecode','failtype']).size().\
  reset_index()
  
# Numeric predictors and the single categorical predictor.
num_cols = [
  'airtemp',
  'processtemperature',
  'rotationalspeed',
  'torque',
  'toolwear',
]
cat_cols = ['machinetype']

# Range and median of each numeric feature, one row per feature.
machinefailuretype[num_cols].agg(['min','median','max']).T

# Hold out 20% of the rows for testing; the split is seeded. The target is
# kept as a one-column DataFrame.
feature_frame = machinefailuretype[num_cols + cat_cols]
target_frame = machinefailuretype[['failtypecode']]
X_train, X_test, y_train, y_test = train_test_split(
  feature_frame, target_frame, test_size=0.2, random_state=0)


# setup column transformations
# Categorical columns are one-hot encoded with the first level dropped.
ohe = OneHotEncoder(drop='first')

cattrans = make_pipeline(ohe)
# Numeric columns: project-local outlier transformer, median imputation,
# then min-max scaling.
# NOTE(review): the meaning of OutlierTrans's argument (3) is defined in
# helperfunctions/preprocfunc -- confirm there.
standtrans = make_pipeline(OutlierTrans(3),SimpleImputer(strategy="median"),
  MinMaxScaler())
coltrans = ColumnTransformer(
  transformers=[
    ("cat", cattrans, cat_cols),
    ("stand", standtrans, num_cols),
  ]
)

# Fit on a subsample only to recover the encoded column names; the search
# below refits the transformer itself. The draw is seeded so the notebook
# is reproducible, and capped at the number of available rows so a small
# training set does not make sample() raise.
coltrans.fit(X_train.sample(n=min(1000, len(X_train)), random_state=0))

new_cat_cols = \
  coltrans.\
  named_transformers_['cat'].\
  named_steps['onehotencoder'].\
  get_feature_names_out(cat_cols)

# Same order as the transformers above: encoded categorical columns first,
# then the numeric columns.
new_cols = np.concatenate((new_cat_cols, np.array(num_cols)))

print(new_cols)

# construct a pipeline with preprocessing, oversampling, feature selection,
# and a KNN model
catcolscnt = new_cat_cols.shape[0]
# The "cat" transformer is listed first in coltrans, so the one-hot columns
# occupy positions 0..catcolscnt-1 of the transformed matrix; those are the
# positions SMOTENC is told to treat as categorical.
smotenc = SMOTENC(categorical_features=np.arange(0,catcolscnt), random_state=0)

knn = KNeighborsClassifier(n_jobs=-1)

pipe1 = make_pipeline(coltrans, smotenc, SelectKBest(score_func=chi2), knn)

# np.arange excludes its stop value: k covers 1..len(new_cols)-1 and
# n_neighbors covers the odd values 5..173.
knn_params = {
 'selectkbest__k': np.arange(1, len(new_cols)),
 'kneighborsclassifier__n_neighbors': np.arange(5, 175, 2),
 'kneighborsclassifier__metric': ['euclidean','manhattan','minkowski']
}

# Seed the search so the sampled candidates, and therefore the reported best
# parameters and score, are reproducible -- consistent with the seeded
# train/test split and SMOTENC sampler.
rs = RandomizedSearchCV(pipe1, knn_params, cv=5,
  scoring="roc_auc_ovr_weighted", random_state=0)

rs.fit(X_train, y_train.values.ravel())

rs.best_params_
rs.best_score_


# Boolean mask of the features kept by SelectKBest in the best pipeline.
selected = rs.best_estimator_['selectkbest'].get_support()
selected.sum()
new_cols[selected]
rs.best_params_
rs.best_score_

pred = rs.predict(X_test)

# Pin the class order explicitly so every row/column of the matrix and every
# line of the report is paired with the right name even if some class is
# absent from y_test or pred. Codes 1-5 are the values setcode assigns.
class_codes = [1, 2, 3, 4, 5]
class_names = ['None', 'Heat','Power','Overstrain','Other']

cm = skmet.confusion_matrix(y_test, pred, labels=class_codes)
cmplot = skmet.ConfusionMatrixDisplay(confusion_matrix=cm,
   display_labels=class_names)
cmplot.plot()
cmplot.ax_.set(title='Machine Failure Type Confusion Matrix', 
  xlabel='Predicted Value', ylabel='Actual Value')

print(skmet.classification_report(y_test, pred,
  labels=class_codes, target_names=class_names))

# KNN 多类信件

KNN Multiclass Letters