In [1]:
import pandas as pd
import numpy as np

## Data loading

In [2]:
bank_data=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',names=['A'+str(i) for i in range(1,17)])
bank_data.describe(include=['object', 'float', 'int'])

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
count,690,690,690.0,690,690,690,690,690.0,690,690,690.0,690,690,690.0,690.0,690
unique,3,350,,4,4,15,10,,2,2,,2,3,171.0,,2
top,b,?,,u,g,c,v,,t,f,,f,g,0.0,,-
freq,468,12,,519,519,137,399,,361,395,,374,625,132.0,,383
mean,,,4.758725,,,,,2.223406,,,2.4,,,,1017.385507,
std,,,4.978163,,,,,3.346513,,,4.86294,,,,5210.102598,
min,,,0.0,,,,,0.0,,,0.0,,,,0.0,
25%,,,1.0,,,,,0.165,,,0.0,,,,0.0,
50%,,,2.75,,,,,1.0,,,0.0,,,,5.0,
75%,,,7.2075,,,,,2.625,,,3.0,,,,395.5,


In [3]:
diabetes_data = pd.read_csv('https://raw.githubusercontent.com/praisan/hello-world/master/diabetes.csv')
diabetes_data.describe(include=['object', 'float', 'int'])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Missing value

In [4]:
from sklearn.base import TransformerMixin
class ModeMedianImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with median of column.

        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].mode() if X[c].dtype == np.dtype('O') else X[c].median() for c in X],index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [15]:
class BankDataCastType(TransformerMixin):
  def transform(self, X, y=None):
        X=X.replace('?',np.nan)
        X.A2=X.A2.astype(float)
        X.A3=X.A3.astype(float)
        X.A8=X.A8.astype(float)
        X.A11=X.A11.astype(float)
        X.A14=X.A14.astype(float)
        X.A15=X.A15.astype(float)
        return X

In [6]:
diabetes_data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']]=diabetes_data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0,np.nan)

In [8]:
diabetes_data.describe(include=['object', 'float', 'int'])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Standardization

In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [10]:
normalize_zscore=StandardScaler(with_mean=True, with_std=True)
normalize_minmax=MinMaxScaler(feature_range=(0, 1))

## Train-Test

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Train test split
x = diabetes_data.drop(['Outcome'], axis=1)
y=np.ravel(diabetes_data['Outcome'])
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=50)

In [16]:
# Train test split
bank_data['A16']=bank_data['A16'].map({'-':1,'+':0})
x = bank_data.drop(['A16'], axis=1)
y=np.ravel(bank_data['A16'])
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=50)

## Decision Tree Classifier

In [None]:
# ตัวอย่าง classifier ที่มีข้นตอนของการทำ standardization แล้วใช้ logistic regression 
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
missing_model=ModeMedianImputer()
classifier_dt=DecisionTreeClassifier(criterion='gini')

models = Pipeline([
  ('missing', missing_model),
  ('zscore', normalize_zscore),
  ('dt_model', classifier_dt)
])

# ฝึกโมเดลใน Pipeline ด้วยข้อมูลชุดฝึกสอน
_=models.fit(x_train, y_train)

#### Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# นำ Pipeline มาทำนายข้อมูล

y_predict_new_data = models.predict(x_test)

# วัดประสิทธิภาพ

print(classification_report(y_test, y_predict_new_data))

cm_scaled = np.array(confusion_matrix(y_test, y_predict_new_data, labels=[0,1]))
confusion = pd.DataFrame(cm_scaled, index=['Negative', 'Positive'], columns=['Predicted Negative', 'Predicted Positive'])
print('Confusion matrix')
confusion

              precision    recall  f1-score   support

           0       0.77      0.82      0.79       101
           1       0.61      0.53      0.57        53

    accuracy                           0.72       154
   macro avg       0.69      0.68      0.68       154
weighted avg       0.71      0.72      0.72       154

Confusion matrix


Unnamed: 0,Predicted Negative,Predicted Positive
Negative,83,18
Positive,25,28


In [None]:
def show_performance(models,x_test):
  # นำ Pipeline มาทำนายข้อมูล
  y_predict_new_data = models.predict(x_test)
  # วัดประสิทธิภาพ
  print(classification_report(y_test, y_predict_new_data))
  cm_scaled = np.array(confusion_matrix(y_test, y_predict_new_data, labels=[0,1]))
  confusion = pd.DataFrame(cm_scaled, index=['Negative', 'Positive'], columns=['Predicted Negative', 'Predicted Positive'])
  print('Confusion matrix')
  print(confusion)

#### Other model

In [None]:
classifier_dt_entropy=DecisionTreeClassifier()

In [None]:
models = Pipeline([
  ('missing', missing_model),
  ('zscore', normalize_zscore),
  ('dt_model', classifier_dt_entropy)
])

# ฝึกโมเดลใน Pipeline ด้วยข้อมูลชุดฝึกสอน
_=models.fit(x_train, y_train)

In [None]:
show_performance(models,x_test)

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       101
           1       0.64      0.47      0.54        53

    accuracy                           0.73       154
   macro avg       0.70      0.67      0.67       154
weighted avg       0.72      0.73      0.72       154

Confusion matrix
          Predicted Negative  Predicted Positive
Negative                  87                  14
Positive                  28                  25


#### Parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = [
  {'criterion': ['entropy','gini']}
 ]

In [None]:
 prediction_model = GridSearchCV(
        DecisionTreeClassifier(class_weight={0:1,1:2}), 
        param_grid, 
        scoring='f1_micro',
        cv=5)

In [None]:
models = Pipeline([
  ('missing', missing_model),
  ('zscore', normalize_zscore),
  ('dt_model', prediction_model)
])
_=models.fit(x_train, y_train)
show_performance(models,x_test)

              precision    recall  f1-score   support

           0       0.77      0.74      0.75       101
           1       0.54      0.57      0.55        53

    accuracy                           0.68       154
   macro avg       0.65      0.65      0.65       154
weighted avg       0.69      0.68      0.68       154

Confusion matrix
          Predicted Negative  Predicted Positive
Negative                  75                  26
Positive                  23                  30


In [None]:
print('Best DT model')
print('n_neighbors: '+str(models.steps[2][1].best_estimator_.criterion))

Best DT model
n_neighbors: entropy


---
# ใช้ Model อื่นที่สามารถนำมาใช้ในการแยกประเภทได้


* [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html#classification)

 * ตัวอย่างการใช้
```
from sklearn.svm import SVC
model = SVC(kernel='rbf')
```
 * ตัวอย่างการใช้กับ Pipeline
```
from sklearn.svm import SVC
models = Pipeline([
  ('zscore', StandardScaler()),
  ('c_model', SVC(kernel='rbf'))
```
* [Neural Network](https://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification)
 * ตัวอย่างการใช้
```
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
```
 * ตัวอย่างการใช้กับ Pipeline
```
from sklearn.neural_network import MLPClassifier
models = Pipeline([
  ('zscore', StandardScaler()),
  ('c_model', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1))
```



