# Machine Learning

In [1]:
import pandas as pd
import numpy as np

import catboost as cb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from joblib import dump, load

df: DataFrame that contains only the feature columns and the dependent variable ***(fea_df in 'pandas cheatsheet')***

y_col: column name of dependent variable (e.g. 'Car_Type')
 
sample_size: number of rows to sample for model training (e.g. 100)

model_file_name: name of the joblib file the model is saved to and loaded from (e.g. catboost_OP_20201027.joblib)

In [2]:
from sklearn.datasets import load_iris

# Build one frame holding the iris measurements side by side with the
# numeric target, which is stored under the column name 'Species'.
iris = load_iris()
df = pd.DataFrame(
    np.c_[iris['data'], iris['target']],
    columns=[*iris['feature_names'], 'Species'],
)

In [3]:
# Display the frame built from the iris data (feature columns plus 'Species').
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [4]:
# Store the class codes as a categorical column rather than as floats.
df['Species'] = df['Species'].astype('category')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called bare; passing it through print() would add a trailing "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   sepal length (cm)  150 non-null    float64 
 1   sepal width (cm)   150 non-null    float64 
 2   petal length (cm)  150 non-null    float64 
 3   petal width (cm)   150 non-null    float64 
 4   Species            150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB
None


## Model Training

In [5]:
def Training_Model(df, sample_size, y_col, model_file_name, random_state=None):
    """Load a cached grid-searched CatBoost model, or train and cache one.

    If ``model_file_name`` can be loaded with joblib, the stored search object
    is returned as-is and ``df``, ``sample_size``, ``y_col`` and
    ``random_state`` are NOT used. Delete the file (or pass a new name) to
    force retraining on different data or settings.

    Otherwise ``sample_size`` rows are sampled from ``df``, a
    ``GridSearchCV`` over a ``CatBoostClassifier`` is fitted on the sample,
    and the fitted search object is dumped to ``model_file_name``.

    Args:
        df: DataFrame holding the feature columns and the target column.
            Columns with ``category`` dtype (other than the target) are
            passed to CatBoost as categorical features.
        sample_size: Number of rows to draw from ``df`` for training.
        y_col: Name of the target column in ``df``.
        model_file_name: Path of the joblib file used as the model cache.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            training sample is reproducible. ``None`` (default) keeps the
            previous unseeded behaviour.

    Returns:
        tuple: ``(grid, cv)`` where ``grid`` is the fitted (or reloaded)
        search object and ``cv`` is ``grid.cv_results_`` as a DataFrame.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when the sample
            cannot be drawn (e.g. ``sample_size`` larger than ``len(df)``).
    """
    # check if model exist
    try:
        # Load once and keep the result instead of probing and re-loading.
        # NOTE(security): joblib.load unpickles the file; only point this at
        # model files from a trusted source.
        grid = load(model_file_name)
    except FileNotFoundError:
        print("The model is not exist, now bulid & save model")
        grid = _fit_grid_search(df, sample_size, y_col, random_state)

        # Save model in current folder
        dump(grid, model_file_name)
    else:
        print("Reload existing model")

    cv = pd.DataFrame(grid.cv_results_)

    print('-----------------------------------------')

    print('selected model paramatar')
    print(grid.best_params_)

    return grid, cv


def _fit_grid_search(df, sample_size, y_col, random_state=None):
    """Sample ``df`` and fit a grid search over CatBoost hyper-parameters."""
    print('random select number of rows:', sample_size)
    # sample() already returns a new frame, so no extra copy is needed.
    df_train = df.sample(n=sample_size, random_state=random_state)
    print('sample of training dataset shape:', df_train.shape)

    print('-----------------------------------------')
    print('value count of suspicious in sample df:')
    print(df_train[y_col].value_counts())

    print('-----------------------------------------')

    print('set x, y')
    train_y = df_train[y_col]
    train_x = df_train.drop(columns=y_col)

    print('get index of category column')
    # CatBoost is given the positional indices of the categorical features.
    cat_col_index = [
        train_x.columns.get_loc(col)
        for col in train_x.select_dtypes(include=['category']).columns
    ]
    print('index of category column:', cat_col_index)

    print('start training model')
    parameters = {'depth': [6, 8, 10],
                  'learning_rate': [0.01, 0.05, 0.1],
                  'iterations': [30, 50, 100]}

    grid = GridSearchCV(estimator=cb.CatBoostClassifier(),
                        param_grid=parameters,
                        cv=2,
                        scoring='balanced_accuracy',
                        n_jobs=-1)
    # state index of category column
    grid.fit(train_x, train_y, cat_features=cat_col_index)
    return grid


In [6]:
# Train on a 100-row sample of df, or reuse 'catboost_iris.joblib' when that
# file already exists (in which case the other arguments are not used).
model, cv_df = Training_Model(
    df=df,
    sample_size=100,
    y_col='Species',
    model_file_name='catboost_iris.joblib',
)

Reload existing model
-----------------------------------------
selected model paramatar
{'depth': 6, 'iterations': 30, 'learning_rate': 0.05}


## Prediction

In [7]:
# Predict a class for every row of df. Only feature columns are passed to the
# model: 'Species' is the target, and 'pre_class' is this cell's own output,
# which must not be fed back in as a feature when the cell is re-run.
df['pre_class'] = model.predict(
    df.drop(columns=['Species', 'pre_class'], errors='ignore')
)

In [8]:
# Display df again, now including the 'pre_class' prediction column.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,pre_class
0,5.1,3.5,1.4,0.2,0.0,0.0
1,4.9,3.0,1.4,0.2,0.0,0.0
2,4.7,3.2,1.3,0.2,0.0,0.0
3,4.6,3.1,1.5,0.2,0.0,0.0
4,5.0,3.6,1.4,0.2,0.0,0.0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0,2.0
146,6.3,2.5,5.0,1.9,2.0,2.0
147,6.5,3.0,5.2,2.0,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0,2.0


In [9]:
# Per-class probabilities for every row. Both the target ('Species') and the
# prediction column added earlier ('pre_class') are excluded so the model is
# given feature columns only.
probs = model.predict_proba(
    df.drop(columns=['Species', 'pre_class'], errors='ignore')
)
# Reuse df's index so an index-based join with df lines the rows up even if
# df does not carry a default 0..n-1 index.
probs_df = pd.DataFrame(probs, index=df.index)

probs_df

Unnamed: 0,0,1,2
0,0.781736,0.110712,0.107552
1,0.759116,0.123506,0.117379
2,0.766767,0.119040,0.114194
3,0.768586,0.118117,0.113297
4,0.778298,0.112147,0.109555
...,...,...,...
145,0.118328,0.139360,0.742311
146,0.147481,0.189949,0.662570
147,0.113873,0.132506,0.753621
148,0.167173,0.178456,0.654371


In [10]:
# Append the probability columns to df, matching rows on the index and
# keeping every row of df.
new_df = df.join(probs_df, how='left')
new_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,pre_class,0,1,2
0,5.1,3.5,1.4,0.2,0.0,0.0,0.781736,0.110712,0.107552
1,4.9,3.0,1.4,0.2,0.0,0.0,0.759116,0.123506,0.117379
2,4.7,3.2,1.3,0.2,0.0,0.0,0.766767,0.119040,0.114194
3,4.6,3.1,1.5,0.2,0.0,0.0,0.768586,0.118117,0.113297
4,5.0,3.6,1.4,0.2,0.0,0.0,0.778298,0.112147,0.109555
...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0,2.0,0.118328,0.139360,0.742311
146,6.3,2.5,5.0,1.9,2.0,2.0,0.147481,0.189949,0.662570
147,6.5,3.0,5.2,2.0,2.0,2.0,0.113873,0.132506,0.753621
148,6.2,3.4,5.4,2.3,2.0,2.0,0.167173,0.178456,0.654371
